Example 1
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    config = data["config"]
    if in_file:
        bedtools = config_utils.get_program("bedtools", config,
                                            default="bedtools")
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_uptodate(out_file, in_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
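For reference, the core of what bedtools merge does here can be sketched in pure
Python. This is an illustrative stand-in, not the command bcbio runs; the distance
parameter mirrors the optional -d flag:

def merge_intervals(intervals, distance=0):
    """Merge (chrom, start, end) tuples, joining intervals within `distance`."""
    merged = []
    for chrom, start, end in sorted(intervals):
        if merged and merged[-1][0] == chrom and start <= merged[-1][2] + distance:
            # Overlapping or book-ended: extend the previous interval
            merged[-1] = (chrom, merged[-1][1], max(merged[-1][2], end))
        else:
            merged.append((chrom, start, end))
    return merged

# The docstring's example: 1:1-100 and 1:90-100 collapse into one region
assert merge_intervals([("1", 1, 100), ("1", 90, 100)]) == [("1", 1, 100)]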
Example 2
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; need to explore --batchSize to reduce memory
    usage if needed.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                    params += ["--variant", vrn_file]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
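_incomplete_genomicsdb is not shown above; a minimal sketch of the same
detect-and-restart idea, assuming a hypothetical sentinel file written only on
success, might look like:

import os
import shutil

def _incomplete_output(out_dir, sentinel="__done__"):
    """Treat an output directory without its success sentinel as half-finished."""
    return os.path.exists(out_dir) and not os.path.exists(os.path.join(out_dir, sentinel))

def ensure_fresh(out_dir):
    # Remove half-finished results so the import can restart cleanly
    if _incomplete_output(out_dir):
        shutil.rmtree(out_dir)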
Example 3
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            # Uses default filters; can alternatively tweak min-alt-count-tumor and
            # min-phred-fisher to trade precision for sensitivity
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
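The cl.format(**locals()) idiom used throughout these examples fills a shell
template from local variables; str.format also supports attribute access such as
{paired.tumor_bam}. A minimal illustration with placeholder values:

ref_file = "genome.fa"
opts = "--two-pass"  # placeholder options string, not a recommendation
cmd = "scalpel-discovery --somatic {opts} --ref {ref_file}"
print(cmd.format(**locals()))
# scalpel-discovery --somatic --two-pass --ref genome.fa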
Example 4
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        with file_transaction(data, nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                            line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file
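To make the lookup concrete: column 7 of the BEDPE is the event name and the
last column its filter status, while VCF IDs are matched on their prefix before
"_" so both breakend mates inherit the filter. A small worked example, assuming
that layout:

bedpe_line = "1\t100\t200\t1\t500\t600\tsv1\t30\t+\t-\tLowQual\n"
parts = bedpe_line.split("\t")
filters = {parts[6]: parts[-1].strip()}  # {"sv1": "LowQual"}

vcf_id = "sv1_2"  # mate records share the sv1 prefix
assert filters.get(vcf_id.split("_")[0], "PASS") == "LowQual"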
Example 5
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        if not rm_file.endswith(".vcf.gz") or not os.path.exists(rm_file + ".tbi"):
            rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
        if len(vcfutils.get_samples(vrn_file)) > 1:
            base, ext = utils.splitext_plus(vrn_file)
            sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
            vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data), sample_file, data["config"])
        if not vrn_file.endswith(".vcf.gz") or not os.path.exists(vrn_file + ".tbi"):
            vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)

        interval_bed = _get_merged_intervals(rm_interval_file, base_dir, data)
        ref_dir, ref_filebase = os.path.split(dd.get_ref_file(data))
        rtg_ref = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg",
                                                "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        assert os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                         "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        cmd = ["rtg", "vcfeval", "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    return {"tp": os.path.join(out_dir, "tp.vcf.gz"),
            "fp": os.path.join(out_dir, "fp.vcf.gz"),
            "fn": os.path.join(out_dir, "fn.vcf.gz")}
Example 6
def clean_file(in_file, data, prefix="", bedprep_dir=None, simple=None):
    """Prepare a clean sorted input BED file without headers
    """
    # Remove non-ASCII characters and spaces. Used in coverage analysis to support
    #   JSON in one column while keeping sambamba happy:
    simple = "iconv -c -f utf-8 -t ascii | sed 's/ //g' |" if simple else ""
    if in_file:
        if not bedprep_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "bedprep"))
        # Avoid running multiple times with same prefix
        if prefix and os.path.basename(in_file).startswith(prefix):
            return in_file
        out_file = os.path.join(bedprep_dir, "%s%s" % (prefix, os.path.basename(in_file)))
        out_file = out_file.replace(".interval_list", ".bed")
        if out_file.endswith(".gz"):
            out_file = out_file[:-3]
        if not utils.file_uptodate(out_file, in_file):
            check_bed_contigs(in_file, data)
            check_bed_coords(in_file, data)
            with file_transaction(data, out_file) as tx_out_file:
                bcbio_py = sys.executable
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                sort_cmd = get_sort_cmd(os.path.dirname(tx_out_file))
                cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^@ | "
                       "grep -v ^# | {simple} "
                       "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                       "{sort_cmd} -k1,1 -k2,2n > {tx_out_file}")
                do.run(cmd.format(**locals()), "Prepare cleaned BED file", data)
        vcfutils.bgzip_and_index(out_file, data.get("config", {}), remove_orig=False)
        return out_file
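A pure-Python equivalent of the header-stripping and sorting pipeline (omitting
the remove_bad coordinate check), useful for seeing what the shell command
produces:

def clean_bed_lines(lines):
    """Drop track/browser/@/# header lines, then sort by chromosome and start."""
    kept = [line for line in lines
            if line.strip() and not line.startswith(("track", "browser", "@", "#"))]
    # Matches `sort -k1,1 -k2,2n`: chromosome lexically, start numerically
    return sorted(kept, key=lambda line: (line.split("\t")[0], int(line.split("\t")[1])))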
Example 7
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".vcf"
    bgzipped_file = out_file + ".gz"
    num_cores = dd.get_num_cores(data)
    if file_exists(bgzipped_file):
        data = dd.set_vrn_file(data, bgzipped_file)
        return data
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases"]
        broad_runner.run_gatk(params)
    bgzip_and_index(out_file, dd.get_config(data))
    data = dd.set_vrn_file(data, bgzipped_file)
    return data
Example 8
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if gene_list:
            if vcf_file.endswith(".vcf.gz"):
                utils.symlink_plus(vcf_file, priority_vcf)
            else:
                assert vcf_file.endswith(".vcf")
                utils.symlink_plus(vcf_file, priority_vcf.replace(".vcf.gz", ".vcf"))
                vcfutils.bgzip_and_index(priority_vcf.replace(".vcf.gz", ".vcf"),
                                         data["config"], remove_orig=False)
        # otherwise prioritize based on BED and proceed
        else:
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"])
                    jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust":
                                                                                 {"direction": "increase",
                                                                                  "maximum": "30000M",
                                                                                  "magnitude": dd.get_cores(data)}}})
                    jvm_opts = " ".join(jvm_opts)
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if post_prior_fn:
        simple_vcf = post_prior_fn(simple_vcf, work_dir, data)
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export(env_cmd="vawk")
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
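The vawk one-liner is dense; restated in Python, the record-keeping condition it
encodes is simply (hypothetical helper for readability, not part of bcbio):

def keep_for_prioritization(filter_field, sample_gt):
    """Keep PASS/unfiltered records where the sample has a called variant."""
    return filter_field in ("PASS", ".") and sample_gt != "0/0"

assert keep_for_prioritization("PASS", "0/1")
assert not keep_for_prioritization("LowQual", "0/1")
assert not keep_for_prioritization(".", "0/0")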
Example 9
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if not utils.file_exists(out_file):
        priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
        if not utils.file_exists(priority_vcf):
            with file_transaction(data, priority_vcf) as tx_out_file:
                cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
                do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)
        simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
        if not utils.file_exists(simple_vcf):
            with file_transaction(data, simple_vcf) as tx_out_file:
                transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
                if transcript_file:
                    transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                    ann_opt = "--gene_bed %s" % transcript_file
                else:
                    ann_opt = ""
                cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
        simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file
Example 10
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
            license = license_export(items[0])
            tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNhaplotyper "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {cosmic} {tx_orig_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper")
            cmd = ("gunzip -c {tx_orig_file} | "
                   "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                   "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                   "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                   "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                   "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                   "bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper: make headers GATK compatible")
            vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file
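The sed chain rewrites five INFO header types from Integer/Float to String so
GATK tooling accepts the records; the same edit in Python with re.sub, as a
sketch:

import re

def relax_info_types(header_line, fields=("ECNT", "HCNT", "NLOD", "TLOD", "PON")):
    for field in fields:
        header_line = re.sub(r"ID=%s,Number=1,Type=(Integer|Float)" % field,
                             "ID=%s,Number=1,Type=String" % field, header_line)
    return header_line

line = '##INFO=<ID=TLOD,Number=1,Type=Float,Description="Tumor LOD">'
assert "Type=String" in relax_info_types(line)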
Example 11
def annotate_with_depth(in_file, items):
    """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold)

    Currently annotates single sample and tumor samples in somatic analysis.
    """
    bam_file = None
    if len(items) == 1:
        bam_file = dd.get_align_bam(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            bam_file = paired.tumor_bam
    if bam_file:
        out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                if not in_file.endswith(".gz"):
                    in_file = vcfutils.bgzip_and_index(in_file, remove_orig=False,
                                                       out_dir=os.path.dirname(tx_out_file))
                ref_file = dd.get_ref_file(items[0])
                # cores are used for the BAM reader thread, so cap at 4 per the tool's recommendations
                cores = min([dd.get_num_cores(items[0]), 4])
                cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
                       "-o {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate SV depth with duphold")
        vcfutils.bgzip_and_index(out_file)
        return out_file
    else:
        return in_file
Example 12
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, config):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    broad_runner = broad.runner_from_config(config)
    if not broad_runner.has_gatk():
        return orig_file
    else:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                # Avoid issues with incorrectly created empty GATK index files,
                # which occur when GATK cannot lock the shared dbSNP database on a previous run
                idx_file = orig_file + ".idx"
                if os.path.exists(idx_file) and not utils.file_exists(idx_file):
                    os.remove(idx_file)
                annotations = get_gatk_annotations(config)
                params = ["-T", "VariantAnnotator",
                          "-R", ref_file,
                          "--variant", orig_file,
                          "--dbsnp", dbsnp_file,
                          "--out", tx_out_file,
                          "-L", orig_file]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                for x in annotations:
                    params += ["-A", x]
                broad_runner = broad.runner_from_config(config)
                broad_runner.run_gatk(params, memory_retry=True)
        vcfutils.bgzip_and_index(out_file, config)
        return out_file
Example 13
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    added.add(cur_name)
                    # Ideally we could symlink here, but that doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
Example 14
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low quality calls filtered but also labeled with REJECT.
    """
    from bcbio.heterogeneity import bubbletree
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            max_depth = bubbletree.max_normal_germline_depth(in_file, bubbletree.PARAMS, paired)
            tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                reader = _add_db_to_header(reader)
                with contextlib.closing(cyvcf2.Writer(tx_out_plain, reader)) as writer:
                    for rec in reader:
                        filters = rec.FILTER.split(";") if rec.FILTER else []
                        other_filters = [x for x in filters if x not in ["PASS", ".", "REJECT"]]
                        if len(other_filters) == 0 or bubbletree.is_info_germline(rec):
                            # Germline, check if we should include based on frequencies
                            if "REJECT" in filters or bubbletree.is_info_germline(rec):
                                stats = bubbletree._is_possible_loh(rec, reader, bubbletree.PARAMS, paired,
                                                                    use_status=True, max_normal_depth=max_depth)
                                if stats:
                                    rec.FILTER = "PASS"
                                    rec.INFO["DB"] = True
                                    writer.write_record(rec)
                            # Somatic, always include
                            else:
                                writer.write_record(rec)
            vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
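_add_db_to_header is not shown; with cyvcf2, the DB flag that the loop sets must
be declared in the header before writing. One plausible implementation, as a
sketch:

def _add_db_to_header(reader):
    """Declare the INFO DB flag so setting rec.INFO["DB"] = True is valid output."""
    reader.add_info_to_header({"ID": "DB", "Number": "0", "Type": "Flag",
                               "Description": "Likely germline variant"})
    return reader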
Example 15
def run_vep(data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    out_file = utils.append_stem(data["vrn_file"], "-vepeffects")
    assert data["vrn_file"].endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout"] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + dbnsfp_args + loftee_args
                cmd = "gunzip -c %s | %s | bgzip -c > %s" % (data["vrn_file"], " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example 16
def cutoff_w_expression(vcf_file, expression, data, name="+", filterext="",
                      extra_cmd="", limit_regions="variant_regions"):
    """Perform cutoff-based soft filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                intervals = ""
                if limit_regions == "variant_regions":
                    variant_regions = dd.get_variant_regions(data)
                    if variant_regions:
                        intervals = "-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()),
                       "Cutoff-based soft filtering %s with %s" % (vcf_file, expression), data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
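A usage sketch, assuming vcf_file and data are in scope: this tags failing
records with a named soft filter instead of removing them (bcftools -m '+'
appends to any existing FILTER values):

filtered_vcf = cutoff_w_expression(vcf_file, "%QUAL < 20 || DP < 4", data,
                                   name="lowqual", filterext="-lowqual")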
Example 17
def _filter_by_bedpe(vcf_file, bedpe_file, data):
    """Add filters to VCF based on pre-filtered bedpe file.

    Also removes problem calls in the output VCF with missing alleles.
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vcf_file)
    nogzip_out_file = out_file.replace(".vcf.gz", ".vcf")
    if not utils.file_exists(out_file):
        filters = {}
        with open(bedpe_file) as in_handle:
            for line in in_handle:
                parts = line.split("\t")
                name = parts[6]
                cur_filter = parts[-1].strip()
                if cur_filter != "PASS":
                    filters[name] = cur_filter
        with file_transaction(data, nogzip_out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with utils.open_gzipsafe(vcf_file) as in_handle:
                    for line in in_handle:
                        if not line.startswith("#"):
                            parts = line.split("\t")
                            # Problem breakends can have empty alleles when at contig ends
                            if not parts[3].strip():
                                parts[3] = "N"
                            cur_id = parts[2].split("_")[0]
                            cur_filter = filters.get(cur_id, "PASS")
                            if cur_filter != "PASS":
                                parts[6] = cur_filter
                            line = "\t".join(parts)
                        out_handle.write(line)
        if out_file.endswith(".gz"):
            vcfutils.bgzip_and_index(nogzip_out_file, data["config"])
    return out_file
Example 18
def combine_calls(batch_id, samples, data):
    """Combine multiple callsets into a final set of merged calls.
    """
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(x["variantcaller"] for x in samples[0]["variants"])))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    caller_names, vrn_files, bam_files = _organize_variants(samples, batch_id)
    exist_variants = False
    for tmp_vrn_file in vrn_files:
        if vcfutils.vcf_has_variants(tmp_vrn_file):
            exist_variants = True
            break
    if exist_variants:
        if "classifiers" not in edata["config"]["algorithm"]["ensemble"]:
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     edata["sam_ref"], edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file)
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    return [[batch_id, callinfo]]
Example 19
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusably slow
                if dd.get_ref_file_compressed(data):
                    hgvs_compatible = True
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    hgvs_compatible = False
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins += ["maxentscan", "spliceregion"]
                    for plugin in plugins:
                        plugin_args = plugin_fns[plugin](data)
                        config_args += plugin_args
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    if hgvs_compatible:
                        config_args += ["--hgvs", "--shift_hgvs", "1"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick_allele"]
                if ensembl_name.endswith("_merged"):
                    config_args += ["--merged"]
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats", "--cache",
                        "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds", "--uniprot", "--domains", "--regulatory",
                       "--protein", "--tsl", "--appris", "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                       "--pubmed", "--variant_class", "--allele_number"] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example 20
def _filter_nonref(in_file, data):
    """Remove NON_REF gVCF items from GATK VCF output; these occasionally sneak through in joint calling.
    """
    out_file = "%s-gatkclean%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = "gunzip -c {in_file} | grep -v NON_REF | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Remove stray NON_REF gVCF information from VCF output", data)
        vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
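Note that grep -v NON_REF drops every line mentioning NON_REF, header lines
included; a line-for-line Python equivalent of the filter step, as a sketch:

import gzip

def filter_nonref_lines(in_vcf_gz, out_vcf):
    """Drop any line mentioning NON_REF, mirroring the grep -v in the pipeline."""
    with gzip.open(in_vcf_gz, "rt") as in_handle, open(out_vcf, "w") as out_handle:
        for line in in_handle:
            if "NON_REF" not in line:
                out_handle.write(line)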
Example 21
def add_dbsnp(orig_file, dbsnp_file, config):
    """Annotate a VCF file with dbSNP.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(config, out_file) as tx_out_file:
            cmd = "bcftools annotate -c ID -a {dbsnp_file} -o {tx_out_file} -O z {orig_file}"
            do.run(cmd.format(**locals()), "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, config)
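For context, bcftools annotate -c ID copies only the ID column (rsIDs) from the
dbSNP file, and -O z writes bgzipped output directly. A usage sketch, assuming a
plain VCF and an indexed dbSNP file:

annotated = add_dbsnp("calls.vcf", "dbsnp.vcf.gz", config)
# -> calls-wdbsnp.vcf.gz, bgzipped and tabix indexed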
Example 22
def gatk_joint_calling(data, vrn_files, ref_file):
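    """Joint genotype gVCFs with GATK and filter the combined RNA-seq calls."""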
    joint_file = os.path.join("variation", "joint.vcf")
    out_file = os.path.join("variation", "combined.vcf")
    bgzjoint_file = os.path.join("variation", "joint.vcf.gz")
    bgzout_file = os.path.join("variation", "combined.vcf.gz")
    if not file_exists(bgzout_file):
        joint_file = _run_genotype_gvcfs(data, vrn_files, ref_file, joint_file)
        bgzip_and_index(joint_file, dd.get_config(data))
        out_file = gatk_filter_rnaseq(data, bgzjoint_file, out_file)
        bgzip_and_index(out_file, dd.get_config(data))
    return bgzout_file
Example 23
def clean_titration():
    """Subset to interval regions and bgzip/tabix.
    """
    region_bed = os.path.join(in_region_dir, "Intervals_TSAVP_Titr.bed")
    for in_vcf in glob.glob(os.path.join(in_vcf_dir, "NA1287*.vcf")):
        out_vcf = os.path.join(out_vcf_dir, "%s.gz" % os.path.basename(in_vcf))
        if not os.path.exists(out_vcf):
            cmd = ("bcftools view {in_vcf} -T {region_bed} | grep -v '##contig' | "
                   "sed 's/^chr//g' | bgzip -c > {out_vcf}")
            subprocess.check_call(cmd.format(**locals()), shell=True)
        vcfutils.bgzip_and_index(out_vcf)
Example 24
def combine_calls(*args):
    """Combine multiple callsets into a final set of merged calls.
    """
    if len(args) == 3:
        is_cwl = False
        batch_id, samples, data = args
        caller_names, vrn_files = _organize_variants(samples, batch_id)
    else:
        is_cwl = True
        samples = [utils.to_single_data(x) for x in args]
        samples = [cwlutils.unpack_tarballs(x, x) for x in samples]
        data = samples[0]
        batch_id = data["batch_id"]
        caller_names = data["variants"]["variantcallers"]
        vrn_files = data["variants"]["calls"]
    logger.info("Ensemble consensus calls for {0}: {1}".format(
        batch_id, ",".join(caller_names)))
    edata = copy.deepcopy(data)
    base_dir = utils.safe_makedir(os.path.join(edata["dirs"]["work"], "ensemble", batch_id))
    if any([vcfutils.vcf_has_variants(f) for f in vrn_files]):
        # Decompose multiallelic variants and normalize
        passonly = not tz.get_in(["config", "algorithm", "ensemble", "use_filtered"], edata, False)
        vrn_files = [normalize.normalize(f, data, passonly=passonly, rerun_effects=False, remove_oldeffects=True,
                                         nonrefonly=True,
                                         work_dir=utils.safe_makedir(os.path.join(base_dir, c)))
                     for c, f in zip(caller_names, vrn_files)]
        if "classifiers" not in (dd.get_ensemble(edata) or {}):
            callinfo = _run_ensemble_intersection(batch_id, vrn_files, caller_names, base_dir, edata)
        else:
            config_file = _write_config_file(batch_id, caller_names, base_dir, edata)
            callinfo = _run_ensemble(batch_id, vrn_files, config_file, base_dir,
                                     dd.get_ref_file(edata), edata)
            callinfo["vrn_file"] = vcfutils.bgzip_and_index(callinfo["vrn_file"], data["config"])
        # After decomposing multiallelic variants and normalizing, re-evaluate effects
        ann_ma_file, _ = effects.add_to_vcf(callinfo["vrn_file"], data)
        if ann_ma_file:
            callinfo["vrn_file"] = ann_ma_file

        edata["config"]["algorithm"]["variantcaller"] = "ensemble"
        edata["vrn_file"] = callinfo["vrn_file"]
        edata["ensemble_bed"] = callinfo["bed_file"]
        callinfo["validate"] = validate.compare_to_rm(edata)[0][0].get("validate")
    else:
        out_vcf_file = os.path.join(base_dir, "{0}-ensemble.vcf".format(batch_id))
        vcfutils.write_empty_vcf(out_vcf_file, samples=[dd.get_sample_name(d) for d in samples])
        callinfo = {"variantcaller": "ensemble",
                    "vrn_file": vcfutils.bgzip_and_index(out_vcf_file, data["config"]),
                    "bed_file": None}
    if is_cwl:
        callinfo["batch_samples"] = data["batch_samples"]
        callinfo["batch_id"] = batch_id
        return [{"ensemble": callinfo}]
    else:
        return [[batch_id, callinfo]]
Example 25
def clean_file(in_file, data, prefix=""):
    """Prepare a clean sorted input BED file without headers
    """
    if in_file:
        bedprep_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "bedprep"))
        out_file = os.path.join(bedprep_dir, "%s%s" % (prefix, os.path.basename(in_file)))
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = "grep -v ^track {in_file} | grep -v ^browser | sort -k1,1 -k2,2n > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare cleaned BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
Example 26
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                loftee_args, loftee_fields = _get_loftee(data)
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON", "PolyPhen", "SIFT", "Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--sift", "b", "--polyphen", "b", "--symbol", "--numbers", "--biotype", "--total_length",
                       "--canonical", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields)] + dbnsfp_args + loftee_args

                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):

                    # In case of clinical reporting, we need one and only one
                    # variant per gene
                    # From the VEP docs:
                    # "Pick once line of consequence data per variant,
                    # including transcript-specific columns. Consequences are
                    # chosen by the canonical, biotype status and length of the
                    # transcript, along with the ranking of the consequence
                    # type according to this table. This is the best method to
                    # use if you are interested only in one consequence per
                    #  variant.

                    cmd += ["--pick"]

                    # TODO investigate hgvs reporting but requires indexing the reference file
                    # cmd += ["--hgvs", "--shift-hgvs", "--fasta", dd.get_ref_file(data)]
                perllib = "export PERL5LIB=%s:$PERL5LIB" % _get_perllib()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perllib, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example 27
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                if is_human:
                    dbnsfp_args, dbnsfp_fields = _get_dbnsfp(data)
                    loftee_args, loftee_fields = _get_loftee(data)
                    prediction_args = ["--sift", "b", "--polyphen", "b"]
                    prediction_fields = ["PolyPhen", "SIFT"]
                else:
                    dbnsfp_args, dbnsfp_fields = [], []
                    loftee_args, loftee_fields = [], []
                    prediction_args, prediction_fields = [], []
                if tz.get_in(("config", "algorithm", "clinical_reporting"), data, False):
                    # In case of clinical reporting, we need one and only one variant per gene
                    # http://useast.ensembl.org/info/docs/tools/vep/script/vep_other.html#pick
                    # Also use hgvs reporting but requires indexing the reference file
                    clinical_args = ["--pick", "--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)]
                    clinical_fields = ["HGVSc", "HGVSp"]
                else:
                    clinical_args, clinical_fields = [], []
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical", "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + dbnsfp_fields + loftee_fields + clinical_fields)] + \
                       prediction_args + dbnsfp_args + loftee_args + clinical_args

                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example 28
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, data,
                         out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    broad_runner = broad.runner_from_config_safe(data["config"])
    if not broad_runner or not broad_runner.has_gatk() or broad_runner.gatk_type() == "gatk4":
        if dbsnp_file:
            return add_dbsnp(orig_file, dbsnp_file, data, out_file)
        else:
            return orig_file
    else:
        if out_file is None:
            out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                # Avoid issues with incorrectly created empty GATK index files,
                # which occur when GATK cannot lock the shared dbSNP database on a previous run
                idx_file = orig_file + ".idx"
                if os.path.exists(idx_file) and not utils.file_exists(idx_file):
                    os.remove(idx_file)
                annotations = get_gatk_annotations(data["config"], include_depth=False)
                params = ["-T", "VariantAnnotator",
                          "-R", ref_file,
                          "--variant", orig_file,
                          "--out", tx_out_file,
                          "-L", orig_file]
                if dbsnp_file:
                    params += ["--dbsnp", dbsnp_file]
                for bam_file in bam_files:
                    params += ["-I", bam_file]
                for x in annotations:
                    params += ["-A", x]
                if ("--allow_potentially_misencoded_quality_scores" not in params
                      and "-allowPotentiallyMisencodedQuals" not in params):
                    params += ["--allow_potentially_misencoded_quality_scores"]
                # be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq)
                # start by removing existing -U or --unsafe opts
                # (if another option is added to Gatk that starts with -U... this may create a bug)
                unsafe_options = [x for x in params if x.startswith(("-U", "--unsafe"))]
                for my_opt in unsafe_options:
                    ind_to_rem = params.index(my_opt)
                    # are the options given as separate strings or in one?
                    if my_opt.strip() == "-U" or my_opt.strip() == "--unsafe":
                        params.pop(ind_to_rem + 1)
                    params.pop(ind_to_rem)
                params.extend(["-U", "ALL"])
                broad_runner = broad.runner_from_config(data["config"])
                broad_runner.run_gatk(params)
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
Example 29
def _varscan_work(align_bams, ref_file, items, target_regions, out_file):
    """Perform SNP and indel genotyping with VarScan.
    """
    config = items[0]["config"]

    orig_out_file = out_file
    out_file = orig_out_file.replace(".vcf.gz", ".vcf")

    max_read_depth = "1000"
    version = programs.jar_versioner("varscan", "VarScan")(config)
    if version < "v2.3.6":
        raise IOError("Please install version 2.3.6 or better of VarScan"
                      " with support for multisample calling and indels"
                      " in VCF format.")
    varscan_jar = config_utils.get_jar("VarScan",
                                       config_utils.get_program("varscan", config, "dir"))
    sample_list = _create_sample_list(align_bams, out_file)
    mpileup = samtools.prep_mpileup(align_bams, ref_file, config, max_read_depth,
                                    target_regions=target_regions, want_bcf=False)
    # VarScan fails to generate a header on files that start with
    # zero-coverage calls; strip these with grep since we're not
    # going to call on them
    remove_zerocoverage = "grep -v -P '\t0\t\t$'"
    # write a temporary mpileup file so we can check if empty
    mpfile = "%s.mpileup" % os.path.splitext(out_file)[0]
    with file_transaction(config, mpfile) as mpfile_tx:
        cmd = ("{mpileup} | {remove_zerocoverage} > {mpfile_tx}")
        do.run(cmd.format(**locals()), "mpileup for Varscan")
    if os.path.getsize(mpfile) == 0:
        write_empty_vcf(out_file)
    else:
        with tx_tmpdir(items[0]) as tmp_dir:
            jvm_opts = _get_varscan_opts(config, tmp_dir)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cmd = ("cat {mpfile} "
                   "| java {jvm_opts} -jar {varscan_jar} mpileup2cns --min-coverage 5 --p-value 0.98 "
                   "  --vcf-sample-list {sample_list} --output-vcf --variants "
                   "| {fix_ambig} | vcfuniqalleles > {out_file}")
            do.run(cmd.format(**locals()), "Varscan", None,
                   [do.file_exists(out_file)])
    os.remove(sample_list)
    os.remove(mpfile)
    # VarScan can create completely empty files in regions without
    # variants, so we create a correctly formatted empty file
    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    else:
        freebayes.clean_vcf_output(out_file, _clean_varscan_line, config)

    if orig_out_file.endswith(".gz"):
        vcfutils.bgzip_and_index(out_file, config)
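
A minimal, self-contained sketch of what the zero-coverage grep above removes, assuming samtools mpileup's six-column layout (the example lines are illustrative, not from bcbio):

import re

# The grep above (-v -P '\t0\t\t$') drops mpileup lines that end with
# "<tab>0<tab><tab>": depth 0 with empty base and quality columns.
ZERO_COV = re.compile(r"\t0\t\t$")

lines = ["chr1\t100\tA\t2\t..\tII",  # covered position: kept
         "chr1\t101\tC\t0\t\t"]      # zero coverage: removed
assert [l for l in lines if not ZERO_COV.search(l)] == lines[:1]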
Example 30
0
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("variant_effect_predictor.pl", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                config_args, config_fields, prediction_fields = [], [], []
                if is_human:
                    plugin_fns = {"dbnsfp": _get_dbnsfp, "loftee": _get_loftee, "dbscsnv": _get_dbscsnv,
                                  "maxentscan": _get_maxentscan, "genesplicer": _get_genesplicer}
                    plugins = tz.get_in(("config", "resources", "vep", "plugins"), data, ["dbnsfp", "loftee"])
                    for plugin in plugins:
                        plugin_args, plugin_fields = plugin_fns[plugin](data)
                        config_args += plugin_args
                        config_fields += plugin_fields
                    config_args += ["--sift", "b", "--polyphen", "b"]
                    prediction_fields += ["PolyPhen", "SIFT"]
                    # Use HGVS by default, requires indexing the reference genome
                    config_args += ["--hgvs", "--shift_hgvs", "1", "--fasta", dd.get_ref_file(data)]
                    config_fields += ["HGVSc", "HGVSp"]
                if (dd.get_effects_transcripts(data).startswith("canonical")
                      or tz.get_in(("config", "algorithm", "clinical_reporting"), data)):
                    config_args += ["--pick"]
                std_fields = ["Consequence", "Codons", "Amino_acids", "Gene", "SYMBOL", "Feature",
                              "EXON"] + prediction_fields + ["Protein_position", "BIOTYPE", "CANONICAL", "CCDS"]
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                cmd = [vep, "--vcf", "-o", "stdout", "-i", in_file] + fork_args + extra_args + \
                      ["--species", ensembl_name,
                       "--no_stats",
                       "--cache", "--offline", "--dir", vep_dir,
                       "--symbol", "--numbers", "--biotype", "--total_length", "--canonical",
                       "--gene_phenotype", "--ccds",
                       "--fields", ",".join(std_fields + config_fields)] + config_args
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd), tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
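
The sed expression in the command above (/^#/! s/;;/;/g) only touches non-header lines; a tiny Python equivalent to show the intent (function name is illustrative):

def collapse_double_semis(line):
    # Leave '#' header lines alone, collapse ';;' elsewhere so empty
    # VEP fields don't break downstream INFO parsing.
    return line if line.startswith("#") else line.replace(";;", ";")

assert collapse_double_semis("1\t1\t.\tA\tG\t.\t.\tCSQ=x;;y") == "1\t1\t.\tA\tG\t.\t.\tCSQ=x;y"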
Example 31
0
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    simple_vcf = os.path.join(work_dir, "%s-%s-simple.vcf.gz" % (sample, caller))
    if not utils.file_exists(simple_vcf):
        gene_list = _find_gene_list_from_bed(prioritize_by, out_file, data)
        # If we have a standard gene list we can skip BED based prioritization
        if gene_list:
            priority_vcf = os.path.join(work_dir, os.path.basename(vcf_file))
            utils.symlink_plus(vcf_file, priority_vcf)
        # otherwise prioritize based on BED and proceed
        else:
            priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
            if not utils.file_exists(priority_vcf):
                with file_transaction(data, priority_vcf) as tx_out_file:
                    resources = config_utils.get_resources("bcbio_prioritize", data["config"])
                    jvm_opts = " ".join(resources.get("jvm_opts", ["-Xms1g", "-Xmx4g"]))
                    export = utils.local_path_export()
                    cmd = ("{export} bcbio-prioritize {jvm_opts} known -i {vcf_file} -o {tx_out_file} "
                           " -k {prioritize_by}")
                    do.run(cmd.format(**locals()), "Prioritize: select in known regions of interest")
        if post_prior_fn:
            priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

        data_dir = os.path.dirname(os.path.realpath(utils.which("simple_sv_annotation.py")))
        with file_transaction(data, simple_vcf) as tx_out_file:
            fusion_file = os.path.join(data_dir, "fusion_pairs.txt")
            opts = ""
            if os.path.exists(fusion_file):
                opts += " --known_fusion_pairs %s" % fusion_file
            if not gene_list:
                opts += " --gene_list %s" % os.path.join(data_dir, "az-cancer-panel.txt")
            else:
                opts += " --gene_list %s" % gene_list
            cmd = "simple_sv_annotation.py {opts} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(**locals()), "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(vcfutils.sort_by_ref(simple_vcf, data), data["config"])
    if not utils.file_uptodate(out_file, simple_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            export = utils.local_path_export()
            cmd = ("{export} zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE,S${sample}$PR}}' > {tx_out_file}")
            do.run(cmd.format(**locals()), "Prioritize: convert to tab delimited")
    return out_file, simple_vcf
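
For reference, the vawk command above emits one row per passing variant; a plausible reading of its column order (the names here are illustrative, not taken from bcbio):

# Caller and sample first, then position, SV type (BND records get a
# mate-aware label), impact annotations, and per-sample read support.
PRIORITIZE_COLUMNS = ["caller", "sample", "chrom", "start", "end", "svtype",
                      "lof", "simple_ann", "split_reads", "paired_ends",
                      "paired_reads"]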
Example 32
0
def run_combine_gvcfs(vrn_files, region, ref_file, out_file, data):
    if not utils.file_exists(out_file):
        broad_runner = broad.runner_from_config(data["config"])
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CombineGVCFs", "-R", ref_file, "-o", tx_out_file]
            if region:
                params += ["-L", bamprep.region_to_gatk(region)]
            for vrn_file in vrn_files:
                params += ["--variant", vrn_file]
            cores = dd.get_cores(data)
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.new_resources("gatk-haplotype")
            broad_runner.run_gatk(params, memscale=memscale)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example 33
0
def _select_sample(data, variant_file, work_dir):
    """Select current sample from original call file.
    """
    sample_name = dd.get_sample_name(data)
    if dd.get_phenotype(data) == "germline":
        variant_file = germline.fix_germline_samplename(variant_file, sample_name, data)

    out_file = os.path.join(work_dir, "%s-%s.vcf.gz" % (utils.splitext_plus(os.path.basename(variant_file))[0],
                                                        sample_name))
    if not utils.file_uptodate(out_file, variant_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = "bcftools view -s {sample_name} -O z -o {tx_out_file} {variant_file}"
            do.run(cmd.format(**locals()), "Run manta SV analysis")
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example 34
0
def to_single(in_file, data):
    """Convert multi-allelic inputs in the original VCF file into single alleles.
    """
    out_file = "%s-nomultiallelic%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        if vcfutils.vcf_has_variants(in_file):
            ready_ma_file = _decompose(in_file, data)
            ann_ma_file, _ = effects.add_to_vcf(ready_ma_file, data)
            if ann_ma_file:
                ready_ma_file = ann_ma_file
            out_file = ready_ma_file
        else:
            utils.symlink_plus(in_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example 35
0
def merge_overlaps(in_file, data, distance=None, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like FreeBayes
    that don't collapse BEDs prior to using them.
    """
    if in_file:
        bedtools = config_utils.get_program("bedtools", data["config"])
        work_dir = tz.get_in(["dirs", "work"], data)
        if out_dir:
            bedprep_dir = out_dir
        elif work_dir:
            bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
        else:
            bedprep_dir = os.path.dirname(in_file)
        out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                distance = "-d %s" % distance if distance else ""
                cmd = "{bedtools} merge {distance} -i {in_file} > {tx_out_file}"
                do.run(cmd.format(**locals()), "Prepare merged BED file", data)
        vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
        return out_file
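
Conceptually, bedtools merge collapses sorted intervals that overlap or sit within -d distance of each other; a small pure-Python sketch of that behavior using the docstring's example:

def merge_intervals(intervals, distance=0):
    # Collapse intervals whose start falls within `distance` of the
    # running end, the behavior merge_overlaps delegates to bedtools.
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1] + distance:
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return [tuple(iv) for iv in merged]

# The docstring's FreeBayes-problem case: 1:1-100 and 1:90-100 become 1:1-100.
assert merge_intervals([(1, 100), (90, 100)]) == [(1, 100)]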
Example 36
0
def finalize_vcf(in_file, variantcaller, items):
    """Perform cleanup and annotation of the final VCF.
    """
    out_file = "%s-annotated%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            cl = _add_vcf_header_sample_cl(in_file, items, out_file)
            if cl:
                cmd = "{cl} | bgzip -c > {tx_out_file}"
                do.run(cmd.format(**locals()), "Annotate")
    if utils.file_exists(out_file):
        return vcfutils.bgzip_and_index(out_file, items[0]["config"])
    else:
        return in_file
Example 37
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
            if workflow_file:
                _run_workflow(items[0], workflow_file, tx_work_dir)
            else:
                vcfutils.write_empty_vcf(out_file, items[0]["config"], [dd.get_sample_name(d) for d in items])
        raw_file = os.path.join(work_dir, "results", "variants",
                                "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example 38
0
def _apply_priority_filter(in_file, priority_file, data):
    """Annotate variants with priority information and use to apply filters.
    """
    out_file = "%s-priority%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            header = ('##INFO=<ID=EPR,Number=.,Type=String,'
                      'Description="Somatic prioritization based on external annotations, '
                      'identify as likely germline">')
            header_file = "%s-repeatheader.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                out_handle.write(header)
            if "tumoronly_germline_filter" in dd.get_tools_on(data):
                filter_cmd = ("bcftools filter -m '+' -s 'LowPriority' "
                              """-e "EPR[*] != 'pass'" |""")
            else:
                filter_cmd = ""
            cmd = ("bcftools annotate -a {priority_file} -h {header_file} "
                   "-c CHROM,FROM,TO,REF,ALT,INFO/EPR {in_file} | "
                   "{filter_cmd} bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "Run external annotation based prioritization filtering")
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Example 39
0
def _cnn_score_variants(in_file, tensor_type, data):
    """Score variants with pre-trained CNN models.
    """
    out_file = "%s-cnnscore.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CNNScoreVariants", "--variant", in_file, "--reference", dd.get_ref_file(data),
                    "--output", tx_out_file, "--input", dd.get_align_bam(data)]
            params += ["--tensor-type", tensor_type]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example 40
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Return DeepVariant calling on germline samples.

    region can be a single region or list of multiple regions for multicore calling.
    """
    assert not vcfutils.is_paired_analysis(align_bams, items), \
        ("DeepVariant currently only supports germline calling: %s" %
         (", ".join([dd.get_sample_name(d) for d in items])))
    assert len(items) == 1, \
        ("DeepVariant currently only supports single sample calling: %s" %
         (", ".join([dd.get_sample_name(d) for d in items])))
    out_file = _run_germline(align_bams[0], items[0], ref_file,
                             region, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example 41
0
def _run_genomicsdb_import(vrn_files, region, out_file, data):
    """Create a GenomicsDB reference for all the variation files: GATK4.

    Not yet tested at scale; may need to explore --batchSize to reduce
    memory usage.

    Does not support transactional directories yet, since
    GenomicsDB databases cannot be moved to new locations. We try to
    identify half-finished databases and restart:
    https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4

    Known issue -- Genomics DB workspace path core dumps on longer paths:
    (std::string::compare(char const*))
    """
    out_dir = "%s_genomicsdb" % utils.splitext_plus(out_file)[0]
    if not os.path.exists(out_dir) or _incomplete_genomicsdb(out_dir):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        with utils.chdir(os.path.dirname(out_file)):
            with file_transaction(data, out_dir) as tx_out_dir:
                broad_runner = broad.runner_from_config(data["config"])
                cores = dd.get_cores(data)
                params = ["-T", "GenomicsDBImport",
                          "--reader-threads", str(cores),
                          "--genomicsdb-workspace-path", os.path.relpath(out_dir, os.getcwd()),
                          "-L", bamprep.region_to_gatk(region)]
                for vrn_file in vrn_files:
                    vcfutils.bgzip_and_index(vrn_file, data["config"])
                samplemap = _create_samplemap_file(vrn_files)
                params += ["--sample-name-map", samplemap]
                # For large inputs, reduce memory usage by batching
                # https://github.com/bcbio/bcbio-nextgen/issues/2852
                if len(vrn_files) > 200:
                    params += ["--batch-size", "50"]
                memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
                broad_runner.run_gatk(params, memscale=memscale)
    return out_dir
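
_create_samplemap_file is not shown in this listing; here is a plausible sketch, assuming GATK's documented --sample-name-map format (tab-delimited sample name and gVCF path, one per line) and deriving sample names from file names:

import os

def create_samplemap_file(vrn_files, out_file="samples.map"):
    # Hypothetical stand-in: one "sample<TAB>/path/to/sample.g.vcf.gz"
    # line per input, as GenomicsDBImport expects.
    with open(out_file, "w") as out_handle:
        for vrn_file in vrn_files:
            sample = os.path.basename(vrn_file).split(".")[0]  # assumption: name from file
            out_handle.write("%s\t%s\n" % (sample, vrn_file))
    return out_file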
Example 42
0
def run(align_bams, items, ref_file, assoc_files, region, out_file):
    """Run platypus variant calling, germline whole genome or exome.
    """
    assert out_file.endswith(".vcf.gz")
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            for align_bam in align_bams:
                bam.index(align_bam, items[0]["config"])
            cmd = [
                "platypus", "callVariants",
                "--regions=%s" % _subset_regions(region, out_file, items),
                "--bamFiles=%s" % ",".join(align_bams),
                "--refFile=%s" % dd.get_ref_file(items[0]), "--output=-",
                "--logFileName", "/dev/null", "--verbosity=1"
            ]
            resources = config_utils.get_resources("platypus",
                                                   items[0]["config"])
            if resources.get("options"):
                # normalize options so we can set defaults without overwriting user specified
                for opt in resources["options"]:
                    if "=" in opt:
                        key, val = opt.split("=")
                        cmd.extend([key, val])
                    else:
                        cmd.append(opt)
            if any("gvcf" in dd.get_tools_on(d) for d in items):
                cmd += ["--outputRefCalls", "1", "--refCallBlockSize", "50000"]
            # Adjust default filter thresholds to achieve similar sensitivity/specificity to other callers.
            # Currently not used after doing more cross validation, as they increase
            # false positives; the low false positive rate of the defaults seems to be
            # a major advantage for Platypus users.
            # tuned_opts = ["--hapScoreThreshold", "10", "--scThreshold", "0.99", "--filteredReadsFrac", "0.9",
            #               "--rmsmqThreshold", "20", "--qdThreshold", "0", "--abThreshold", "0.0001",
            #               "--minVarFreq", "0.0", "--assemble", "1"]
            # for okey, oval in utils.partition_all(2, tuned_opts):
            #     if okey not in cmd:
            #         cmd.extend([okey, oval])

            # Avoid filtering duplicates on high depth targeted regions where we don't mark duplicates
            if any(not dd.get_mark_duplicates(data) for data in items):
                cmd += ["--filterDuplicates=0"]
            post_process_cmd = (
                " | %s | %s | %s | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | "
                "vcfstreamsort | bgzip -c > %s" %
                (vcfutils.fix_ambiguous_cl(), vcfutils.fix_ambiguous_cl(5),
                 vcfutils.add_contig_to_header_cl(items[0]), tx_out_file))
            do.run(" ".join(cmd) + post_process_cmd,
                   "platypus variant calling")
        out_file = vcfutils.bgzip_and_index(out_file, items[0]["config"])
    return out_file
Example 43
0
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data

    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = sample + "_with-gc.vcf.gz"
    parse_file = sample + "_gc-depth-parse.tsv"
    qc_file = sample + "_bcbio_variants.txt"
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        bed_file, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
        return data
Example 44
0
def mutect2_caller(align_bams, items, ref_file, assoc_files,
                       region=None, out_file=None):
    """Call variation with GATK's MuTect2.

    This requires the full, non-open-source version of GATK 3.5+.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        paired = vcfutils.get_paired_bams(align_bams, items)
        broad_runner = broad.runner_from_config(items[0]["config"])
        gatk_type = broad_runner.gatk_type()
        _prep_inputs(align_bams, ref_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            params = ["-T", "Mutect2" if gatk_type == "gatk4" else "MuTect2",
                      "--annotation", "ClippingRankSumTest",
                      "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--reference", ref_file]
            else:
                params += ["-R", ref_file]
            for a in annotation.get_gatk_annotations(items[0]["config"], include_baseqranksum=False):
                params += ["--annotation", a]
            # Avoid issues with BAM CIGAR reads that GATK doesn't like
            if gatk_type == "gatk4":
                params += ["--read-validation-stringency", "LENIENT"]
            params += _add_tumor_params(paired, items, gatk_type)
            params += _add_region_params(region, out_file, items, gatk_type)
            # Avoid adding dbSNP/Cosmic so they do not get fed to variant filtering algorithm
            # Not yet clear how this helps or hurts in a general case.
            #params += _add_assoc_params(assoc_files)
            resources = config_utils.get_resources("mutect2", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            assert LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.5"), \
                "Require full version of GATK 3.5+ for mutect2 calling"
            broad_runner.new_resources("mutect2")
            gatk_cmd = broad_runner.cl_gatk(params, os.path.dirname(tx_out_file))
            if gatk_type == "gatk4":
                tx_raw_prefilt_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                tx_raw_file = "%s-raw-filt%s" % utils.splitext_plus(tx_out_file)
                filter_cmd = _mutect2_filter(broad_runner, tx_raw_prefilt_file, tx_raw_file, ref_file)
                cmd = "{gatk_cmd} -O {tx_raw_prefilt_file} && {filter_cmd}"
            else:
                tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
                cmd = "{gatk_cmd} > {tx_raw_file}"
            do.run(cmd.format(**locals()), "MuTect2")
            out_file = _af_filter(paired.tumor_data, tx_raw_file, out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example 45
0
def _rename_allelic_fraction_field(orig_file, config, out_file):
    """Rename allelic fraction field in mutect output
       from FA to FREQ to standarize with other tools
    """
    out_file_noc = out_file.replace(".vcf.gz", ".vcf")
    with file_transaction(config, out_file_noc) as tx_out_file:
        with open_gzipsafe(orig_file) as in_handle:
            with open(tx_out_file, 'w') as out_handle:
                for line in in_handle:
                    if line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA", "=FREQ")
                    if not line.startswith("#"):
                        line = line.replace("FA", "FREQ")
                    out_handle.write(line)
    return bgzip_and_index(out_file_noc, config)
Example 46
0
def _get_ploidy(regions, items, base_file):
    samples = [dd.get_sample_name(d) for d in items]
    out_file = "%s-ploidy.vcf" % utils.splitext_plus(base_file)[0]
    if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"):
        with file_transaction(items[0], out_file) as tx_outfile:
            with open(tx_outfile, "w") as h:
                h.write("##fileformat=VCFv4.1\n")
                h.write('##INFO=<ID=END,Number=1,Type=Integer,Description="End position of the variant described in this record">\n')
                h.write('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number genotype for imprecise events">\n')
                h.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" + "\t".join(samples) + "\n")
                for region in regions:
                    ploidies = [ploidy.get_ploidy([d], region) for d in items]
                    h.write("\t".join([region[0], str(region[1]), ".", "N", "<CNV>", ".", ".",
                                       "END=%s" % region[2], "CN"] + [str(x) for x in ploidies]) + "\n")
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example 47
0
def _filter_vcf(orig_file, ftype="max", name="ColorCustom"):
    """Filter VCF with bcftools, providing count summary of items removed.
    """
    exprs = {}
    exprs["max"] = ('SUM(AD[*]) < 15 || '
                    'PL[0] / SUM(AD[*]) <= 3.0 || '
                    'GC < 20.0 || GC > 77.0 || '
                    'RPT[*] = "rmsk" || '
                    'RPT[*] = "lcr"')
    exprs["min2"] = ('SUM(AD[*]) < 15 || '
                     'PL[0] / SUM(AD[*]) <= 3.0 || '
                     'RPT[*] = "lcr"')
    exprs["min1"] = ('SUM(AD[*]) < 15 || '
                     'PL[0] / SUM(AD[*]) <= 3.0 || '
                     '(RPT[*] = "lcr" && RPT[*] = "rmsk")')
    exprs["min0"] = ('SUM(AD[*]) < 15 || ' 'PL[0] / SUM(AD[*]) <= 3.0')
    exprs["all"] = 'GC < 1.0'
    expr = exprs[ftype]
    base, ext = utils.splitext_plus(orig_file)
    out_file = "%s-filter%s%s" % (base, ftype, ext)
    if not utils.file_exists(out_file):
        with file_transaction({}, out_file) as tx_out_file:
            cmd = ("bcftools filter -O z -o {tx_out_file} "
                   "-m '+' -e '{expr}' -s '{name}' {orig_file}")
            do.run(cmd.format(**locals()), "Hard filter VCF")
    vcfutils.bgzip_and_index(out_file, {})

    def count(f):
        with gzip.open(f, "rt") as h:
            return sum(1 for line in h if not line.startswith("#")
                       and line.split("\t")[6] in ["PASS", "."])

    removed_stats = {"orig": count(orig_file), "final": count(out_file)}
    removed_stats["pct"] = float(
        removed_stats["final"]) * 100.0 / removed_stats["orig"]
    return out_file, removed_stats
Example 48
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file):
    if not utils.file_exists(out_file):
        work_dir = "%s-work" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        raw_file = os.path.join(
            work_dir, "results", "variants",
            "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        out_file = annotation.annotate_nongatk_vcf(raw_file, align_bams,
                                                   assoc_files.get("dbsnp"),
                                                   ref_file, items[0],
                                                   out_file)
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example 49
0
def merge_gvcfs(data, region, vrn_files, out_file):
    """Simple merging of gVCFs with gvcftools.

    merge_variants does not appear to work correctly, so we remove gVCF parts
    with extract_variants and then combine the merged samples together.

    Longer term we plan to replace this with
    agg (https://github.com/Illumina/agg) or
    GLnexus (https://github.com/dnanexus-rnd/GLnexus).
    """
    if not utils.file_exists(out_file):
        region = bamprep.region_to_gatk(region)
        vcfutils.merge_variant_files([_extract_variants_from_gvcf(f, region, out_file, data) for f in vrn_files],
                                     out_file, dd.get_ref_file(data), data["config"], region)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example 50
0
def filter_to_pass_and_reject(in_file, paired, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low-quality filtered calls, even when they are also labeled REJECT.
    """
    from bcbio.heterogeneity import bubbletree
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                reader = _add_db_to_header(reader)
                with contextlib.closing(cyvcf2.Writer(tx_out_plain,
                                                      reader)) as writer:
                    for rec in reader:
                        filters = rec.FILTER.split(";") if rec.FILTER else []
                        other_filters = [
                            x for x in filters
                            if x not in ["PASS", ".", "REJECT"]
                        ]
                        if len(other_filters) == 0:
                            # Germline, check if we should include based on frequencies
                            if "REJECT" in filters or rec.INFO.get(
                                    "STATUS", "").lower() == "germline":
                                stats = bubbletree._is_possible_loh(
                                    rec, reader, bubbletree.PARAMS, paired)
                                if stats:
                                    rec.INFO["DB"] = True
                                    writer.write_record(rec)
                            # Somatic, always include
                            else:
                                writer.write_record(rec)
            vcfutils.bgzip_and_index(tx_out_plain, paired.tumor_data["config"])
    return out_file
Example 51
0
def add_dbsnp(orig_file, dbsnp_file, data, out_file=None):
    """Annotate a VCF file with dbSNP.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    if out_file is None:
        out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            conf_file = os.path.join(os.path.dirname(tx_out_file),
                                     "dbsnp.conf")
            with open(conf_file, "w") as out_handle:
                out_handle.write('[[annotation]]\n')
                out_handle.write('file="%s"\n' % os.path.normpath(
                    os.path.join(dd.get_work_dir(data), dbsnp_file)))
                out_handle.write('fields=["ID"]\n')
                out_handle.write('names=["rs_ids"]\n')
                out_handle.write('ops=["concat"]\n')
            ref_file = dd.get_ref_file(data)
            cmd = (
                "vcfanno {conf_file} {orig_file} | "
                "bcftools annotate --set-id +'%INFO/rs_ids' -o {tx_out_file} -O z"
            )
            do.run(cmd.format(**locals()), "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example 52
0
def _decompose(in_file, data):
    """Convert multi-allelic variants into single allelic.
    """
    out_file = "%s-decompose%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        assert out_file.endswith(".vcf.gz")
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("gunzip -c %s | "
                   "sed 's/ID=AD,Number=./ID=AD,Number=R/' | "
                   "vt decompose -s - "
                   """| awk '{ gsub("./-65", "./."); print $0 }'"""
                   "| bgzip -c > %s")
            do.run(cmd % (in_file, tx_out_file),
                   "Multi-allelic to single allele")
    return vcfutils.bgzip_and_index(out_file, data["config"])
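
What vt decompose does to a record, in miniature (a conceptual sketch only; the real tool also rewrites genotypes and AD counts, which is why the sed/awk fixups above exist):

def decompose_record(chrom, pos, ref, alts):
    # One multi-allelic record becomes one bi-allelic record per ALT.
    return [(chrom, pos, ref, alt) for alt in alts]

assert decompose_record("1", 100, "A", ["G", "T"]) == [("1", 100, "A", "G"),
                                                       ("1", 100, "A", "T")]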
Example 53
0
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file):
    if not utils.file_exists(out_file):
        work_dir = "%s-work" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file,
                                                region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        raw_file = os.path.join(
            work_dir, "results", "variants",
            "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(
            os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
Example 54
0
def filter_to_pass_and_reject(in_file, data, out_dir=None):
    """Filter VCF to only those with a strict PASS/REJECT: somatic + germline.

    Removes low-quality filtered calls, even when they are also labeled REJECT.
    """
    out_file = "%s-prfilter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            tx_out_plain = tx_out_file.replace(".vcf.gz", ".vcf")
            with contextlib.closing(cyvcf2.VCF(in_file)) as reader:
                with contextlib.closing(cyvcf2.Writer(tx_out_plain,
                                                      reader)) as writer:
                    for rec in reader:
                        filters = rec.FILTER.split(";") if rec.FILTER else []
                        filters = [
                            x for x in filters
                            if x not in ["PASS", ".", "REJECT"]
                        ]
                        if len(filters) == 0:
                            writer.write_record(rec)
            vcfutils.bgzip_and_index(tx_out_plain, data["config"])
    return out_file
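
The FILTER test above, in miniature: a record survives only if every filter label is PASS, ".", or REJECT:

def keeps(filter_field):
    # cyvcf2 reports PASS as a None FILTER, hence the empty-list default.
    filters = filter_field.split(";") if filter_field else []
    return all(x in ["PASS", ".", "REJECT"] for x in filters)

assert keeps(None) and keeps("REJECT")
assert not keeps("REJECT;LowQual")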
Example 55
0
def sort_to_ref(fname, ref_file, add_chr):
    """Match reference genome ordering.
    """
    out_file = "%s-prep.vcf.gz" % (fname.replace(".vcf.gz", ""))
    if not os.path.exists(out_file):
        if add_chr:
            fix_chrom = r'| sed "s/^\([0-9]\+\)\t/chr\1\t/g" | sed "s/^MT/chrM/g" | sed "s/^X/chrX/g" | sed "s/^Y/chrY/g" '
        else:
            fix_chrom = ''
        contig_cl = vcfutils.add_contig_to_header_cl(ref_file, out_file)
        cmd = ("gunzip -c {fname} {fix_chrom} | "
               "gsort /dev/stdin {ref_file}.fai | {contig_cl} | "
               "bgzip -c > {out_file}")
        subprocess.check_call(cmd.format(**locals()), shell=True)
    return vcfutils.bgzip_and_index(out_file, {})
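
The chromosome-renaming sed chain above, restated as a per-contig Python function (illustrative only; the sed variant works on whole VCF lines, matching the contig at the start of each line):

import re

def add_chr_prefix(chrom):
    # Numeric contigs gain "chr"; MT maps to chrM; X/Y map to chrX/chrY;
    # anything else (alt contigs, scaffolds) passes through unchanged.
    if re.match(r"^[0-9]+$", chrom):
        return "chr" + chrom
    return {"MT": "chrM", "X": "chrX", "Y": "chrY"}.get(chrom, chrom)

assert [add_chr_prefix(c) for c in ["1", "MT", "GL000219.1"]] == ["chr1", "chrM", "GL000219.1"]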
Example 56
0
def clean_file(in_file, data, prefix="", bedprep_dir=None, simple=None):
    """Prepare a clean sorted input BED file without headers
    """
    # Remove non-ASCII characters. Used in coverage analysis to support JSON
    #   in one column and keep sambamba happy:
    simple = "iconv -c -f utf-8 -t ascii | sed 's/ //g' |" if simple else ""
    if in_file:
        if not bedprep_dir:
            bedprep_dir = utils.safe_makedir(
                os.path.join(data["dirs"]["work"], "bedprep"))
        # Avoid running multiple times with same prefix
        if prefix and os.path.basename(in_file).startswith(prefix):
            return in_file
        out_file = os.path.join(bedprep_dir,
                                "%s%s" % (prefix, os.path.basename(in_file)))
        out_file = out_file.replace(".interval_list", ".bed")
        if out_file.endswith(".gz"):
            out_file = out_file[:-3]
        if not utils.file_uptodate(out_file, in_file):
            check_bed_contigs(in_file, data)
            check_bed_coords(in_file, data)
            with file_transaction(data, out_file) as tx_out_file:
                bcbio_py = sys.executable
                cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
                sort_cmd = get_sort_cmd(os.path.dirname(tx_out_file))
                cmd = (
                    "{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^@ | "
                    "grep -v ^# | {simple} "
                    "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
                    "{sort_cmd} -k1,1 -k2,2n > {tx_out_file}")
                do.run(cmd.format(**locals()), "Prepare cleaned BED file",
                       data)
        vcfutils.bgzip_and_index(out_file,
                                 data.get("config", {}),
                                 remove_orig=False)
        return out_file
Example 57
0
def subset_by_callers(in_file, callers):
    out_file = "%s-%s.vcf" % (in_file.replace(".vcf", "").replace(
        ".gz", ""), "_".join(callers))
    if not os.path.exists(out_file) and not os.path.exists(out_file + ".gz"):
        want_callers = set(callers)
        reader = VariantFile(in_file)
        writer = VariantFile(out_file, "w", header=reader.header)
        count = 0
        for rec in reader:
            cur_callers = set(rec.info["set"].split("-"))
            if len(cur_callers & want_callers) > 0:
                count += 1
                writer.write(rec)
        print(callers, count)
    return vcfutils.bgzip_and_index(out_file, {})
Example 58
0
def hard_w_expression(vcf_file,
                      expression,
                      data,
                      name="+",
                      filterext="",
                      extra_cmd="",
                      limit_regions="variant_regions"):
    """Perform hard filtering using bcftools expressions like %QUAL < 20 || DP < 4.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                variant_regions = (
                    utils.get_in(data,
                                 ("config", "algorithm", "variant_regions"))
                    if limit_regions == "variant_regions" else None)
                intervals = (
                    "-T %s" %
                    vcfutils.bgzip_and_index(variant_regions, data["config"])
                    if variant_regions else "")
                cmd = (
                    "{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                    "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}"
                )
                do.run(cmd.format(**locals()),
                       "Hard filtering %s with %s" % (vcf_file, expression),
                       data)
            else:
                shutil.copy(vcf_file, out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
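
A hypothetical call, assuming a populated bcbio sample dict `data` is in scope; the expression syntax is plain bcftools filtering, as in the docstring:

#   filtered = hard_w_expression("sample-gatk.vcf.gz", "%QUAL < 20 || DP < 4",
#                                data, name="lowQD", filterext="-lowqd")
# Failing records keep flowing through with "lowQD" appended to FILTER
# (bcftools -m '+' soft filtering) rather than being removed.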
Example 59
0
def _setup_call_false(vrn_file, rm_bed, base_dir, data, call_type):
    """Create set of false positives or ngatives for inputs with empty truth sets.
    """
    out_file = os.path.join(base_dir, "%s.vcf.gz" % call_type)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vrn_file.endswith(".gz"):
                vrn_file = vcfutils.bgzip_and_index(
                    vrn_file, out_dir=os.path.dirname(tx_out_file))
            cmd = (
                "bcftools view -R {rm_bed} -f 'PASS,.' {vrn_file} -O z -o {tx_out_file}"
            )
            do.run(cmd.format(**locals()),
                   "Prepare %s with empty reference" % call_type, data)
    return {call_type: out_file}
Example 60
0
def fix_germline_samplename(in_file, sample_name, data):
    """Replace germline sample names, originally from normal BAM file.
    """
    out_file = "%s-fixnames%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            sample_file = "%s-samples.txt" % utils.splitext_plus(
                tx_out_file)[0]
            with open(sample_file, "w") as out_handle:
                out_handle.write("%s\n" % sample_name)
            cmd = (
                "bcftools reheader -s {sample_file} {in_file} -o {tx_out_file}"
            )
            do.run(cmd.format(**locals()),
                   "Fix germline samplename: %s" % sample_name)
    return vcfutils.bgzip_and_index(out_file, data["config"])
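
The sample file consumed by bcftools reheader -s is simply the new sample names, one per line and in column order; for this single-name case it reduces to (sample name here is illustrative):

with open("germline-samples.txt", "w") as out_handle:
    out_handle.write("NA12878\n")
# then: bcftools reheader -s germline-samples.txt in.vcf.gz -o out.vcf.gz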