Esempio n. 1
0
def concat_variant_files_catvariants(orig_files, out_file, regions, ref_file, config):
    """Join per-region variant files into one output using GATK CatVariants.

    The inputs are listed in a file produced by ``_get_file_list`` and handed
    to CatVariants with ``-assumeSorted``. When CatVariants exits with one of
    a known set of error messages, the transactional output is removed and
    the work is delegated to ``_run_concat_variant_files_bcftools``; any other
    failure is re-raised.

    Nothing is run when ``out_file`` already exists. Outputs ending in ``.gz``
    are passed through ``bgzip_and_index`` before returning.

    Returns ``out_file``, or the bcftools fallback's return value when used.
    """
    # Message fragments from CatVariants that trigger the bcftools fallback.
    known_gatk_failures = ("We require all VCFs to have complete VCF headers",
                           "Features added out of order",
                           "The reference allele cannot be missing")
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        use_bcftools = False
        with file_transaction(config, out_file) as tx_out_file:
            cat_args = ["org.broadinstitute.gatk.tools.CatVariants"]
            cat_args += ["-R", ref_file]
            cat_args += ["-V", input_file_list]
            cat_args += ["-out", tx_out_file]
            cat_args += ["-assumeSorted"]
            tx_dir = os.path.dirname(tx_out_file)
            jvm_opts = broad.get_gatk_framework_opts(config, tx_dir, include_gatk=False)
            try:
                gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, cat_args)
                do.run(gatk_cl, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as err:
                err_text = str(err)
                if not any(fragment in err_text for fragment in known_gatk_failures):
                    raise
                # Remove the transactional file before switching tools
                # (presumably so a partial result is never published).
                os.remove(tx_out_file)
                use_bcftools = True
        if use_bcftools:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
Esempio n. 2
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Write a copy of ``in_bam`` with GATK PrintReads read filters applied.

    The input BAM is indexed first. The output is named after the input with
    a ``-gatkfilter.bam`` suffix and is only generated when it does not exist
    yet. ``--fix_misencoded_quality_scores`` is added when the sample's
    quality format is "illumina". The output is indexed before returning.

    Returns the path of the filtered BAM file.
    """
    config = data["config"]
    bam.index(in_bam, config)
    filtered_bam = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(filtered_bam):
        with tx_tmpdir(data) as tmp_dir, file_transaction(data, filtered_bam) as tx_out_file:
            gatk_args = ["-T", "PrintReads", "-R", ref_file, "-I", in_bam, "--out", tx_out_file]
            gatk_args.extend(["--filter_mismatching_base_and_quals",
                              "--filter_bases_not_stored",
                              "--filter_reads_with_N_cigar"])
            quality_format = dd.get_quality_format(data, "").lower()
            if quality_format == "illumina":
                gatk_args.append("--fix_misencoded_quality_scores")
            jvm_opts = broad.get_gatk_framework_opts(config, tmp_dir)
            gatk_exe = config_utils.get_program("gatk-framework", config)
            do.run([gatk_exe] + jvm_opts + gatk_args, "Filter problem reads")
    bam.index(filtered_bam, config)
    return filtered_bam
Esempio n. 3
0
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to slower CombineVariants if CatVariants fails because of the
    GATK "complete VCF headers" stringency check; every other failure is
    re-raised.

    Args:
        orig_files: per-region variant files to concatenate.
        out_file: path of the concatenated output.
        regions: regions used to order the inputs (via ``_sort_by_region``).
        ref_file: reference genome passed to GATK with ``-R``.
        config: configuration passed to the helper functions.

    Returns:
        ``out_file``, or whatever ``combine_variant_files`` returns when the
        fallback is used.

    Raises:
        subprocess.CalledProcessError: when CatVariants fails for a reason
            other than the header stringency check.
    """
    if not utils.file_exists(out_file):
        use_combine = False
        with file_transaction(out_file) as tx_out_file:
            sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
            exist_files = [x for x in sorted_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
            with open(input_file_list, "w") as out_handle:
                for fname in ready_files:
                    out_handle.write(fname + "\n")
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, include_gatk=False)
            cmd = [config_utils.get_program("gatk-framework", config)] + params + jvm_opts
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                # Substring test: str.find() returns -1 (truthy) when the text
                # is absent, so it must not be used as a boolean.
                if "We require all VCFs to have complete VCF headers" in str(msg):
                    # Drop any partial output and run the fallback only after
                    # the transaction has closed, so the two do not compete
                    # for out_file.
                    if os.path.exists(tx_out_file):
                        os.remove(tx_out_file)
                    use_combine = True
                else:
                    raise
        if use_combine:
            return combine_variant_files(orig_files, out_file, ref_file, config)
    return out_file
def variants(data):
    """Annotate a sample's variants with GC content and tabulate GC vs. depth.

    Works inside ``<work_dir>/report/variants``. Two files named after the
    input VCF are produced when missing:

    * ``<sample>_with-gc.vcf.gz``: GATK VariantAnnotator output with the
      ``GCContent`` annotation, restricted to the variant regions.
    * ``<sample>_cg-depth-parse.tsv``: a header line followed by the
      ``bcftools query`` output of GC, DP and SAMPLE for those regions.

    Args:
        data: sample dictionary; reads ``vrn_file``, ``work_bam`` and
            ``config``.

    Returns:
        ``data`` unchanged. Samples without a ``vrn_file`` key are returned
        immediately.
    """
    if "vrn_file" not in data:
        return data
    in_vcf = data['vrn_file']
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = data['work_bam']
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # NOTE(review): jvm_opts is computed but never added to the java
        # command line below -- confirm whether it should be.
        jvm_opts = broad.get_gatk_framework_opts(data['config'])
        gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
        bed_file = dd.get_variant_regions(data)
        sample = splitext_plus(os.path.basename(in_vcf))[0]
        # Relative names: both files land in the current (report) directory.
        cg_file = sample + "_with-gc.vcf.gz"
        parse_file = sample + "_cg-depth-parse.tsv"
        if not file_exists(cg_file):
            with file_transaction(cg_file) as tx_out:
                cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                       "-L {bed_file} -I {in_bam} "
                       "-A GCContent --variant {in_vcf} --out {tx_out}")
                do.run(cmd.format(gatk_jar=gatk_jar, ref_file=ref_file, bed_file=bed_file,
                                  in_bam=in_bam, in_vcf=in_vcf, tx_out=tx_out),
                       " GC bias for %s" % in_vcf)

        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                # Write the header first; bcftools appends (>>) its rows after it.
                # write() replaces the Python 2-only "print >>handle" statement.
                with open(out_tx, 'w') as out_handle:
                    out_handle.write("CG\tdepth\tsample\n")
                cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R  {bed_file} {cg_file} >> {out_tx}")
                do.run(cmd.format(bed_file=bed_file, cg_file=cg_file, out_tx=out_tx),
                       " query for %s" % in_vcf)
                logger.debug('parsing coverage: %s' % sample)
        # return df
        return data
Esempio n. 5
0
def gatk_filter_rnaseq(data, vrn_file, out_file):
    """Apply GATK VariantFiltration with RNA-seq oriented hard filters.

    Flags clusters of 3 variants within a 35 nucleotide window, high Fisher
    strand values (FS > 30.0) and low quality by depth (QD < 2.0), following
    https://software.broadinstitute.org/gatk/guide/article?id=3891

    Equivalent command line from that guide:
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf

    Skips the run when ``out_file`` already exists. Returns ``out_file``.
    """
    # The runner is created but not used further in this function.
    _runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            cluster_args = ["--clusterWindowSize", "35", "--clusterSize", "3"]
            fs_args = ["--filterExpression", "\"'FS > 30.0'\"", "--filterName", "FS"]
            qd_args = ["--filterExpression", "\"'QD < 2.0'\"", "--filterName", "QD"]
            gatk_args = (["-T", "VariantFiltration", "-R", ref_file, "-V", vrn_file] +
                         cluster_args + fs_args + qd_args + ["-o", tx_out_file])
            tx_dir = os.path.dirname(tx_out_file)
            jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data), tx_dir)
            gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, gatk_args)
            do.run(gatk_cl, "Filter variants.")
    return out_file
Esempio n. 6
0
def calc_variants_stats(data, args):
    """Annotate variants with GC content and tabulate GC vs. depth per sample.

    Two files named after the input VCF are produced in ``args.out`` when
    missing:

    * ``<sample>_with-gc.vcf.gz``: GATK VariantAnnotator output with the
      ``GCContent`` annotation, restricted to ``args.region``.
    * ``<sample>_cg-depth-parse.tsv``: a header line followed by the
      ``bcftools query`` output of GC, DP and SAMPLE for those regions.

    Args:
        data: sample dictionary; reads ``vcf``, ``bam`` and ``config``.
        args: options object; reads ``reference``, ``region`` and ``out``.

    Returns:
        Path to the tab-delimited GC/depth file.
    """
    in_vcf = data['vcf']
    ref_file = args.reference
    # NOTE(review): jvm_opts is computed but never added to the java command
    # line below -- confirm whether it should be.
    jvm_opts = broad.get_gatk_framework_opts(data['config'])
    gatk_jar = config_utils.get_program("gatk", data['config'], "dir")
    bed_file = args.region
    sample = splitext_plus(op.basename(in_vcf))[0]
    in_bam = data['bam']
    cg_file = op.join(args.out, sample + "_with-gc.vcf.gz")
    parse_file = op.join(args.out, sample + "_cg-depth-parse.tsv")
    if not file_exists(cg_file):
        with file_transaction(cg_file) as tx_out:
            cmd = ("java -jar {gatk_jar}/GenomeAnalysisTK.jar -T VariantAnnotator -R {ref_file} "
                   "-L {bed_file} -I {in_bam} "
                   "-A GCContent --variant {in_vcf} --out {tx_out}")
            do.run(cmd.format(gatk_jar=gatk_jar, ref_file=ref_file, bed_file=bed_file,
                              in_bam=in_bam, in_vcf=in_vcf, tx_out=tx_out),
                   " cg for %s" % in_vcf)

    if not file_exists(parse_file):
        with file_transaction(parse_file) as out_tx:
            # Write the header first; bcftools appends (>>) its rows after it.
            # write() replaces the Python 2-only "print >>handle" statement.
            with open(out_tx, 'w') as out_handle:
                out_handle.write("CG\tdepth\tsample\n")
            cmd = ("bcftools query -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R  {bed_file} {cg_file} >> {out_tx}")
            do.run(cmd.format(bed_file=bed_file, cg_file=cg_file, out_tx=out_tx),
                   " query for %s" % in_vcf)
            logger.info('parsing coverage: %s' % sample)
    # return df
    return parse_file
Esempio n. 7
0
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Filter individual genotypes with GATK VariantFiltration.

    The expression is applied as a genotype filter under ``name``, so FT tags
    are added to genotypes instead of setting the general FILTER flag. The
    output is named ``<base>-filter<filterext><ext>`` after the input and is
    only generated when missing.

    Returns the output path; ``.vcf.gz`` outputs are passed through
    ``vcfutils.bgzip_and_index`` and its return value is used.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_fasta = tz.get_in(["reference", "fasta", "base"], data)
            gatk_args = ["-T", "VariantFiltration", "-R", ref_fasta]
            gatk_args += ["--variant", vcf_file, "--out", tx_out_file]
            gatk_args += ["--genotypeFilterName", name]
            gatk_args += ["--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, gatk_args)
            do.run(gatk_cl, "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        return vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Esempio n. 8
0
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Build the GATK PrintReads command line for one region of a BAM file.

    Adds downsampling when ``prep_params`` has a truthy ``max_depth`` and a
    ``-BQSR`` recalibration table when ``recal`` is "gatk" and the sample's
    ``prep_recal`` file passes ``_recal_has_reads``. Any other truthy
    ``recal`` value raises NotImplementedError.

    Returns the command as a list: program, JVM options, then arguments.
    """
    cl_args = ["-T", "PrintReads"]
    cl_args.extend(["-L", region_to_gatk(region)])
    cl_args.extend(["-R", data["sam_ref"]])
    cl_args.extend(["-I", data["work_bam"]])
    max_depth = prep_params.get("max_depth")
    if max_depth:
        cl_args.extend(["--downsample_to_coverage", str(max_depth)])
    recal_method = prep_params["recal"]
    if recal_method == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            cl_args.extend(["-BQSR", data["prep_recal"]])
    elif recal_method:
        raise NotImplementedError("Recalibration method %s" % recal_method)
    scale_down = {"direction": "decrease", "magnitude": 3}
    jvm_opts = broad.get_gatk_framework_opts(data["config"], memscale=scale_down)
    gatk_exe = config_utils.get_program("gatk-framework", data["config"])
    return [gatk_exe] + jvm_opts + cl_args
Esempio n. 9
0
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php

    Args:
        orig_files: list of VCF paths, or a dict holding that list under the
            key named by ``config["file_key"]`` (pipeline mode).
        out_file: path of the combined output; reused when it already exists.
        ref_file: reference genome passed to GATK with ``-R``.
        config: configuration dictionary.
        quiet_out: when true, suppress the command line header and set key.
        region: optional region; when given, the run is limited to its
            intersection with the configured ``variant_regions``.

    Returns:
        ``out_file`` in plain mode. In pipeline mode, a one-element list with
        a dict holding the output under the file key plus ``region``,
        ``sam_ref`` and ``config``.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            # Name each input so the priority list can refer to it; earlier
            # inputs come first in the list.
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                # Command line arguments must be strings, not ints.
                params += ["-nt", str(min(cores, 4))]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Esempio n. 10
0
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Build the GATK PrintReads command line extracting reads for a region.

    The command always targets the lightweight "gatk-framework" program. The
    former full-GATK branch was guarded by a flag that was always False and
    has been removed as unreachable.

    Args:
        data: sample dictionary; reads ``work_bam`` and ``config``, and the
            reference via ``dd.get_ref_file``.
        region: region to extract, converted with ``region_to_gatk``.
        prep_params: unused here; kept for interface compatibility.
        tmp_dir: temporary directory passed to the JVM option builder.

    Returns:
        The command as produced by ``broad.gatk_cmd``.
    """
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", dd.get_ref_file(data),
            "-I", data["work_bam"]]
    jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
    return broad.gatk_cmd("gatk-framework", jvm_opts, args)
Esempio n. 11
0
def _split_mulitallelic(in_file, data):
    """Write biallelic and multiallelic subsets of ``in_file`` with GATK.

    Each subset is produced by SelectVariants with ``-restrictAllelesTo`` and
    named after the input with a ``-biallelic`` / ``-multiallelic`` suffix.
    Existing outputs are reused; both are passed to
    ``vcfutils.bgzip_and_index`` either way.

    Returns a tuple ``(biallelic_path, multiallelic_path)``.
    """
    name_parts = utils.splitext_plus(in_file)
    ba_out = "%s-biallelic%s" % name_parts
    ma_out = "%s-multiallelic%s" % name_parts
    targets = ((ba_out, "BIALLELIC"), (ma_out, "MULTIALLELIC"))
    for subset_file, allele_type in targets:
        if not utils.file_exists(subset_file):
            with file_transaction(data, subset_file) as tx_out_file:
                select_args = ["-T", "SelectVariants"]
                select_args += ["-R", dd.get_ref_file(data)]
                select_args += ["--variant", in_file, "--out", tx_out_file]
                select_args += ["-restrictAllelesTo", allele_type]
                jvm_opts = broad.get_gatk_framework_opts(data["config"])
                do.run(["gatk-framework"] + jvm_opts + select_args,
                       "Select %s variants" % allele_type)
        vcfutils.bgzip_and_index(subset_file, data["config"])
    return ba_out, ma_out
Esempio n. 12
0
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file, recalibrating if configured.

    Builds a PrintReads command line for ``region``. When ``recal`` is "gatk"
    and the sample has a recalibration table that passes
    ``_recal_has_reads``, ``-BQSR`` is added and the command is built through
    the full GATK runner; otherwise the "gatk-framework" program is used.

    Args:
        data: sample dictionary; reads ``sam_ref``, ``work_bam``, ``config``
            and optionally ``prep_recal``.
        region: region to extract, converted with ``region_to_gatk``.
        prep_params: preparation settings; reads ``recal``.
        tmp_dir: temporary directory handed to the full GATK runner.

    Returns:
        The command line produced by ``runner.cl_gatk`` or ``broad.gatk_cmd``.

    Raises:
        NotImplementedError: for a truthy ``recal`` value other than "gatk".
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads", "-L", region_to_gatk(region), "-R", data["sam_ref"], "-I", data["work_bam"]]
    if prep_params["recal"] == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            requires_gatkfull = True
            args += ["-BQSR", data["prep_recal"]]
    elif prep_params["recal"]:
        raise NotImplementedError("Recalibration method %s" % prep_params["recal"])
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    else:
        jvm_opts = broad.get_gatk_framework_opts(data["config"])
        # Pass the PrintReads arguments built above, not the prep_params dict.
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
Esempio n. 13
0
def _filter_bad_reads(in_bam, ref_file, config):
    """Write a copy of ``in_bam`` with GATK's mismatching base/quality filter.

    The input BAM is indexed first. The output is named after the input with
    a ``-gatkfilter.bam`` suffix and is only generated when it does not exist
    yet.

    Returns the path of the filtered BAM file.
    """
    bam.index(in_bam, config)
    filtered_bam = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if utils.file_exists(filtered_bam):
        return filtered_bam
    with utils.curdir_tmpdir({"config": config}) as tmp_dir:
        with file_transaction(filtered_bam) as tx_out_file:
            gatk_args = ["-T", "PrintReads"]
            gatk_args += ["-R", ref_file, "-I", in_bam]
            gatk_args += ["--out", tx_out_file]
            gatk_args += ["--filter_mismatching_base_and_quals"]
            jvm_opts = broad.get_gatk_framework_opts(config, tmp_dir)
            gatk_exe = config_utils.get_program("gatk-framework", config)
            do.run([gatk_exe] + jvm_opts + gatk_args, "Filter problem reads")
    return filtered_bam
Esempio n. 14
0
def _count_rRNA_reads(in_bam, out_file, ref_file, rRNA_interval, single_end, config):
    """Use GATK counter to count reads in rRNA genes.

    The input BAM is indexed, the rRNA intervals are converted with
    ``_transform_browser_coor`` into ``rRNA.list`` next to ``out_file``, and
    GATK CountReads is run over them with its log written to the output.
    Nothing is run when ``out_file`` already exists.

    Args:
        in_bam: BAM file to count reads in.
        out_file: path receiving the CountReads log.
        ref_file: reference genome passed to GATK with ``-R``.
        rRNA_interval: rRNA intervals to convert and pass with ``-L``.
        single_end: unused in this function.
        config: configuration passed to the helper functions.

    Returns:
        ``out_file``, whether it was just created or already existed.
    """
    bam.index(in_bam, config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            rRNA_coor = os.path.join(os.path.dirname(out_file), "rRNA.list")
            _transform_browser_coor(rRNA_interval, rRNA_coor)
            params = ["-T", "CountReads",
                      "-R", ref_file,
                      "-I", in_bam,
                      "-log", tx_out_file,
                      "-L", rRNA_coor,
                      "--filter_reads_with_N_cigar"]
            jvm_opts = broad.get_gatk_framework_opts(config)
            cmd = [config_utils.get_program("gatk-framework", config)] + jvm_opts + params
            do.run(cmd, "counts rRNA for %s" % in_bam)
    # Return outside the guard so a pre-existing output is returned too.
    return out_file
Esempio n. 15
0
def _filter_paired(tumor, normal, out_file, reference, data):
    """Run the GATK SomaticPindelFilter over a paired tumor/normal VCF.

    The existing ``out_file`` is first moved aside to ``<base>-tmp.vcf`` and
    used as the filter input; the filtered result is written back to
    ``out_file``.

    :param    tumor: (str) sample name for tumor
    :param    normal: (str) sample name for normal
    :param    out_file: (str) final vcf file
    :param    reference: (str) genome in fasta format
    :param    data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    unfiltered_vcf = utils.splitext_plus(out_file)[0] + "-tmp.vcf"
    shutil.move(out_file, unfiltered_vcf)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        filter_args = ["-T", "SomaticPindelFilter"]
        filter_args += ["-V", unfiltered_vcf, "-o", tx_out_file]
        filter_args += ["-TID", tumor, "-NID", normal]
        filter_args += ["-R", reference]
        jvm_opts = broad.get_gatk_framework_opts(config)
        gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, filter_args)
        do.run(gatk_cl, "Filter pindel variants")
    return out_file
Esempio n. 16
0
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Merge same-sample VCF files into one output with GATK CombineVariants.

    Used where files were split (for example into SNPs/Indels) for processing
    and need to be merged back. Every input is given a name (v0, v1, ...) and
    the names are passed as the priority list in input order.

    ``orig_files`` is either a list of files or, in pipeline mode, a dict
    holding that list under the key named by ``config["file_key"]``. Existing
    outputs are reused, and ``.gz`` outputs are passed through
    ``bgzip_and_index``.

    Returns ``out_file``, or in pipeline mode a one-element list with a dict
    holding the output under the file key plus region, reference and config.
    """
    in_pipeline = isinstance(orig_files, dict)
    if in_pipeline:
        file_key = config["file_key"]
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            prep_args = [[vcf, config] for vcf in orig_files]
            ready_files = run_multicore(p_bgzip_and_index, prep_args, config)
            gatk_args = ["-T", "CombineVariants", "-R", ref_file, "--out", tx_out_file]
            labels = []
            for idx, ready_vcf in enumerate(ready_files):
                label = "v%s" % idx
                gatk_args += ["--variant:{name}".format(name=label), ready_vcf]
                labels.append(label)
            gatk_args += ["--rod_priority_list", ",".join(labels)]
            if quiet_out:
                gatk_args += ["--suppressCommandLineHeader", "--setKey", "null"]
            variant_regions = config["algorithm"].get("variant_regions", None)
            cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
            if cur_region:
                gatk_args += ["-L", bamprep.region_to_gatk(cur_region)]
                gatk_args += ["--interval_set_rule", "INTERSECTION"]
            jvm_opts = broad.get_gatk_framework_opts(config)
            gatk_exe = config_utils.get_program("gatk-framework", config)
            do.run([gatk_exe] + jvm_opts + gatk_args, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if not in_pipeline:
        return out_file
    return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
Esempio n. 17
0
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Join per-region variant files into one output using GATK CatVariants.

    Inputs are ordered with ``_sort_by_region``; only existing files that
    contain variants are used, unless none do, in which case all existing
    files are merged. The prepared inputs are written to a
    ``<out base>-files.list`` file and handed to CatVariants.

    When CatVariants exits with one of a known set of error messages, the
    transactional output is removed and the work is delegated to
    ``_run_concat_variant_files_bcftools``; any other failure is re-raised.

    Nothing is run when ``out_file`` already exists. Outputs ending in ``.gz``
    are passed through ``bgzip_and_index`` before returning.

    Returns ``out_file``, or the bcftools fallback's return value when used.
    """
    # Message fragments from CatVariants that trigger the bcftools fallback.
    known_gatk_failures = ("We require all VCFs to have complete VCF headers",
                           "Features added out of order",
                           "The reference allele cannot be missing")
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [f for f in sorted_files if os.path.exists(f) and vcf_has_variants(f)]
        if not exist_files:
            # no non-empty inputs, merge the empty ones
            exist_files = [f for f in sorted_files if os.path.exists(f)]
        prep_args = [[f, config] for f in exist_files]
        ready_files = run_multicore(p_bgzip_and_index, prep_args, config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as list_handle:
            for ready_vcf in ready_files:
                list_handle.write(ready_vcf + "\n")
        use_bcftools = False
        with file_transaction(config, out_file) as tx_out_file:
            cat_args = ["org.broadinstitute.gatk.tools.CatVariants"]
            cat_args += ["-R", ref_file]
            cat_args += ["-V", input_file_list]
            cat_args += ["-out", tx_out_file]
            cat_args += ["-assumeSorted"]
            tx_dir = os.path.dirname(tx_out_file)
            jvm_opts = broad.get_gatk_framework_opts(config, tx_dir, include_gatk=False)
            try:
                gatk_cl = broad.gatk_cmd("gatk-framework", jvm_opts, cat_args)
                do.run(gatk_cl, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as err:
                err_text = str(err)
                if not any(fragment in err_text for fragment in known_gatk_failures):
                    raise
                os.remove(tx_out_file)
                use_bcftools = True
        if use_bcftools:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
Esempio n. 18
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Produce a BAM with reads that trip up GATK and Picard filtered out.

    Runs GATK PrintReads with the mismatching base/quality, bases-not-stored
    and N-cigar read filters, adding ``--fix_misencoded_quality_scores`` when
    the sample's quality format is "illumina". Input and output BAMs are both
    indexed; an existing ``<input base>-gatkfilter.bam`` is reused.

    Returns the path of the filtered BAM file.
    """
    config = data["config"]
    bam.index(in_bam, config)
    result_bam = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(result_bam):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, result_bam) as tx_out_file:
                read_filters = ["--filter_mismatching_base_and_quals",
                                "--filter_bases_not_stored",
                                "--filter_reads_with_N_cigar"]
                print_reads = ["-T", "PrintReads",
                               "-R", ref_file,
                               "-I", in_bam,
                               "--out", tx_out_file] + read_filters
                if dd.get_quality_format(data, "").lower() == "illumina":
                    print_reads += ["--fix_misencoded_quality_scores"]
                jvm_opts = broad.get_gatk_framework_opts(config, tmp_dir)
                launcher = [config_utils.get_program("gatk-framework", config)]
                do.run(launcher + jvm_opts + print_reads, "Filter problem reads")
    bam.index(result_bam, config)
    return result_bam
Esempio n. 19
0
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Filter individual genotypes with GATK VariantFiltration.

    The expression is applied as a genotype filter under ``name``, so FT tags
    are added to genotypes instead of setting the general FILTER flag. The
    output is named ``<base>-filter<filterext><ext>`` after the input and is
    only generated when missing.

    Returns the output path; ``.vcf.gz`` outputs are passed through
    ``vcfutils.bgzip_and_index`` and its return value is used.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            ref_fasta = tz.get_in(["reference", "fasta", "base"], data)
            filter_args = ["-T", "VariantFiltration", "-R", ref_fasta]
            filter_args += ["--variant", vcf_file, "--out", tx_out_file]
            filter_args += ["--genotypeFilterName", name]
            filter_args += ["--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            gatk_exe = config_utils.get_program("gatk-framework", data["config"])
            do.run([gatk_exe] + jvm_opts + filter_args,
                   "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        return vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
Esempio n. 20
0
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if CatVariants fails due to GATK stringency
    issues (incomplete VCF headers or out-of-order features); every other
    failure is re-raised.

    Args:
        orig_files: per-region variant files to concatenate.
        out_file: path of the concatenated output; reused when it exists.
        regions: regions used to order the inputs (via ``_sort_by_region``).
        ref_file: reference genome passed to GATK with ``-R``.
        config: configuration passed to the helper functions.

    Returns:
        ``out_file``, or whatever ``concat_variant_files_bcftools`` returns
        when the fallback is used.

    Raises:
        subprocess.CalledProcessError: when CatVariants fails for a reason
            other than the known stringency issues.
    """
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [x for x in sorted_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, include_gatk=False)
            cmd = [config_utils.get_program("gatk-framework", config)] + params + jvm_opts
            try:
                do.run(cmd, "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("We require all VCFs to have complete VCF headers" in str(msg) or
                      "Features added out of order" in str(msg)):
                    # CatVariants may have failed before writing anything, so
                    # only remove the partial output when it is there.
                    if os.path.exists(tx_out_file):
                        os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return concat_variant_files_bcftools(input_file_list, out_file, ref_file, config)
    return out_file
Esempio n. 21
0
def combine_variant_files(orig_files,
                          out_file,
                          ref_file,
                          config,
                          quiet_out=True,
                          region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php

    Args:
        orig_files: list of VCF paths, or a dict holding that list under the
            key named by ``config["file_key"]`` (pipeline mode).
        out_file: path of the combined output; reused when it already exists.
        ref_file: reference genome passed to GATK with ``-R``.
        config: configuration dictionary.
        quiet_out: when true, suppress the command line header and set key.
        region: optional region; when given, the run is limited to its
            intersection with the configured ``variant_regions``.

    Returns:
        ``out_file`` in plain mode. In pipeline mode, a one-element list with
        a dict holding the output under the file key plus ``region``,
        ``sam_ref`` and ``config``.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index,
                                        [[x, config] for x in exist_files],
                                        config)
            params = [
                "-T", "CombineVariants", "-R", ref_file, "--out", tx_out_file
            ]
            # Name each input so the priority list can refer to it; earlier
            # inputs come first in the list.
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(
                    ["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(
                    ["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get(
                    "variant_regions", None)
                cur_region = shared.subset_variant_regions(
                    variant_regions, region, out_file)
                if cur_region:
                    params += [
                        "-L",
                        bamprep.region_to_gatk(cur_region),
                        "--interval_set_rule", "INTERSECTION"
                    ]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                # Command line arguments must be strings, not ints.
                params += ["-nt", str(min(cores, 4))]
            memscale = {
                "magnitude": 0.9 * cores,
                "direction": "increase"
            } if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(
                config, os.path.dirname(tx_out_file), memscale=memscale)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                   "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{
            file_key: out_file,
            "region": region,
            "sam_ref": ref_file,
            "config": config
        }]
    else:
        return out_file