예제 #1
0
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Perform genotype based filtering using GATK with the provided expression.

    Adds FT tags to genotypes, rather than the general FILTER flag.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "-T",
                "VariantFiltration",
                "-R",
                tz.get_in(["reference", "fasta", "base"], data),
                "--variant",
                vcf_file,
                "--out",
                tx_out_file,
                "--genotypeFilterName",
                name,
                "--genotypeFilterExpression",
                "'%s'" % expression,
            ]
            jvm_opts = broad.get_gatk_framework_opts(data["config"])
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
예제 #2
0
def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high fischer strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = [
                "VariantFiltration", "-R", ref_file, "-V", vrn_file,
                "--cluster-window-size", "35", "--cluster-size", "3",
                "--filter-expression", "'FS > 30.0'", "--filter-name", "FS",
                "--filter-expression", "'QD < 2.0'", "--filter-name", "QD",
                "--output", tx_out_file
            ]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config,
                                           os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config),
                   "Filter RNA-seq variants.")
    return out_file
예제 #3
0
def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high fischer strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--cluster-window-size", "35",
                      "--cluster-size", "3",
                      "--filter-expression", "'FS > 30.0'",
                      "--filter-name", "FS",
                      "--filter-expression", "'QD < 2.0'",
                      "--filter-name", "QD",
                      "--output", tx_out_file]
            # Use GATK4 for filtering, tools_off is for variant calling
            config = utils.deepish_copy(dd.get_config(data))
            if "gatk4" in dd.get_tools_off({"config": config}):
                config["algorithm"]["tools_off"].remove("gatk4")
            jvm_opts = broad.get_gatk_opts(config, os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk", jvm_opts, params, config), "Filter RNA-seq variants.")
    return out_file
예제 #4
0
def gatk_filter_rnaseq(vrn_file, data):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high fischer strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    out_file = "%s-filter%s" % utils.splitext_plus(vrn_file)
    if not file_exists(out_file):
        ref_file = dd.get_ref_file(data)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", ref_file,
                      "-V", vrn_file,
                      "--clusterWindowSize", "35",
                      "--clusterSize", "3",
                      "--filterExpression", "\"'FS > 30.0'\"",
                      "--filterName", "FS",
                      "--filterExpression", "\"'QD < 2.0'\"",
                      "--filterName", "QD",
                      "-o", tx_out_file]
            jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data), os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter RNA-seq variants.")
    return out_file
예제 #5
0
def concat_variant_files_catvariants(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Uses GATK CatVariants as a lightweight approach to merging VCF files split
    by regions with the same sample information, so no complex merging needed.
    Handles both plain text and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if fails due to GATK stringency issues.
    """
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file), include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("We require all VCFs to have complete VCF headers" in str(msg) or
                      "Features added out of order" in str(msg) or
                      "The reference allele cannot be missing" in str(msg)):
                    os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
예제 #6
0
def gatk_filter_rnaseq(data, vrn_file, out_file):
    """
    this incorporates filters listed here, dropping clusters of variants
    within a 35 nucleotide window, high fischer strand values and low
    quality by depth
    https://software.broadinstitute.org/gatk/guide/article?id=3891
    java -jar GenomeAnalysisTK.jar -T VariantFiltration -R hg_19.fasta -V
    input.vcf -window 35 -cluster 3 -filterName FS -filter "FS > 30.0"
    -filterName QD -filter "QD < 2.0" -o output.vcf
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    if file_exists(out_file):
        return out_file
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "VariantFiltration",
                  "-R", ref_file,
                  "-V", vrn_file,
                  "--clusterWindowSize", "35",
                  "--clusterSize", "3",
                  "--filterExpression", "\"'FS > 30.0'\"",
                  "--filterName", "FS",
                  "--filterExpression", "\"'QD < 2.0'\"",
                  "--filterName", "QD",
                  "-o", tx_out_file]
        jvm_opts = broad.get_gatk_framework_opts(dd.get_config(data), os.path.dirname(tx_out_file))
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
               "Filter variants.")
    return out_file
예제 #7
0
def concat_variant_files_catvariants(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Uses GATK CatVariants as a lightweight approach to merging VCF files split
    by regions with the same sample information, so no complex merging needed.
    Handles both plain text and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if fails due to GATK stringency issues.
    """
    if not utils.file_exists(out_file):
        input_file_list = _get_file_list(orig_files, out_file, regions, ref_file, config)
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file), include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("We require all VCFs to have complete VCF headers" in str(msg) or
                      "Features added out of order" in str(msg) or
                      "The reference allele cannot be missing" in str(msg)):
                    os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
예제 #8
0
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://software.broadinstitute.org/gatk/documentation/tooldocs/current/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", min(cores, 4)]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file), memscale=memscale,
                                                     parallel_gc=True)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
예제 #9
0
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if fails due to GATK stringency issues.
    """
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [
            x for x in sorted_files
            if os.path.exists(x) and vcf_has_variants(x)
        ]
        if len(exist_files) == 0:  # no non-empty inputs, merge the empty ones
            exist_files = [x for x in sorted_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index,
                                    [[x, config] for x in exist_files], config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = [
                "org.broadinstitute.gatk.tools.CatVariants", "-R", ref_file,
                "-V", input_file_list, "-out", tx_out_file, "-assumeSorted"
            ]
            jvm_opts = broad.get_gatk_framework_opts(
                config, os.path.dirname(tx_out_file), include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params),
                       "Concat variant files",
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("We require all VCFs to have complete VCF headers"
                        in str(msg)
                        or "Features added out of order" in str(msg) or
                        "The reference allele cannot be missing" in str(msg)):
                    os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list,
                                                      out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
예제 #10
0
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.

    Will parallelize up to 4 cores based on documented recommendations:
    https://www.broadinstitute.org/gatk/gatkdocs/
    org_broadinstitute_gatk_tools_walkers_variantutils_CombineVariants.php
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            params = ["-T", "CombineVariants",
                      "-R", ref_file,
                      "--out", tx_out_file]
            priority_order = []
            for i, ready_file in enumerate(ready_files):
                name = "v%s" % i
                params.extend(["--variant:{name}".format(name=name), ready_file])
                priority_order.append(name)
            params.extend(["--rod_priority_list", ",".join(priority_order)])
            params.extend(["--genotypemergeoption", "PRIORITIZE"])
            if quiet_out:
                params.extend(["--suppressCommandLineHeader", "--setKey", "null"])
            if region:
                variant_regions = config["algorithm"].get("variant_regions", None)
                cur_region = shared.subset_variant_regions(variant_regions, region, out_file)
                if cur_region:
                    params += ["-L", bamprep.region_to_gatk(cur_region),
                               "--interval_set_rule", "INTERSECTION"]
            cores = tz.get_in(["algorithm", "num_cores"], config, 1)
            if cores > 1:
                params += ["-nt", min(cores, 4)]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            jvm_opts = broad.get_gatk_framework_opts(config, memscale=memscale)
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
예제 #11
0
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file.
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads",
            "-L", region_to_gatk(region),
            "-R", dd.get_ref_file(data),
            "-I", data["work_bam"]]
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    else:
        jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
예제 #12
0
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file, recalibrating if configured.
    """
    requires_gatkfull = False
    args = ["-T", "PrintReads", "-L", region_to_gatk(region), "-R", data["sam_ref"], "-I", data["work_bam"]]
    if prep_params["recal"] == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            requires_gatkfull = True
            args += ["-BQSR", data["prep_recal"]]
    elif prep_params["recal"]:
        raise NotImplementedError("Recalibration method %s" % prep_params["recal"])
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    else:
        jvm_opts = broad.get_gatk_framework_opts(data["config"])
        return broad.gatk_cmd("gatk-framework", jvm_opts, prep_params)
예제 #13
0
def _filter_paired(tumor, normal, out_file, reference, data):
    """filter paired vcf file with GATK
    :param    tumor: (str) sample name for tumor
    :param    normal: (str) sample name for normal
    :param    out_file: (str) final vcf file
    :param    reference: (str) genome in fasta format
    :param    data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    in_file = utils.splitext_plus(out_file)[0] + "-tmp.vcf"
    shutil.move(out_file, in_file)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        params = ["-T", "SomaticPindelFilter", "-V", in_file, "-o",
                  tx_out_file, "-TID", tumor, "-NID", normal, "-R", reference]
        jvm_opts = broad.get_gatk_opts(config)
        do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter pindel variants")
    return out_file
예제 #14
0
def concat_variant_files(orig_files, out_file, regions, ref_file, config):
    """Concatenate multiple variant files from regions into a single output file.

    Lightweight approach to merging VCF files split by regions with the same
    sample information, so no complex merging needed. Handles both plain text
    and bgzipped/tabix indexed outputs.

    Falls back to bcftools concat if fails due to GATK stringency issues.
    """
    if not utils.file_exists(out_file):
        sorted_files = _sort_by_region(orig_files, regions, ref_file, config)
        exist_files = [x for x in sorted_files if os.path.exists(x) and vcf_has_variants(x)]
        if len(exist_files) == 0:  # no non-empty inputs, merge the empty ones
            exist_files = [x for x in sorted_files if os.path.exists(x)]
        ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
        input_file_list = "%s-files.list" % utils.splitext_plus(out_file)[0]
        with open(input_file_list, "w") as out_handle:
            for fname in ready_files:
                out_handle.write(fname + "\n")
        failed = False
        with file_transaction(config, out_file) as tx_out_file:
            params = ["org.broadinstitute.gatk.tools.CatVariants",
                      "-R", ref_file,
                      "-V", input_file_list,
                      "-out", tx_out_file,
                      "-assumeSorted"]
            jvm_opts = broad.get_gatk_framework_opts(config, os.path.dirname(tx_out_file), include_gatk=False)
            try:
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Concat variant files", log_error=False)
            except subprocess.CalledProcessError as msg:
                if ("We require all VCFs to have complete VCF headers" in str(msg) or
                      "Features added out of order" in str(msg) or
                      "The reference allele cannot be missing" in str(msg)):
                    os.remove(tx_out_file)
                    failed = True
                else:
                    raise
        if failed:
            return _run_concat_variant_files_bcftools(input_file_list, out_file, config)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    return out_file
예제 #15
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = [
                    ("FixMisencodedBaseQualityReads" if dd.get_quality_format(
                        data, "").lower() == "illumina" else "PrintReads"),
                    "-R", ref_file, "-I", in_bam, "-O", tx_out_file, "-RF",
                    "MatchingBasesAndQualsReadFilter", "-RF",
                    "SeqIsStoredReadFilter", "-RF", "CigarContainsNoNOperator"
                ]
                jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params),
                       "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
예제 #16
0
def genotype_filter(vcf_file, expression, data, name, filterext=""):
    """Perform genotype based filtering using GATK with the provided expression.

    Adds FT tags to genotypes, rather than the general FILTER flag.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(**locals())
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "VariantFiltration",
                      "-R", tz.get_in(["reference", "fasta", "base"], data),
                      "--variant", vcf_file,
                      "--out", tx_out_file,
                      "--genotypeFilterName", name,
                      "--genotypeFilterExpression", "'%s'" % expression]
            jvm_opts = broad.get_gatk_framework_opts(data["config"], os.path.dirname(tx_out_file))
            do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter with expression: %s" % expression)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
예제 #17
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file,
                          "--filter_mismatching_base_and_quals",
                          "--filter_bases_not_stored",
                          "--filter_reads_with_N_cigar"]
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
예제 #18
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = ["-T", "PrintReads",
                          "-R", ref_file,
                          "-I", in_bam,
                          "--out", tx_out_file,
                          "--filter_mismatching_base_and_quals",
                          "--filter_bases_not_stored",
                          "--filter_reads_with_N_cigar"]
                if dd.get_quality_format(data, "").lower() == "illumina":
                    params.append("--fix_misencoded_quality_scores")
                jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk-framework", jvm_opts, params), "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file
예제 #19
0
def _gatk_extract_reads_cl(data, region, prep_params, tmp_dir):
    """Use GATK to extract reads from full BAM file, recalibrating if configured.
    """
    requires_gatkfull = False
    args = [
        "-T", "PrintReads", "-L",
        region_to_gatk(region), "-R", data["sam_ref"], "-I", data["work_bam"]
    ]
    if prep_params["recal"] == "gatk":
        if "prep_recal" in data and _recal_has_reads(data["prep_recal"]):
            requires_gatkfull = True
            args += ["-BQSR", data["prep_recal"]]
    elif prep_params["recal"]:
        raise NotImplementedError("Recalibration method %s" %
                                  prep_params["recal"])
    if requires_gatkfull:
        runner = broad.runner_from_config(data["config"])
        return runner.cl_gatk(args, tmp_dir)
    else:
        jvm_opts = broad.get_gatk_framework_opts(data["config"], tmp_dir)
        return broad.gatk_cmd("gatk-framework", jvm_opts, args)
예제 #20
0
def _filter_bad_reads(in_bam, ref_file, data):
    """Use GATK filter to remove problem reads which choke GATK and Picard.
    """
    bam.index(in_bam, data["config"])
    out_file = "%s-gatkfilter.bam" % os.path.splitext(in_bam)[0]
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmp_dir:
            with file_transaction(data, out_file) as tx_out_file:
                params = [("FixMisencodedBaseQualityReads"
                           if dd.get_quality_format(data, "").lower() == "illumina"
                           else "PrintReads"),
                          "-R", ref_file,
                          "-I", in_bam,
                          "-O", tx_out_file,
                          "-RF", "MatchingBasesAndQualsReadFilter",
                          "-RF", "SeqIsStoredReadFilter",
                          "-RF", "CigarContainsNoNOperator"]
                jvm_opts = broad.get_gatk_opts(data["config"], tmp_dir)
                do.run(broad.gatk_cmd("gatk", jvm_opts, params), "Filter problem reads")
    bam.index(out_file, data["config"])
    return out_file