Ejemplo n.º 1
0
def gatk_snp_cutoff(in_file, data):
    """Soft filter GATK SNP calls with cutoffs based on best-practice recommendations.

    Notes on the expressions used:

    - Mapping quality (MQ) uses a more lenient cutoff than the GATK default.
      The recommended filter (MQ < 40) is too stringent, so 30 is used:
      http://imgur.com/a/oHRVB
      The MQ filter is skipped only for raw Sentieon (haplotyper) gVCFs.
    - QD and FS are not calculated when generating gVCF output, so they (and
      the general filters) are skipped for GATK HaplotypeCaller and Sentieon
      gVCFs:
      https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300
    - HaplotypeScore is only used for callers other than gatk-haplotype and
      haplotyper: GATK HaplotypeCaller (v2.2) appears to have much larger
      HaplotypeScores, resulting in excessive filtering.
    - The GATK best-practice SOR filter (SOR > 3.0) is not used since it has a
      negative impact on sensitivity relative to precision:
      https://github.com/bcbio/bcbio_validations/tree/master/gatk4#na12878-hg38

    The extra sed command removes escaped quotes in the VCF output, which
    pyVCF fails on.
    """
    haplotype_callers = ["gatk-haplotype", "haplotyper"]
    variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    is_gvcf = vcfutils.is_gvcf_file(in_file)

    filters = ["MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    if variantcaller not in haplotype_callers:
        filters.append("HaplotypeScore > 13.0")
    # QD, FS and the general metrics apply unless this is a raw
    # GATK HaplotypeCaller or Sentieon gVCF
    if not (is_gvcf and variantcaller in haplotype_callers):
        filters.extend(["QD < 2.0", "FS > 60.0"])
        filters.extend(_gatk_general())
    # MQ applies unless this is a raw Sentieon gVCF
    if not (is_gvcf and variantcaller in ["haplotyper"]):
        filters.append("MQ < 30.0")

    expression = 'TYPE="snp" && (%s)' % " || ".join(filters)
    strip_escaped_quotes = r"""| sed 's/\\"//g'"""
    return cutoff_w_expression(in_file, expression, data, "GATKCutoffSNP", "SNP",
                               extra_cmd=strip_escaped_quotes)
Ejemplo n.º 2
0
def get_analysis_intervals(data, vrn_file, base_dir):
    """Find the analysis regions for the current variant calling pipeline.

    Sources are tried in order and the first available one is returned:
    callable regions derived from a gVCF ``vrn_file``, the ``ensemble_bed``
    entry, the sample callable file, callable regions calculated from
    ``align_bam``, ``work_bam`` or ``work_bam_callable``, and finally the
    configured ``callable_regions`` or ``variant_regions``.

    Returns None when none of these sources is available.
    """
    from bcbio.bam import callable as bam_callable

    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed

    ensemble_bed = data.get("ensemble_bed")
    if ensemble_bed:
        return ensemble_bed

    sample_callable = dd.get_sample_callable(data)
    if sample_callable:
        return sample_callable

    # Calculate callable regions from the first BAM file present
    for bam_key in ("align_bam", "work_bam"):
        if data.get(bam_key):
            return bam_callable.sample_callable_bed(data[bam_key], dd.get_ref_file(data), data)[0]

    if data.get("work_bam_callable"):
        # Work on a copy so the caller's data is not modified by the key rename
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return bam_callable.sample_callable_bed(data["work_bam"], dd.get_ref_file(data), data)[0]

    for region_key in ("callable_regions", "variant_regions"):
        regions = tz.get_in(["config", "algorithm", region_key], data)
        if regions:
            return regions
    return None
Ejemplo n.º 3
0
def run(call_file, ref_file, vrn_files, data):
    """Filter the input call file, treating SNPs and indels separately.

    When VQSR is in use, SNPs and indels are split into separate files,
    passed through ``_variant_filtration`` and combined into a single output.
    Otherwise the SNP and indel cutoff-based filters are applied in sequence
    to the full call file.

    Raises ValueError if VQSR is requested for a gVCF call file.
    """
    algorithm = data["config"]["algorithm"]
    algs = [algorithm] * len(data.get("vrn_files", [1]))

    if not config_utils.use_vqsr(algs):
        # Cutoff-based filtering: SNPs first, then indels on that output
        snp_filtered = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filtered, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered_files = [
        _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    combined_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered_files)
    return vcfutils.combine_variant_files(filtered_files, combined_name, ref_file,
                                          data["config"])
Ejemplo n.º 4
0
def gatk_snp_cutoff(in_file, data):
    """Soft filter GATK SNP calls with cutoffs based on best-practice recommendations.

    Notes on the expressions used:

    - Mapping quality (MQ) uses a more lenient cutoff than the GATK default.
      The recommended filter (MQ < 40) is too stringent, so 30 is used:
      http://imgur.com/a/oHRVB
    - QD and FS are not calculated when generating gVCF output, so they are
      skipped, together with the general and MQ filters, for raw GATK
      HaplotypeCaller and Sentieon gVCFs:
      https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300
    - HaplotypeScore is only used for callers other than gatk-haplotype and
      haplotyper: GATK HaplotypeCaller (v2.2) appears to have much larger
      HaplotypeScores, resulting in excessive filtering.
    - The GATK best-practice SOR filter (SOR > 3.0) is not used since it has a
      negative impact on sensitivity relative to precision:
      https://github.com/bcbio/bcbio_validations/tree/master/gatk4#na12878-hg38

    The extra sed command removes escaped quotes in the VCF output, which
    pyVCF fails on.
    """
    variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    haplotype_caller_used = variantcaller in ["gatk-haplotype", "haplotyper"]

    filters = ["MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    if not haplotype_caller_used:
        filters.append("HaplotypeScore > 13.0")

    raw_gvcf = vcfutils.is_gvcf_file(in_file) and haplotype_caller_used
    if not raw_gvcf:
        # Metrics that are only usable outside of raw HaplotypeCaller/Sentieon gVCFs
        filters.extend(["QD < 2.0", "FS > 60.0"])
        filters.extend(_gatk_general())
        filters.append("MQ < 30.0")

    expression = 'TYPE="snp" && (%s)' % " || ".join(filters)
    return cutoff_w_expression(in_file, expression, data, "GATKCutoffSNP", "SNP",
                               extra_cmd=r"""| sed 's/\\"//g'""")
Ejemplo n.º 5
0
def use_vqsr(algs, call_file=None):
    """Determine if processing uses GATK's Variant Quality Score Recalibration.

    Args:
        algs: iterable of algorithm configuration dictionaries, one per
            sample. The keys read are ``variantcaller``, ``tools_on``,
            ``tools_off`` and ``coverage_interval``.
        call_file: optional path to the variant call file. When it is a gVCF,
            VQSR is not attempted unless explicitly enabled via ``tools_on``.

    Returns:
        True if a VQSR-capable caller is in use and either a genome coverage
        interval is present (or VQSR is forced with ``tools_on``) or at least
        50 samples support VQSR; False otherwise.
    """
    # vcfutils is only needed to inspect call_file, so skip the import without one
    if call_file:
        from bcbio.variation import vcfutils
    # basestring only exists on Python 2; fall back to str on Python 3
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str
    vqsr_callers = {"gatk", "gatk-haplotype"}
    vqsr_sample_thresh = 50
    vqsr_supported = collections.defaultdict(int)
    coverage_intervals = set()
    for alg in algs:
        callers = alg.get("variantcaller")
        if isinstance(callers, string_types):
            callers = [callers]
        if not callers:  # no variant calling, no VQSR
            continue
        if "vqsr" in (alg.get("tools_off") or []):  # VQSR turned off
            continue
        for c in callers:
            if c in vqsr_callers:
                if "vqsr" in (alg.get("tools_on") or []):  # VQSR turned on
                    vqsr_supported[c] += 1
                    coverage_intervals.add("genome")
                # Do not try VQSR for gVCF inputs
                elif call_file and vcfutils.is_gvcf_file(call_file):
                    pass
                else:
                    # Treat an explicit empty/None coverage_interval like a missing one
                    coverage_intervals.add((alg.get("coverage_interval") or "exome").lower())
                    vqsr_supported[c] += 1
    if vqsr_supported:
        num_samples = max(vqsr_supported.values())
        if "genome" in coverage_intervals or num_samples >= vqsr_sample_thresh:
            return True
    return False
Ejemplo n.º 6
0
def run(call_file, ref_file, vrn_files, data):
    """Filter the input call file, treating SNPs and indels separately.

    Variants with missing alts are removed first when the data includes them.
    Filtering then uses one of three approaches:

    - the CNN filter, when ``gatkcnn`` is in the enabled tools;
    - VQSR, splitting SNPs and indels into separate files, filtering each
      and combining the results;
    - cutoff-based filters, applied to SNPs and then indels.

    Raises ValueError if VQSR is requested for a gVCF call file.
    """
    algorithm = data["config"]["algorithm"]
    algs = [algorithm] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)

    if not config_utils.use_vqsr(algs, call_file):
        # Cutoff-based filtering: SNPs first, then indels on that output
        snp_filtered = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filtered, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered_files = [
        _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    combined_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered_files)
    return vcfutils.combine_variant_files(filtered_files, combined_name, ref_file, data["config"])
Ejemplo n.º 7
0
def use_vqsr(algs, call_file=None):
    """Determine if processing uses GATK's Variant Quality Score Recalibration.

    Each algorithm configuration in ``algs`` is checked for a VQSR-capable
    caller (``gatk`` or ``gatk-haplotype``), skipping configurations without
    variant calling or with ``vqsr`` in ``tools_off``. A configuration with
    ``vqsr`` in ``tools_on`` counts as genome coverage. gVCF call files are
    otherwise not considered for VQSR.

    Returns True when a supported caller is found and either a genome
    coverage interval was seen or at least 50 samples support VQSR.
    """
    from bcbio.variation import vcfutils
    vqsr_callers = set(["gatk", "gatk-haplotype"])
    vqsr_sample_thresh = 50
    supported_counts = collections.defaultdict(int)
    intervals_seen = set([])
    for alg in algs:
        callers = alg.get("variantcaller")
        if isinstance(callers, six.string_types):
            callers = [callers]
        # Skip when there is no variant calling or VQSR is turned off
        if not callers or "vqsr" in (alg.get("tools_off") or []):
            continue
        for caller in callers:
            if caller not in vqsr_callers:
                continue
            if "vqsr" in (alg.get("tools_on") or []):
                # VQSR explicitly turned on
                interval = "genome"
            elif call_file and vcfutils.is_gvcf_file(call_file):
                # Do not try VQSR for gVCF inputs
                continue
            else:
                interval = alg.get("coverage_interval", "exome").lower()
            intervals_seen.add(interval)
            supported_counts[caller] += 1
    if not supported_counts:
        return False
    most_samples = max(supported_counts.values())
    return "genome" in intervals_seen or most_samples >= vqsr_sample_thresh
Ejemplo n.º 8
0
def get_analysis_intervals(data, vrn_file, base_dir):
    """Find the analysis regions for the current variant calling pipeline.

    Sources are tried in order and the first available one is returned:
    callable regions derived from a gVCF ``vrn_file``, the ``ensemble_bed``
    entry, the sample callable file, callable regions calculated from
    ``align_bam``, ``work_bam`` or ``work_bam_callable``, and finally the
    configured ``callable_regions`` or ``variant_regions``.

    Returns None when none of these sources is available.
    """
    from bcbio.bam import callable as bam_callable

    def _callable_from_bam(bam_file, sample_data):
        # The first item returned by sample_callable_bed is the value passed back
        return bam_callable.sample_callable_bed(bam_file, dd.get_ref_file(sample_data), sample_data)[0]

    if vrn_file and vcfutils.is_gvcf_file(vrn_file):
        gvcf_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if gvcf_bed:
            return gvcf_bed

    ensemble_bed = data.get("ensemble_bed")
    if ensemble_bed:
        return ensemble_bed

    sample_callable = dd.get_sample_callable(data)
    if sample_callable:
        return sample_callable

    if data.get("align_bam"):
        return _callable_from_bam(data["align_bam"], data)
    if data.get("work_bam"):
        return _callable_from_bam(data["work_bam"], data)
    if data.get("work_bam_callable"):
        # Work on a copy so the caller's data is not modified by the key rename
        data = utils.deepish_copy(data)
        data["work_bam"] = data.pop("work_bam_callable")
        return _callable_from_bam(data["work_bam"], data)

    callable_regions = tz.get_in(["config", "algorithm", "callable_regions"], data)
    if callable_regions:
        return callable_regions
    variant_regions = tz.get_in(["config", "algorithm", "variant_regions"], data)
    if variant_regions:
        return variant_regions
    return None
Ejemplo n.º 9
0
def gatk_indel_cutoff(in_file, data):
    """Soft filter GATK indel calls with cutoffs based on best-practice recommendations.

    ReadPosRankSum is always checked. The QD, FS, SOR and general filters
    are added unless the input is a raw GATK HaplotypeCaller or Sentieon gVCF.
    The extra sed command removes escaped quotes from the output.
    """
    variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    raw_gvcf = vcfutils.is_gvcf_file(in_file) and variantcaller in ["gatk-haplotype", "haplotyper"]

    filters = ["ReadPosRankSum < -20.0"]
    if not raw_gvcf:
        filters.extend(["QD < 2.0", "FS > 200.0", "SOR > 10.0"])
        filters.extend(_gatk_general())

    expression = 'TYPE="indel" && (%s)' % " || ".join(filters)
    return cutoff_w_expression(in_file, expression, data, "GATKCutoffIndel", "INDEL",
                               extra_cmd=r"""| sed 's/\\"//g'""")
Ejemplo n.º 10
0
def gatk_indel_cutoff(in_file, data):
    """Soft filter GATK indel calls with cutoffs based on best-practice recommendations.

    The ReadPosRankSum check always applies. Raw GATK HaplotypeCaller and
    Sentieon gVCFs get no further checks; all other inputs are also filtered
    on QD, FS, SOR and the general metrics. The extra sed command removes
    escaped quotes from the output.
    """
    gvcf_callers = ["gatk-haplotype", "haplotyper"]
    variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    filters = ["ReadPosRankSum < -20.0"]
    if not vcfutils.is_gvcf_file(in_file) or variantcaller not in gvcf_callers:
        for metric_filter in ("QD < 2.0", "FS > 200.0", "SOR > 10.0"):
            filters.append(metric_filter)
        filters.extend(_gatk_general())
    joined_filters = " || ".join(filters)
    return cutoff_w_expression(in_file, 'TYPE="indel" && (%s)' % joined_filters, data,
                               "GATKCutoffIndel", "INDEL",
                               extra_cmd=r"""| sed 's/\\"//g'""")
Ejemplo n.º 11
0
def platypus(in_file, data):
    """Filter Platypus calls with a depth and quality based filter in place of Q20.

    Platypus uses its own VCF nomenclature: TC == DP, FR == AF.

    Region limiting is skipped for gVCF inputs: Platypus gVCF output appears
    to have an 0/1 index problem, so the reference block regions are 1 base
    outside the regions of interest.
    """
    if vcfutils.is_gvcf_file(in_file):
        limit_regions = None
    else:
        limit_regions = "variant_regions"
    depth_qual_expression = ('(FR[0] <= 0.5 && TC < 4 && %QUAL < 20) || '
                             '(TC < 13 && %QUAL < 10) || '
                             '(FR[0] > 0.5 && TC < 4 && %QUAL < 50)')
    # Replace the Q20 filter set by Platypus with PASS
    q20_to_pass = "| sed 's/\\tQ20\\t/\\tPASS\\t/'"
    return cutoff_w_expression(in_file, depth_qual_expression, data, name="PlatQualDepth",
                               extra_cmd=q20_to_pass, limit_regions=limit_regions)
Ejemplo n.º 12
0
def platypus(in_file, data):
    """Filter Platypus calls, swapping the Q20 filter for a depth and quality based one.

    Platypus uses its own VCF nomenclature: TC == DP, FR == AF.

    gVCF inputs are filtered without limiting to variant regions, since
    Platypus gVCF output appears to have an 0/1 index problem that places
    reference block regions 1 base outside the regions of interest.
    """
    filter_kwargs = {
        "name": "PlatQualDepth",
        # Replace the Q20 filter set by Platypus with PASS
        "extra_cmd": "| sed 's/\\tQ20\\t/\\tPASS\\t/'",
        "limit_regions": None if vcfutils.is_gvcf_file(in_file) else "variant_regions",
    }
    filters = ('(FR[0] <= 0.5 && TC < 4 && %QUAL < 20) || '
               '(TC < 13 && %QUAL < 10) || '
               '(FR[0] > 0.5 && TC < 4 && %QUAL < 50)')
    return cutoff_w_expression(in_file, filters, data, **filter_kwargs)
Ejemplo n.º 13
0
def _add_dbsnp(orig_file, dbsnp_file, data, out_file=None, post_cl=None):
    """Annotate a VCF file with dbSNP using vcfanno.

    Adds rsIDs to NON_REF positions for gVCF inputs, but requires strict
    allele matching for non-gVCF.

    The input is bgzipped and indexed first. When ``out_file`` is not given
    it is named after the input with a ``-wdbsnp.vcf.gz`` suffix. ``post_cl``
    is an optional command inserted into the pipeline between vcfanno and
    bgzip. Returns the bgzipped and indexed output file.
    """
    orig_file = vcfutils.bgzip_and_index(orig_file, data["config"])
    if out_file is None:
        out_file = "%s-wdbsnp.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if utils.file_uptodate(out_file, orig_file):
        return vcfutils.bgzip_and_index(out_file, data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        # Write the vcfanno configuration next to the final output file
        conf_file = os.path.join(os.path.dirname(out_file), "dbsnp.conf")
        with open(conf_file, "w") as out_handle:
            dbsnp_path = os.path.normpath(os.path.join(dd.get_work_dir(data), dbsnp_file))
            out_handle.write(_DBSNP_TEMPLATE % dbsnp_path)
        post_cl = post_cl or ""
        cores = dd.get_num_cores(data)
        if vcfutils.is_gvcf_file(orig_file):
            opts = " -permissive-overlap"
        else:
            opts = ""
        cmd = ("vcfanno -p {cores}{opts} {conf_file} {orig_file} | {post_cl} "
               "bgzip -c > {tx_out_file}")
        do.run(cmd.format(cores=cores, opts=opts, conf_file=conf_file, orig_file=orig_file,
                          post_cl=post_cl, tx_out_file=tx_out_file),
               "Annotate with dbSNP")
    return vcfutils.bgzip_and_index(out_file, data["config"])