Example #1
0
def variant_filtration(call_file, ref_file, vrn_files, data):
    """Dispatch post-calling filtration based on the configured variant caller.

    The caller name is read from ``data["config"]["algorithm"]["variantcaller"]``
    and the input calls are first passed through ``ploidy.filter_vcf_by_sex``.
    After that:

    - ``freebayes`` calls are handed to ``vfilter.freebayes``.
    - ``samtools``, ``varscan`` and ``mutect`` calls are returned unchanged.
    - Calls from any other caller are split into SNPs and indels, each part is
      filtered separately and the two results are combined into a single file
      named ``<common prefix of both filtered files>combined.vcf``.

    Returns whatever the selected branch produces (the sex-filtered call file
    itself in the pass-through case).
    """
    config = data["config"]
    caller = config["algorithm"].get("variantcaller")
    call_file = ploidy.filter_vcf_by_sex(call_file, data)

    if caller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, data)
    if caller in ("samtools", "varscan", "mutect"):
        # no additional filtration for callers that filter as part of call process
        return call_file

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered = [
        _variant_filtration_snp(snps, ref_file, vrn_files, data),
        _variant_filtration_indel(indels, ref_file, vrn_files, data),
    ]
    combined_name = "{base}combined.vcf".format(
        base=os.path.commonprefix(filtered))
    return vcfutils.combine_variant_files(filtered, combined_name, ref_file,
                                          config)
Example #2
0
def run(call_file, ref_file, vrn_files, data):
    """Filter the input call file, treating SNPs and indels separately.

    When ``config_utils.use_vqsr`` reports that VQSR applies, the calls are
    split into SNP and indel files, each is run through ``_variant_filtration``
    (with the matching cutoff filter function passed along) and the two
    outputs are combined into ``<common prefix>combined.vcf.gz``.

    Otherwise the SNP cutoff filter and then the indel cutoff filter are
    applied in sequence to the unsplit file.

    Raises ValueError when VQSR would run while ``gvcf`` is in ``tools_on``
    and no joint caller is configured.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))

    if not config_utils.use_vqsr(algs):
        # Cutoff-based path: chain SNP then indel filtering on the same file.
        after_snps = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(after_snps, data)

    forcing_gvcf = "gvcf" in dd.get_tools_on(data)
    if forcing_gvcf and not dd.get_jointcaller(data):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file,
                                              data["config"])
    filtered = [
        _variant_filtration(snps, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indels, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, merged_name, ref_file,
                                          data["config"])
Example #3
0
def run(call_file, ref_file, vrn_files, data):
    """Filter the input call file, treating SNPs and indels separately.

    Steps, in order:

    1. If ``includes_missingalt(data)`` is true, the call file is replaced by
       the output of ``gatk_remove_missingalt``.
    2. If ``gatkcnn`` is in ``tools_on``, the result of ``_cnn_filter`` is
       returned.
    3. If ``config_utils.use_vqsr`` reports that VQSR applies, the calls are
       split into SNPs and indels, each part goes through
       ``_variant_filtration`` and the outputs are combined into
       ``<common prefix>combined.vcf.gz``.
    4. Otherwise the SNP and indel cutoff filters are applied in sequence.

    Raises ValueError when VQSR would run on a gVCF input file.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)

    if not config_utils.use_vqsr(algs, call_file):
        # Cutoff-based path: chain SNP then indel filtering on the same file.
        after_snps = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(after_snps, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snps, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indels, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, merged_name, ref_file, data["config"])
Example #4
0
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR, need to split the file to apply: the SNP and indel parts each go
    through ``_variant_filtration`` (with the matching hard filter function
    passed along), are combined into ``<common prefix>combined.vcf.gz`` and the
    combined file is passed through ``_filter_nonref``.

    For hard filters can run on the original file: the SNP hard filter and then
    the indel hard filter are applied in sequence. The result is passed through
    ``_filter_nonref`` unless ``gvcf`` is in ``tools_on``.

    Raises AssertionError when VQSR applies while ``gvcf`` is in ``tools_on``.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        # Raised explicitly instead of using an `assert` statement so that the
        # configuration check is not stripped when running under `python -O`.
        # The exception type is unchanged for existing callers.
        if "gvcf" in dd.get_tools_on(data):
            raise AssertionError(
                "Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(
            call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files,
                                              data, "SNP",
                                              vfilter.gatk_snp_hard)
        indel_filter_file = _variant_filtration(indel_file, ref_file,
                                                vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_hard)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file,
                                                       ref_file,
                                                       data["config"])
        return _filter_nonref(combined_file, data)

    snp_filter = vfilter.gatk_snp_hard(call_file, data)
    indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
    if "gvcf" in dd.get_tools_on(data):
        # Forced gVCF output skips the `_filter_nonref` step.
        return indel_filter
    return _filter_nonref(indel_filter, data)
Example #5
0
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Dispatch post-calling filtration based on the configured variant caller.

    A runner is built with ``broad.runner_from_config`` and the caller name is
    read from ``config["algorithm"]["variantcaller"]``. Then:

    - ``gatk-haplotype`` calls, unless ``_no_vqsr(config)`` is true, use the
      combined SNP/indel filtering in ``_variant_filtration_both``.
    - ``freebayes`` calls are handed to ``filter_freebayes``.
    - ``samtools`` and ``varscan`` calls are returned unchanged.
    - Anything else is split into SNPs and indels, each part is filtered
      separately and the results are combined into
      ``<common prefix of both filtered files>combined.vcf``.

    Returns whatever the selected branch produces (the input call file itself
    in the pass-through case).
    """
    broad_runner = broad.runner_from_config(config)
    caller = config["algorithm"].get("variantcaller")

    if caller == "gatk-haplotype" and not _no_vqsr(config):
        return _variant_filtration_both(broad_runner, call_file, ref_file,
                                        vrn_files, config)
    if caller == "freebayes":
        return filter_freebayes(broad_runner, call_file, ref_file, vrn_files,
                                config)
    if caller in ("samtools", "varscan"):
        # no additional filtration for callers that filter as part of call process
        return call_file

    snps, indels = vcfutils.split_snps_indels(broad_runner, call_file,
                                              ref_file)
    filtered = [
        _variant_filtration_snp(broad_runner, snps, ref_file, vrn_files,
                                config),
        _variant_filtration_indel(broad_runner, indels, ref_file, vrn_files,
                                  config),
    ]
    combined_name = "{base}combined.vcf".format(
        base=os.path.commonprefix(filtered))
    return vcfutils.combine_variant_files(filtered, combined_name, ref_file,
                                          config)
Example #6
0
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR, need to split the file to apply: the SNP and indel parts each go
    through ``_variant_filtration`` (with the matching hard filter function
    passed along), are combined into ``<common prefix>combined.vcf.gz`` and the
    combined file is passed through ``_filter_nonref``.

    For hard filters can run on the original file: the SNP hard filter and then
    the indel hard filter are applied in sequence. The result is passed through
    ``_filter_nonref`` unless ``gvcf`` is in ``tools_on``.

    Raises AssertionError when VQSR applies while ``gvcf`` is in ``tools_on``.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        # Raised explicitly instead of using an `assert` statement so that the
        # configuration check is not stripped when running under `python -O`.
        # The exception type is unchanged for existing callers.
        if "gvcf" in dd.get_tools_on(data):
            raise AssertionError(
                "Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                                              vfilter.gatk_snp_hard)
        indel_filter_file = _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_hard)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
        return _filter_nonref(combined_file, data)

    snp_filter = vfilter.gatk_snp_hard(call_file, data)
    indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
    if "gvcf" in dd.get_tools_on(data):
        # Forced gVCF output skips the `_filter_nonref` step.
        return indel_filter
    return _filter_nonref(indel_filter, data)
def run(call_file, ref_file, vrn_files, data):
    """Split the calls into SNPs and indels, filter each part, and recombine.

    Each part goes through ``_variant_filtration`` with its variant type label
    and the matching hard filter function. The two outputs are combined into
    ``<common prefix of both filtered files>combined.vcf.gz`` and the result of
    ``vcfutils.combine_variant_files`` is returned.
    """
    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snps, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_hard),
        _variant_filtration(indels, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_hard),
    ]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, merged_name, ref_file, data["config"])
Example #8
0
def run(call_file, ref_file, vrn_files, data):
    """Split the calls into SNPs and indels, filter each part, and recombine.

    Each part goes through ``_variant_filtration`` with its variant type label
    and the matching hard filter function. The two outputs are combined into
    ``<common prefix of both filtered files>combined.vcf.gz``, and the combined
    file is passed through ``_filter_nonref`` before being returned.
    """
    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snps, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_hard),
        _variant_filtration(indels, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_hard),
    ]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    merged = vcfutils.combine_variant_files(filtered, merged_name, ref_file, data["config"])
    return _filter_nonref(merged, data)
 def test_3_vcf_split_combine(self):
     """Round-trip a VCF: split it into SNP and indel files, then merge them.

     The configuration is loaded inside a ``make_workdir()`` context from the
     post-process YAML under ``self.automated_dir``, with its ``algorithm``
     section reset to an empty dict. All three produced files are removed at
     the end with ``self._remove_vcf``.
     """
     with make_workdir() as workdir:
         yaml_file = get_post_process_yaml(self.automated_dir, workdir)
         config = load_config(yaml_file)
         config["algorithm"] = {}
     ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
     in_vcf = os.path.join(self.var_dir, "S1-variants.vcf")
     snps, indels = vcfutils.split_snps_indels(in_vcf, ref_file, config)
     # Merged output sits next to the input: <base>-merge<ext>.gz
     base, ext = os.path.splitext(in_vcf)
     merged = "%s-merge%s.gz" % (base, ext)
     vcfutils.combine_variant_files([snps, indels], merged, ref_file, config)
     for produced in (snps, indels, merged):
         self._remove_vcf(produced)
Example #10
0
 def test_3_vcf_split_combine(self):
     """Round-trip a VCF: split it into SNP and indel files, then merge them.

     The configuration is loaded inside a ``make_workdir()`` context from the
     post-process YAML under ``self.data_dir``, with its ``algorithm`` section
     reset to an empty dict. All three produced files are removed at the end
     with ``self._remove_vcf``.
     """
     with make_workdir() as workdir:
         yaml_file = get_post_process_yaml(self.data_dir, workdir)
         config = load_config(yaml_file)
         config["algorithm"] = {}
     ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
     in_vcf = os.path.join(self.var_dir, "S1-variants.vcf")
     snps, indels = vcfutils.split_snps_indels(in_vcf, ref_file, config)
     # Merged output sits next to the input: <base>-merge<ext>.gz
     base, ext = os.path.splitext(in_vcf)
     merged = "%s-merge%s.gz" % (base, ext)
     vcfutils.combine_variant_files([snps, indels], merged, ref_file, config)
     for produced in (snps, indels, merged):
         self._remove_vcf(produced)
Example #11
0
 def test_3_vcf_split_combine(self, global_config):
     """Round-trip a VCF: split it into SNP and indel files, then merge them.

     The configuration is loaded from ``global_config`` and its ``algorithm``
     section is reset to an empty dict. ``self.ref_file`` is used as the
     reference. All three produced files are removed at the end with
     ``self._remove_vcf``.
     """
     from bcbio.variation import vcfutils
     config = load_config(global_config)
     config["algorithm"] = {}
     in_vcf = os.path.join(self.var_dir, "S1-variants.vcf")
     snps, indels = vcfutils.split_snps_indels(in_vcf, self.ref_file, config)
     # Merged output sits next to the input: <base>-merge<ext>.gz
     base, ext = os.path.splitext(in_vcf)
     merged = "%s-merge%s.gz" % (base, ext)
     vcfutils.combine_variant_files([snps, indels], merged, self.ref_file,
                                    config)
     for produced in (snps, indels, merged):
         self._remove_vcf(produced)
Example #12
0
 def test_3_vcf_split_combine(self):
     """Round-trip a VCF: split it into SNP and indel files, then merge them.

     The configuration is loaded inside a ``make_workdir()`` context from the
     post-process YAML under ``self.automated_dir``, with its ``algorithm``
     section reset to an empty dict. ``self.ref_file`` is used as the
     reference. All three produced files are removed at the end with
     ``self._remove_vcf``.
     """
     from bcbio.variation import vcfutils
     with make_workdir() as workdir:
         yaml_file = get_post_process_yaml(self.automated_dir, workdir)
         config = load_config(yaml_file)
         config["algorithm"] = {}
     in_vcf = os.path.join(self.var_dir, "S1-variants.vcf")
     snps, indels = vcfutils.split_snps_indels(in_vcf, self.ref_file, config)
     # Merged output sits next to the input: <base>-merge<ext>.gz
     base, ext = os.path.splitext(in_vcf)
     merged = "%s-merge%s.gz" % (base, ext)
     vcfutils.combine_variant_files([snps, indels], merged, self.ref_file,
                                    config)
     for produced in (snps, indels, merged):
         self._remove_vcf(produced)
Example #13
0
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Dispatch post-calling filtration based on the configured variant caller.

    The caller name is read from ``config["algorithm"]["variantcaller"]``:

    - ``freebayes`` calls are handed to ``vfilter.freebayes``.
    - ``samtools`` and ``varscan`` calls are returned unchanged.
    - Anything else is split into SNPs and indels, each part is filtered
      separately and the results are combined into
      ``<common prefix of both filtered files>combined.vcf``.

    Returns whatever the selected branch produces (the input call file itself
    in the pass-through case).
    """
    caller = config["algorithm"].get("variantcaller")

    if caller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, config)
    if caller in ("samtools", "varscan"):
        # no additional filtration for callers that filter as part of call process
        return call_file

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered = [
        _variant_filtration_snp(snps, ref_file, vrn_files, config),
        _variant_filtration_indel(indels, ref_file, vrn_files, config),
    ]
    combined_name = "{base}combined.vcf".format(base=os.path.commonprefix(filtered))
    return vcfutils.combine_variant_files(filtered, combined_name, ref_file, config)