def variant_filtration(call_file, ref_file, vrn_files, data):
    """Apply caller-appropriate filtering to a file of variant calls.

    The calls are first passed through ``ploidy.filter_vcf_by_sex``. What
    happens next depends on the configured ``variantcaller``:

    - ``freebayes``: delegate to ``vfilter.freebayes``.
    - ``samtools``, ``varscan``, ``mutect``: return the calls as they are.
    - anything else: split into SNP and indel files, filter each one with
      its dedicated helper and merge the results into a single
      ``<common prefix>combined.vcf`` file.

    Returns the path produced by whichever branch was taken.
    """
    variantcaller = data["config"]["algorithm"].get("variantcaller")
    call_file = ploidy.filter_vcf_by_sex(call_file, data)

    if variantcaller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, data)

    # These callers already filter while calling, so nothing more to do.
    if variantcaller in ("samtools", "varscan", "mutect"):
        return call_file

    config = data["config"]
    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration_snp(snps, ref_file, vrn_files, data)
    filtered_indels = _variant_filtration_indel(indels, ref_file, vrn_files,
                                                data)
    to_merge = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(to_merge)
    merged_name = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(to_merge, merged_name, ref_file,
                                          config)
def run(call_file, ref_file, vrn_files, data):
    """Filter a call file, treating SNPs and indels as separate classes.

    When ``config_utils.use_vqsr`` declines VQSR, the SNP cutoff filter and
    then the indel cutoff filter are applied in sequence to the whole file.
    Otherwise the file is split into SNPs and indels, each part goes through
    ``_variant_filtration`` and the two outputs are merged into
    ``<common prefix>combined.vcf.gz``.

    Raises:
        ValueError: VQSR is selected while ``gvcf`` is in tools_on and no
            joint caller is configured.
    """
    config = data["config"]
    algs = [config["algorithm"]] * len(data.get("vrn_files", [1]))

    if not config_utils.use_vqsr(algs):
        # Cutoff filters chain: the indel filter consumes the SNP-filtered file.
        after_snps = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(after_snps, data)

    if "gvcf" in dd.get_tools_on(data) and not dd.get_jointcaller(data):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration(snps, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_cutoff)
    filtered_indels = _variant_filtration(indels, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_cutoff)
    parts = [filtered_snps, filtered_indels]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(parts)
    return vcfutils.combine_variant_files(parts, merged_name, ref_file,
                                          config)
def run(call_file, ref_file, vrn_files, data):
    """Filter a call file, treating SNPs and indels as separate classes.

    Steps, in order:

    1. If ``includes_missingalt(data)`` is true, replace the call file with
       the output of ``gatk_remove_missingalt``.
    2. If ``gatkcnn`` is in tools_on, return the result of ``_cnn_filter``.
    3. If ``config_utils.use_vqsr`` selects VQSR, split into SNPs and indels,
       run ``_variant_filtration`` on each and merge the outputs into
       ``<common prefix>combined.vcf.gz``.
    4. Otherwise apply the SNP cutoff filter followed by the indel cutoff
       filter to the whole file.

    Raises:
        ValueError: VQSR is selected but the call file is a gVCF.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))

    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)

    if not config_utils.use_vqsr(algs, call_file):
        # Cutoff filters chain: the indel filter consumes the SNP-filtered file.
        after_snps = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(after_snps, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file,
                                              data["config"])
    filtered_snps = _variant_filtration(snps, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_cutoff)
    filtered_indels = _variant_filtration(indels, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_cutoff)
    parts = [filtered_snps, filtered_indels]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(parts)
    return vcfutils.combine_variant_files(parts, merged_name, ref_file,
                                          data["config"])
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR the file is split into SNP and indel files, each is passed to
    ``_variant_filtration`` and the results are recombined into
    ``<common prefix>combined.vcf.gz`` before ``_filter_nonref`` is applied.
    For hard filters the SNP and indel hard filters run in sequence on the
    original file; ``_filter_nonref`` is then applied unless ``gvcf`` is in
    tools_on.

    Raises:
        ValueError: VQSR is selected while ``gvcf`` is in tools_on.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        # Explicit raise instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let this invalid setup through.
        if "gvcf" in dd.get_tools_on(data):
            raise ValueError("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(
            call_file, ref_file, data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files,
                                              data, "SNP",
                                              vfilter.gatk_snp_hard)
        indel_filter_file = _variant_filtration(indel_file, ref_file,
                                                vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_hard)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(
            orig_files, out_file, ref_file, data["config"])
        return _filter_nonref(combined_file, data)

    # Hard filters chain: the indel filter consumes the SNP-filtered file.
    snp_filter = vfilter.gatk_snp_hard(call_file, data)
    indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
    if "gvcf" not in dd.get_tools_on(data):
        return _filter_nonref(indel_filter, data)
    # With gVCF output requested, the non-ref filtering step is skipped.
    return indel_filter
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Apply caller-appropriate filtering to a file of variant calls.

    A runner is built from ``config`` and handed to every helper. Dispatch
    on the configured ``variantcaller``:

    - ``gatk-haplotype`` (unless ``_no_vqsr(config)`` is true): combined
      SNP/indel filtering via ``_variant_filtration_both``.
    - ``freebayes``: delegate to ``filter_freebayes``.
    - ``samtools``, ``varscan``: return the calls as they are.
    - anything else: split into SNP and indel files, filter each one and
      merge the results into ``<common prefix>combined.vcf``.
    """
    runner = broad.runner_from_config(config)
    variantcaller = config["algorithm"].get("variantcaller")

    if variantcaller == "gatk-haplotype" and not _no_vqsr(config):
        return _variant_filtration_both(runner, call_file, ref_file,
                                        vrn_files, config)

    if variantcaller == "freebayes":
        return filter_freebayes(runner, call_file, ref_file, vrn_files,
                                config)

    # These callers already filter while calling, so nothing more to do.
    if variantcaller in ("samtools", "varscan"):
        return call_file

    snps, indels = vcfutils.split_snps_indels(runner, call_file, ref_file)
    filtered_snps = _variant_filtration_snp(runner, snps, ref_file,
                                            vrn_files, config)
    filtered_indels = _variant_filtration_indel(runner, indels, ref_file,
                                                vrn_files, config)
    to_merge = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(to_merge)
    merged_name = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(to_merge, merged_name, ref_file,
                                          config)
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR the file is split into SNP and indel files, each is passed to
    ``_variant_filtration`` and the results are recombined into
    ``<common prefix>combined.vcf.gz`` before ``_filter_nonref`` is applied.
    For hard filters the SNP and indel hard filters run in sequence on the
    original file; ``_filter_nonref`` is then applied unless ``gvcf`` is in
    tools_on.

    Raises:
        ValueError: VQSR is selected while ``gvcf`` is in tools_on.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        # Explicit raise instead of ``assert``: assertions are stripped when
        # Python runs with -O, which would let this invalid setup through.
        if "gvcf" in dd.get_tools_on(data):
            raise ValueError("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file,
                                                          data["config"])
        snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files,
                                              data, "SNP",
                                              vfilter.gatk_snp_hard)
        indel_filter_file = _variant_filtration(indel_file, ref_file,
                                                vrn_files, data, "INDEL",
                                                vfilter.gatk_indel_hard)
        orig_files = [snp_filter_file, indel_filter_file]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file,
                                                       ref_file,
                                                       data["config"])
        return _filter_nonref(combined_file, data)

    # Hard filters chain: the indel filter consumes the SNP-filtered file.
    snp_filter = vfilter.gatk_snp_hard(call_file, data)
    indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
    if "gvcf" not in dd.get_tools_on(data):
        return _filter_nonref(indel_filter, data)
    # With gVCF output requested, the non-ref filtering step is skipped.
    return indel_filter
def run(call_file, ref_file, vrn_files, data):
    """Filter a call file, treating SNPs and indels as separate classes.

    The input is split into a SNP file and an indel file, each is passed
    through ``_variant_filtration`` together with its hard-filter function,
    and the two outputs are merged into ``<common prefix>combined.vcf.gz``.

    Returns the value of ``vcfutils.combine_variant_files``.
    """
    config = data["config"]
    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration(snps, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard)
    filtered_indels = _variant_filtration(indels, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_hard)
    parts = [filtered_snps, filtered_indels]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(parts)
    return vcfutils.combine_variant_files(parts, merged_name, ref_file,
                                          data["config"])
def run(call_file, ref_file, vrn_files, data):
    """Filter a call file, treating SNPs and indels as separate classes.

    The input is split into a SNP file and an indel file, each is passed
    through ``_variant_filtration`` together with its hard-filter function,
    and the two outputs are merged into ``<common prefix>combined.vcf.gz``.
    The merged file is finally handed to ``_filter_nonref``, whose result is
    returned.
    """
    config = data["config"]
    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration(snps, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard)
    filtered_indels = _variant_filtration(indels, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_hard)
    parts = [filtered_snps, filtered_indels]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(parts)
    merged = vcfutils.combine_variant_files(parts, merged_name, ref_file,
                                            data["config"])
    return _filter_nonref(merged, data)
def test_3_vcf_split_combine(self):
    """Round-trip a VCF: split it into SNP and indel files, then merge them.

    Uses the post-process YAML found under ``self.automated_dir`` with an
    empty ``algorithm`` section, and removes every generated file afterwards.
    """
    with make_workdir() as workdir:
        yaml_path = get_post_process_yaml(self.automated_dir, workdir)
        config = load_config(yaml_path)
        config["algorithm"] = {}
        ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq",
                                "hg19.fa")
        fname = os.path.join(self.var_dir, "S1-variants.vcf")
        snp_file, indel_file = vcfutils.split_snps_indels(fname, ref_file,
                                                          config)
        # Merged output sits next to the input as "<base>-merge<ext>.gz".
        merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
        vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                       ref_file, config)
        for generated in (snp_file, indel_file, merge_file):
            self._remove_vcf(generated)
def test_3_vcf_split_combine(self):
    """Round-trip a VCF: split it into SNP and indel files, then merge them.

    Uses the post-process YAML found under ``self.data_dir`` with an empty
    ``algorithm`` section, and removes every generated file afterwards.
    """
    with make_workdir() as workdir:
        yaml_path = get_post_process_yaml(self.data_dir, workdir)
        config = load_config(yaml_path)
        config["algorithm"] = {}
        ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq",
                                "hg19.fa")
        fname = os.path.join(self.var_dir, "S1-variants.vcf")
        snp_file, indel_file = vcfutils.split_snps_indels(fname, ref_file,
                                                          config)
        # Merged output sits next to the input as "<base>-merge<ext>.gz".
        merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
        vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                       ref_file, config)
        for generated in (snp_file, indel_file, merge_file):
            self._remove_vcf(generated)
def test_3_vcf_split_combine(self, global_config):
    """Round-trip a VCF: split it into SNP and indel files, then merge them.

    Loads the configuration from ``global_config``, blanks its ``algorithm``
    section, and removes every generated file afterwards.
    """
    from bcbio.variation import vcfutils

    config = load_config(global_config)
    config["algorithm"] = {}
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    snp_file, indel_file = vcfutils.split_snps_indels(fname, self.ref_file,
                                                      config)
    # Merged output sits next to the input as "<base>-merge<ext>.gz".
    merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
    vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                   self.ref_file, config)
    for generated in (snp_file, indel_file, merge_file):
        self._remove_vcf(generated)
def test_3_vcf_split_combine(self):
    """Round-trip a VCF: split it into SNP and indel files, then merge them.

    Uses the post-process YAML found under ``self.automated_dir`` with an
    empty ``algorithm`` section, and removes every generated file afterwards.
    """
    from bcbio.variation import vcfutils

    with make_workdir() as workdir:
        yaml_path = get_post_process_yaml(self.automated_dir, workdir)
        config = load_config(yaml_path)
        config["algorithm"] = {}
        fname = os.path.join(self.var_dir, "S1-variants.vcf")
        snp_file, indel_file = vcfutils.split_snps_indels(fname,
                                                          self.ref_file,
                                                          config)
        # Merged output sits next to the input as "<base>-merge<ext>.gz".
        merge_file = "%s-merge%s.gz" % os.path.splitext(fname)
        vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                       self.ref_file, config)
        for generated in (snp_file, indel_file, merge_file):
            self._remove_vcf(generated)
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Apply caller-appropriate filtering to a file of variant calls.

    Dispatch on the configured ``variantcaller``:

    - ``freebayes``: delegate to ``vfilter.freebayes``.
    - ``samtools``, ``varscan``: return the calls as they are.
    - anything else: split into SNP and indel files, filter each one with
      its dedicated helper and merge the results into a single
      ``<common prefix>combined.vcf`` file.
    """
    variantcaller = config["algorithm"].get("variantcaller")

    if variantcaller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, config)

    # These callers already filter while calling, so nothing more to do.
    if variantcaller in ("samtools", "varscan"):
        return call_file

    snps, indels = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration_snp(snps, ref_file, vrn_files,
                                            config)
    filtered_indels = _variant_filtration_indel(indels, ref_file, vrn_files,
                                                config)
    to_merge = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(to_merge)
    merged_name = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(to_merge, merged_name, ref_file,
                                          config)