def add_snp(snp, position, sequence):
    """Write the call for a single VCF site into ``sequence[position]``.

    When the record has no alternative allele (``snp.alt`` is "."), the
    reference base ``snp.ref`` is stored as-is. Otherwise the alleles returned
    by ``vcfwrap.get_ml_genotype`` are joined into one string and the result
    of passing that string through ``sequences.encode_genotype`` is stored.

    ``sequence`` is modified in place; nothing is returned.
    """
    if snp.alt == ".":
        # Invariant site: keep the reference base
        sequence[position] = snp.ref
        return

    alleles = vcfwrap.get_ml_genotype(snp)
    sequence[position] = sequences.encode_genotype("".join(alleles))
def main(argv):
    """Print a summary of the genotypes called at sites within BED regions.

    ``argv`` must contain exactly two items: the path of a BED file and the
    path of a tabix-indexed VCF file (in that order). Every VCF record
    selected by ``select_vcf_records`` is classified using its most likely
    diploid genotype, and the counts (with percentages) are printed to STDOUT.

    Returns 1 after writing a usage message to STDERR if the number of
    arguments is wrong; otherwise falls through and returns None.
    """
    if len(argv) != 2:
        # 'argv' excludes the script name, so take it from sys.argv
        sys.stderr.write("Usage: %s <BED-file> <VCF.bgz>\n" % (sys.argv[0],))
        return 1

    sites = 0
    sites_non_ref = 0
    sites_homo_non_ref = 0
    sites_het_one_non_ref = 0
    sites_het_two_non_ref = 0

    vcf_records = pysam.Tabixfile(argv[1])
    try:
        bed_records = read_bed_records(argv[0])

        for record in select_vcf_records(bed_records, vcf_records):
            if record.alt == '.':
                # Homozygous for the reference allele
                sites += 1
                continue

            # Get the most likely diploid genotype
            nt_a, nt_b = get_ml_genotype(record)
            if (nt_a, nt_b) == ('N', 'N'):
                # Skip sites with no most likely genotype
                continue

            sites += 1
            sites_non_ref += 1
            if nt_a == nt_b:
                sites_homo_non_ref += 1
            elif record.ref not in (nt_a, nt_b):
                sites_het_two_non_ref += 1
            else:
                sites_het_one_non_ref += 1
    finally:
        # Release the tabix handle even if reading the records fails
        vcf_records.close()

    def percentage(count):
        # Report 0% rather than dividing by zero when no sites were kept
        return (100.0 * count) / sites if sites else 0.0

    sites_ref = sites - sites_non_ref

    # Single-argument print calls behave identically under Python 2 and 3
    print("")
    print("%i sites kept after filtering:" % (sites,))
    print(" % 10i homozygous sites containing the reference allele (%.2f%%)"
          % (sites_ref, percentage(sites_ref)))
    print(" % 10i heterozygous sites containing the reference and a "
          "non-reference allele (%.2f%%)"
          % (sites_het_one_non_ref, percentage(sites_het_one_non_ref)))
    print(" % 10i homozygous sites containing a single non-reference "
          "allele (%.2f%%)"
          % (sites_homo_non_ref, percentage(sites_homo_non_ref)))
    print(" % 10i heterozygous sites containing two different "
          "non-reference alleles (%.2f%%)"
          % (sites_het_two_non_ref, percentage(sites_het_two_non_ref)))
def add_indel(options, bed, indel, sequence):
    """Apply a homozygous indel call to ``sequence``, modifying it in place.

    Records are ignored when they have no alternative allele, when the most
    likely genotype is heterozygous (there is no way to represent that), or
    when there is no most likely genotype ("N").

    For deletions, every covered position inside [bed.start, bed.end) is set
    to the empty string; for insertions, the inserted bases are appended to
    the entry at the insertion position. When
    ``options.whole_codon_indels_only`` is set, only indels whose (overlapping)
    length is a multiple of 3 are applied.

    Note that ``indel.alt`` is overwritten with the selected allele.
    """
    if indel.alt == ".":
        return

    genotype = vcfwrap.get_ml_genotype(indel)
    # Skip heterozygous calls and sites without a most likely genotype
    if genotype[0] != genotype[1] or genotype[0] == "N":
        return

    # Coordinate corresponding to sequence[0]; the sequence includes padding.
    # Note that bed.end is a past-the-end coordinate
    seq_offset = max(0, bed.start - options.padding)

    # FIXME: parse_indel only supports a single 'alt' values
    indel.alt = genotype[0]
    parsed = vcfwrap.parse_indel(indel)

    if parsed.in_reference:
        # Deletion: clip the deleted range to the region of interest
        first = max(parsed.pos + 1, bed.start)
        last = min(parsed.pos + 1 + len(parsed.what), bed.end)
        overlap = last - first

        if overlap <= 0:
            # Deletion does not cover any bases of interest
            return
        if options.whole_codon_indels_only and overlap % 3:
            # Non-codon sized overlap with area of interest
            return

        for index in range(first - seq_offset, last - seq_offset):
            sequence[index] = ""
        return

    inserted = parsed.what
    if len(inserted) % 3 and options.whole_codon_indels_only:
        # Insertion is not a whole number of codons
        return

    # parse_indel assumes that the insertion is always the first possible
    # base when multiple positions are possible. As a consequence, the
    # position may be before the start of the sequence, with the rest of the
    # bases overlapping the current sequence. For example:
    #   ref = ATTT
    #   alt = ATTTT
    # It is assumed that the insertion (_) happened thus:
    #   interpretation = A_TTT
    if parsed.pos >= seq_offset:
        sequence[parsed.pos - seq_offset] += inserted
def main(argv):
    """Print a summary of the genotypes called at sites within BED regions.

    ``argv`` must contain exactly two items: the path of a BED file and the
    path of a tabix-indexed VCF file (in that order). Every VCF record
    selected by ``select_vcf_records`` is classified using its most likely
    diploid genotype, and the counts (with percentages) are printed to STDOUT.

    Returns 1 after writing a usage message to STDERR if the number of
    arguments is wrong; otherwise falls through and returns None.
    """
    if len(argv) != 2:
        # 'argv' excludes the script name, so take it from sys.argv
        sys.stderr.write("Usage: %s <BED-file> <VCF.bgz>\n" % (sys.argv[0],))
        return 1

    sites = 0
    sites_non_ref = 0
    sites_homo_non_ref = 0
    sites_het_one_non_ref = 0
    sites_het_two_non_ref = 0

    vcf_records = pysam.Tabixfile(argv[1])
    try:
        bed_records = read_bed_records(argv[0])

        for record in select_vcf_records(bed_records, vcf_records):
            if record.alt == '.':
                # Homozygous for the reference allele
                sites += 1
                continue

            # Get the most likely diploid genotype
            nt_a, nt_b = get_ml_genotype(record)
            if (nt_a, nt_b) == ('N', 'N'):
                # Skip sites with no most likely genotype
                continue

            sites += 1
            sites_non_ref += 1
            if nt_a == nt_b:
                sites_homo_non_ref += 1
            elif record.ref not in (nt_a, nt_b):
                sites_het_two_non_ref += 1
            else:
                sites_het_one_non_ref += 1
    finally:
        # Release the tabix handle even if reading the records fails
        vcf_records.close()

    def percentage(count):
        # Report 0% rather than dividing by zero when no sites were kept
        return (100.0 * count) / sites if sites else 0.0

    sites_ref = sites - sites_non_ref

    # Single-argument print calls behave identically under Python 2 and 3
    print("")
    print("%i sites kept after filtering:" % (sites,))
    print(" % 10i homozygous sites containing the reference allele (%.2f%%)"
          % (sites_ref, percentage(sites_ref)))
    print(" % 10i heterozygous sites containing the reference and a "
          "non-reference allele (%.2f%%)"
          % (sites_het_one_non_ref, percentage(sites_het_one_non_ref)))
    print(" % 10i homozygous sites containing a single non-reference "
          "allele (%.2f%%)"
          % (sites_homo_non_ref, percentage(sites_homo_non_ref)))
    print(" % 10i heterozygous sites containing two different "
          "non-reference alleles (%.2f%%)"
          % (sites_het_two_non_ref, percentage(sites_het_two_non_ref)))
def _filter_by_properties(options, vcfs, frequencies): """Filters a list of SNPs/indels based on the various properties recorded in the info column, and others. This mirrors most of the filtering carried out by vcfutils.pl varFilter.""" for vcf in vcfs: if float(vcf.qual) < options.min_quality: _mark_as_filtered(vcf, "q:%i" % options.min_quality) properties = {} for field in vcf.info.split(";"): if "=" in field: key, value = field.split("=") else: key, value = field, None properties[key] = value read_depth = float(properties["DP"]) if options.min_read_depth > read_depth: _mark_as_filtered(vcf, "d:%i" % options.min_read_depth) elif options.max_read_depth < read_depth: _mark_as_filtered(vcf, "D:%i" % options.max_read_depth) if "MQ" in properties: if float(properties["MQ"]) < options.min_mapping_quality: _mark_as_filtered(vcf, "Q:%i" % options.min_mapping_quality) if "PV4" in properties: pv4 = [float(value) for value in properties["PV4"].split(",")] if (pv4[0] < options.min_strand_bias): _mark_as_filtered(vcf, "1:%e" % options.min_strand_bias) if (pv4[1] < options.min_baseq_bias): _mark_as_filtered(vcf, "2:%e" % options.min_baseq_bias) if (pv4[2] < options.min_mapq_bias): _mark_as_filtered(vcf, "3:%e" % options.min_mapq_bias) if (pv4[3] < options.min_end_distance_bias): _mark_as_filtered(vcf, "4:%e" % options.min_end_distance_bias) if vcf.alt != ".": ref_fw, ref_rev, alt_fw, alt_rev = map(int, properties["DP4"].split(",")) if (alt_fw + alt_rev) < options.min_num_alt_bases: _mark_as_filtered(vcf, "a:%i" % options.min_num_alt_bases) ml_genotype = vcfwrap.get_ml_genotype(vcf) if (ml_genotype == ("N", "N")) and not options.keep_ambigious_genotypes: # No most likely genotype _mark_as_filtered(vcf, "k") if (ml_genotype[0] != ml_genotype[1]): if vcf.contig in options.homozygous_chromosome: _mark_as_filtered(vcf, "HET") # Filter by frequency of minor allele if vcf.ref in ml_genotype: n_minor = min(ref_fw + ref_rev, alt_fw + alt_rev) n_major = max(ref_fw + ref_rev, alt_fw + 
alt_rev) if (n_minor / float(n_minor + n_major)) < options.min_allele_frequency: _mark_as_filtered(vcf, "f:%.4f" % options.min_allele_frequency) else: state = frequencies.frequency_is_valid(vcf.contig, vcf.pos, vcf.ref, *ml_genotype) if state is frequencies.INVALID: _mark_as_filtered(vcf, "f:%.4f" % options.min_allele_frequency) elif state is frequencies.NA: if _mark_as_filtered(vcf, "F:%.4f" % options.min_allele_frequency): sys.stderr.write("WARNING: Could not determine allele-counts for SNP at %s:%s, filtering ...\n" % (vcf.contig, vcf.pos + 1))
def _filter_by_properties(options, vcfs, frequencies): """Filters a list of SNPs/indels based on the various properties recorded in the info column, and others. This mirrors most of the filtering carried out by vcfutils.pl varFilter.""" for vcf in vcfs: if float(vcf.qual) < options.min_quality: _mark_as_filtered(vcf, "q:%i" % options.min_quality) properties = {} for field in vcf.info.split(";"): if "=" in field: key, value = field.split("=") else: key, value = field, None properties[key] = value read_depth = float(properties["DP"]) if options.min_read_depth > read_depth: _mark_as_filtered(vcf, "d:%i" % options.min_read_depth) elif options.max_read_depth < read_depth: _mark_as_filtered(vcf, "D:%i" % options.max_read_depth) if "MQ" in properties: if float(properties["MQ"]) < options.min_mapping_quality: _mark_as_filtered(vcf, "Q:%i" % options.min_mapping_quality) if "PV4" in properties: pv4 = [float(value) for value in properties["PV4"].split(",")] if (pv4[0] < options.min_strand_bias): _mark_as_filtered(vcf, "1:%e" % options.min_strand_bias) if (pv4[1] < options.min_baseq_bias): _mark_as_filtered(vcf, "2:%e" % options.min_baseq_bias) if (pv4[2] < options.min_mapq_bias): _mark_as_filtered(vcf, "3:%e" % options.min_mapq_bias) if (pv4[3] < options.min_end_distance_bias): _mark_as_filtered(vcf, "4:%e" % options.min_end_distance_bias) if vcf.alt != ".": ref_fw, ref_rev, alt_fw, alt_rev = map( int, properties["DP4"].split(",")) if (alt_fw + alt_rev) < options.min_num_alt_bases: _mark_as_filtered(vcf, "a:%i" % options.min_num_alt_bases) ml_genotype = vcfwrap.get_ml_genotype(vcf) if (ml_genotype == ("N", "N")) and not options.keep_ambigious_genotypes: # No most likely genotype _mark_as_filtered(vcf, "k") if (ml_genotype[0] != ml_genotype[1]): if vcf.contig in options.homozygous_chromosome: _mark_as_filtered(vcf, "HET") # Filter by frequency of minor allele if vcf.ref in ml_genotype: n_minor = min(ref_fw + ref_rev, alt_fw + alt_rev) n_major = max(ref_fw + ref_rev, alt_fw + 
alt_rev) if (n_minor / float(n_minor + n_major) ) < options.min_allele_frequency: _mark_as_filtered( vcf, "f:%.4f" % options.min_allele_frequency) else: state = frequencies.frequency_is_valid( vcf.contig, vcf.pos, vcf.ref, *ml_genotype) if state is frequencies.INVALID: _mark_as_filtered( vcf, "f:%.4f" % options.min_allele_frequency) elif state is frequencies.NA: if _mark_as_filtered( vcf, "F:%.4f" % options.min_allele_frequency): sys.stderr.write( "WARNING: Could not determine allele-counts for SNP at %s:%s, filtering ...\n" % (vcf.contig, vcf.pos + 1))