def vcf_dp(fn_list, max_dp=1024):
    """
    Tabulate depth counts and per-depth QUAL statistics for single-sample VCFs.

    For every record in every file, the depth is taken as the sum of the
    comma-separated integers in the sample's "AD" genotype field.  Records
    whose depth is >= max_dp are ignored.

    Parameters:
        fn_list: iterable of VCF filenames, each passed to vcf_parser().
        max_dp: exclusive upper bound on the depths that are tabulated; also
            the length of each returned sequence.

    Returns:
        A tuple (dp_array, qual_mean, qual_var), each of length max_dp and
        indexed by depth:
            dp_array  -- numpy int array, number of records at that depth.
            qual_mean -- list, mean QUAL of those records (0 if none).
            qual_var  -- list, variance of QUAL of those records (0 if none).

    Raises:
        Exception: if a file does not contain exactly one sample.
    """
    dp_array = np.zeros(max_dp, dtype=int)
    qual_dictlist = collections.defaultdict(list)

    for filename in fn_list:
        vp = vcf_parser(filename, yield_samples=True, parse_genotypes=True)
        # The first item produced by the parser is the list of sample names.
        samples = next(vp)
        if len(samples) != 1:
            raise Exception("{} samples. Single sample vcf expected.".format(len(samples)))
        s = samples[0]

        for vline in vp:
            qual = float(vline[7])
            genotype_field = vline[11][s]
            ad = genotype_field["AD"]
            # Materialise a list before summing: on Python 3 map() is a lazy
            # iterator and np.sum() would not reduce it to an integer.
            dp = np.sum([int(n) for n in ad.split(",")])
            if dp < max_dp:
                dp_array[dp] += 1
                qual_dictlist[dp].append(qual)

    qual_mean = []
    qual_var = []
    for depth in range(max_dp):
        # .get() avoids inserting empty lists into the defaultdict.
        quals = qual_dictlist.get(depth)
        if quals:
            qual_mean.append(np.mean(quals))
            qual_var.append(np.var(quals))
        else:
            qual_mean.append(0)
            qual_var.append(0)

    return dp_array, qual_mean, qual_var
def filter_snps_by_dist(vcf_filename, plen):
    """
    Yield SNPs which are plen positions away from other SNPs.

    Records are read in file order from a single-sample VCF.  A record is
    dropped when the next or previous record on the same chromosome is fewer
    than plen positions away; this assumes records are sorted by position
    within each chromosome (TODO confirm against the callers' inputs).

    Parameters:
        vcf_filename: VCF file passed to vcf_parser().
        plen: minimum distance, in positions, required between a yielded SNP
            and its neighbouring records on the same chromosome.

    Yields:
        Lists of [chrom, pos, ref, alt, dp], where ref and alt are
        upper-cased and dp is the sum of the comma-separated integers in
        the sample's "AD" genotype field.  Nothing is yielded for a VCF
        with no records.

    Raises:
        Exception: if the file does not contain exactly one sample.
    """
    vp = vcf_parser(vcf_filename, yield_samples=True, parse_genotypes=True)
    # The first item produced by the parser is the list of sample names.
    samples = next(vp)
    if len(samples) != 1:
        raise Exception("{} samples. Single sample vcf expected.".format(
                            len(samples)))
    sample = samples[0]

    last = None      # previous record, held back until its successor is seen
    exclude = False  # True when `last` is too close to the record before it

    for vline in vp:
        chrom = vline[2]
        pos = vline[3]
        ref = vline[5].upper()
        alt = vline[6].upper()
        genotype_field = vline[11][sample]
        ad = genotype_field["AD"]
        # Materialise a list before summing: on Python 3 map() is a lazy
        # iterator and np.sum() would not reduce it to an integer.
        dp = np.sum([int(n) for n in ad.split(",")])

        if last is not None:
            if last[0] == chrom and pos - last[1] < plen:
                # Too close: drop `last`, and mark the current record too.
                exclude = True
            elif not exclude:
                yield last
            else:
                # `last` was dropped because of its predecessor; the current
                # record is far enough away to start with a clean slate.
                exclude = False

        last = [chrom, pos, ref, alt, dp]

    # Flush the final record.  Guard against an empty VCF, where `last` is
    # still None and must not be yielded.
    if last is not None and not exclude:
        yield last
if __name__ == "__main__": args = parse_args() chrom_dict = parse_fai(args.fai) loci = parse_bed(args.bed) total_dp = collections.defaultdict(lambda: collections.Counter()) pdf = PdfPages("plot_chrom_cov_from_vcf.pdf") fig_w, fig_h = plt.figaspect(4.0 / 1.0) #fig_w, fig_h = plt.figaspect(9.0/16.0) #fig_w, fig_h = plt.figaspect(3.0/4.0) for filename in args.vcf: vparser = parsers.vcf_parser(filename) samples = next(vparser) x_pos = collections.defaultdict(lambda: collections.defaultdict(list)) y_dp = collections.defaultdict(lambda: collections.defaultdict(list)) max_dp = collections.defaultdict(int) for row in vparser: (lineno, line, chrom, pos, id, ref, alt, qual, filter, info_str, fmt_fields, gts) = row for sample in samples: dp = gts[sample].get("DP") if not dp: print(
from __future__ import print_function import sys from parsers import vcf_parser import numpy as np import collections if __name__ == "__main__": if len(sys.argv) < 3: print("usage: {} file.vcf DP1 [...DPN]".format(sys.argv[0]), file=sys.stderr) exit(1) dp_match = set(map(int, sys.argv[2:])) vp = vcf_parser(sys.argv[1], yield_samples=True, parse_genotypes=True) samples = next(vp) if len(samples) != 1: raise Exception("{} samples. Single sample vcf expected.".format( len(samples))) s = samples[0] for vline in vp: chrom = vline[2] pos = vline[3] qual = float(vline[7]) genotype_field = vline[11][s] ad = genotype_field["AD"] dp = np.sum(map(int, ad.split(","))) if dp in dp_match:
import matplotlib.gridspec as gridspec import parsers if __name__ == "__main__": if len(sys.argv) < 2: print("usage: {} in1.vcf [in2.vcf...inN.vcf]".format(sys.argv[0])) sys.exit(1) pdf = PdfPages("plot_hist_dp_qual.pdf") fig_w, fig_h = plt.figaspect(9.0 / 16.0) #fig_w, fig_h = plt.figaspect(3.0/4.0) for vcf_in in sys.argv[1:]: vparser = parsers.vcf_parser(vcf_in) samples = next(vparser) if (len(samples) != 1): print("Error: {}: multisample vcf is unexpected.".format(vcf_in), file=sys.stderr) sys.exit(1) sample = samples[0] qual_list = [] dp_list = [] gt_count = collections.Counter() for row in vparser: (lineno, line, chrom, pos, id, ref, alt, qual, filter, info_str, fmt_fields, gts) = row
print("Error: no individuals in group ``{}''".format(args.group), file=sys.stderr) exit(1) else: ii = list(range(len(indlist))) alleles = {} for (chrom, pos), g in zip(parse_snp(snp_fn), parse_geno(geno_fn)): gts = [int(g[i]) for i in ii if g[i] != "9"] if len(gts) == 0: continue n_ref = sum(gts) n_alt = 2 * len(gts) - n_ref alleles[(chrom, pos)] = (n_alt, n_ref + n_alt) vparser = vcf_parser(args.vcf) vcf_samples = next(vparser) assert len(vcf_samples) == 1, "multi-sample vcf not supported" sample = vcf_samples[0] gt_map = {"./.": -1, ".|.": -1} for a0 in range(9): for a1 in range(9): n_alts = sum(1 if a else 0 for a in [a0, a1]) gt_map["{}/{}".format(a0, a1)] = n_alts gt_map["{}|{}".format(a0, a1)] = n_alts prev_pos = 0 prev_chrom = None last = (-1, -1, -1)