Ejemplo n.º 1
0
def vcf_dp(fn_list, max_dp=1024):
    """
    Tabulate per-depth record counts and QUAL statistics from VCF files.

    Each file in ``fn_list`` is read with ``vcf_parser`` and must contain
    exactly one sample.  For every record, the depth is taken as the sum
    of the comma separated integers in the sample's ``AD`` field.  Records
    whose depth is ``max_dp`` or greater are ignored.

    NOTE(review): index 7 of a parsed row is read as QUAL and index 11 as
    the per-sample genotype mapping -- confirm against ``vcf_parser``.

    Parameters:
        fn_list: iterable of VCF filenames.
        max_dp: exclusive upper bound on the depths that are tabulated.

    Returns:
        A tuple ``(dp_array, qual_mean, qual_var)`` where ``dp_array`` is
        an integer numpy array of length ``max_dp`` holding the number of
        records seen at each depth, and ``qual_mean`` / ``qual_var`` are
        lists of length ``max_dp`` with the mean and variance of QUAL at
        each depth (0 for depths with no records).

    Raises:
        Exception: if a file does not contain exactly one sample.
    """
    dp_array = np.zeros(max_dp, dtype=int)
    qual_dictlist = collections.defaultdict(list)
    for filename in fn_list:
        vp = vcf_parser(filename, yield_samples=True, parse_genotypes=True)
        # The first item produced by the parser is the list of sample names.
        samples = next(vp)
        if len(samples) != 1:
            raise Exception("{} samples. Single sample vcf expected.".format(len(samples)))
        s = samples[0]
        for vline in vp:
            qual = float(vline[7])
            ad = vline[11][s]["AD"]
            # Use the builtin sum over a generator: np.sum(map(...)) does
            # not add up the values on Python 3, where map() is lazy.
            dp = sum(int(x) for x in ad.split(","))
            if dp < max_dp:
                dp_array[dp] += 1
                qual_dictlist[dp].append(qual)

    qual_mean = []
    qual_var = []
    for i in range(max_dp):
        # .get() avoids inserting an empty list for every unseen depth.
        quals = qual_dictlist.get(i)
        qual_mean.append(np.mean(quals) if quals else 0)
        qual_var.append(np.var(quals) if quals else 0)
    return dp_array, qual_mean, qual_var
Ejemplo n.º 2
0
def filter_snps_by_dist(vcf_filename, plen):
    """
    Yield SNPs which are at least plen positions away from other SNPs.

    The VCF is read with ``vcf_parser`` and must contain exactly one
    sample.  Each record is compared only with the record immediately
    before it, so records are presumably expected in sorted order (verify
    against the caller).  Two consecutive records on the same chromosome
    that are fewer than ``plen`` positions apart are both dropped.

    Parameters:
        vcf_filename: path of the single-sample VCF to read.
        plen: minimum distance required between neighbouring records.

    Yields:
        ``[chrom, pos, ref, alt, dp]`` lists, where ``ref`` and ``alt``
        are upper-cased and ``dp`` is the sum of the comma separated
        integers in the sample's ``AD`` field.  Nothing is yielded when
        the VCF has no records.

    Raises:
        Exception: if the VCF does not contain exactly one sample (raised
            when iteration starts, as this is a generator).
    """

    vp = vcf_parser(vcf_filename, yield_samples=True, parse_genotypes=True)
    # The first item produced by the parser is the list of sample names.
    samples = next(vp)
    if len(samples) != 1:
        raise Exception("{} samples. Single sample vcf expected.".format(
            len(samples)))

    sample = samples[0]

    # ``last`` is the previous record, held back until we know whether the
    # following record is too close.  ``exclude`` is True when ``last`` has
    # already been found too close to a neighbour.
    last = None
    exclude = False

    for vline in vp:
        chrom = vline[2]
        pos = vline[3]
        ref = vline[5].upper()
        alt = vline[6].upper()
        ad = vline[11][sample]["AD"]
        # Use the builtin sum over a generator: np.sum(map(...)) does not
        # add up the values on Python 3, where map() is lazy.
        dp = sum(int(x) for x in ad.split(","))

        if last is not None:
            if last[0] == chrom and pos - last[1] < plen:
                # Too close: drop ``last`` and mark the current record.
                exclude = True
            elif not exclude:
                yield last
            else:
                # ``last`` was dropped, but the current record is far
                # enough from it to start with a clean slate.
                exclude = False

        last = [chrom, pos, ref, alt, dp]

    # Flush the final record; the None check avoids yielding a spurious
    # None when the VCF contained no records at all.
    if last is not None and not exclude:
        yield last
Ejemplo n.º 3
0
if __name__ == "__main__":
    args = parse_args()

    chrom_dict = parse_fai(args.fai)
    loci = parse_bed(args.bed)

    total_dp = collections.defaultdict(lambda: collections.Counter())

    pdf = PdfPages("plot_chrom_cov_from_vcf.pdf")
    fig_w, fig_h = plt.figaspect(4.0 / 1.0)
    #fig_w, fig_h = plt.figaspect(9.0/16.0)
    #fig_w, fig_h = plt.figaspect(3.0/4.0)

    for filename in args.vcf:
        vparser = parsers.vcf_parser(filename)
        samples = next(vparser)

        x_pos = collections.defaultdict(lambda: collections.defaultdict(list))
        y_dp = collections.defaultdict(lambda: collections.defaultdict(list))

        max_dp = collections.defaultdict(int)

        for row in vparser:
            (lineno, line, chrom, pos, id, ref, alt, qual, filter, info_str,
             fmt_fields, gts) = row

            for sample in samples:
                dp = gts[sample].get("DP")
                if not dp:
                    print(
Ejemplo n.º 4
0
from __future__ import print_function
import sys
from parsers import vcf_parser
import numpy as np
import collections

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("usage: {} file.vcf DP1 [...DPN]".format(sys.argv[0]),
              file=sys.stderr)
        exit(1)

    dp_match = set(map(int, sys.argv[2:]))

    vp = vcf_parser(sys.argv[1], yield_samples=True, parse_genotypes=True)
    samples = next(vp)
    if len(samples) != 1:
        raise Exception("{} samples. Single sample vcf expected.".format(
            len(samples)))

    s = samples[0]

    for vline in vp:
        chrom = vline[2]
        pos = vline[3]
        qual = float(vline[7])
        genotype_field = vline[11][s]
        ad = genotype_field["AD"]
        dp = np.sum(map(int, ad.split(",")))
        if dp in dp_match:
Ejemplo n.º 5
0
import matplotlib.gridspec as gridspec
import parsers

if __name__ == "__main__":

    if len(sys.argv) < 2:
        print("usage: {} in1.vcf [in2.vcf...inN.vcf]".format(sys.argv[0]))
        sys.exit(1)

    pdf = PdfPages("plot_hist_dp_qual.pdf")
    fig_w, fig_h = plt.figaspect(9.0 / 16.0)
    #fig_w, fig_h = plt.figaspect(3.0/4.0)

    for vcf_in in sys.argv[1:]:

        vparser = parsers.vcf_parser(vcf_in)
        samples = next(vparser)
        if (len(samples) != 1):
            print("Error: {}: multisample vcf is unexpected.".format(vcf_in),
                  file=sys.stderr)
            sys.exit(1)

        sample = samples[0]

        qual_list = []
        dp_list = []
        gt_count = collections.Counter()

        for row in vparser:
            (lineno, line, chrom, pos, id, ref, alt, qual, filter, info_str,
             fmt_fields, gts) = row
Ejemplo n.º 6
0
            print("Error: no individuals in group ``{}''".format(args.group),
                  file=sys.stderr)
            exit(1)
    else:
        ii = list(range(len(indlist)))

    alleles = {}
    for (chrom, pos), g in zip(parse_snp(snp_fn), parse_geno(geno_fn)):
        gts = [int(g[i]) for i in ii if g[i] != "9"]
        if len(gts) == 0:
            continue
        n_ref = sum(gts)
        n_alt = 2 * len(gts) - n_ref
        alleles[(chrom, pos)] = (n_alt, n_ref + n_alt)

    vparser = vcf_parser(args.vcf)
    vcf_samples = next(vparser)

    assert len(vcf_samples) == 1, "multi-sample vcf not supported"
    sample = vcf_samples[0]

    gt_map = {"./.": -1, ".|.": -1}
    for a0 in range(9):
        for a1 in range(9):
            n_alts = sum(1 if a else 0 for a in [a0, a1])
            gt_map["{}/{}".format(a0, a1)] = n_alts
            gt_map["{}|{}".format(a0, a1)] = n_alts

    prev_pos = 0
    prev_chrom = None
    last = (-1, -1, -1)