def count_gts(sample_list, vcf_path=None):
    """Count genotype classes per variant, skipping chrX and chrY records.

    Args:
        sample_list: Samples to restrict the VCF to (handed to
            ``VCF.set_samples``).
        vcf_path: Path of the VCF to read. When omitted, the first
            command-line argument (``sys.argv[1]``) is used, which is the
            historical behaviour of this function.

    Returns:
        A DataFrame with one row per retained variant and the columns
        "SV" (variant ID), "ref" (hom-ref count), "het", "alt"
        (hom-alt count) and "missing" (unknown-genotype count).
    """
    if vcf_path is None:
        vcf_path = sys.argv[1]

    # load VCF and choose only samples of interest
    vcf = VCF(vcf_path)
    try:
        vcf.set_samples(sample_list)

        ids, refs, hets, alts, missing = [], [], [], [], []

        # count # of samples of each genotype for every variant
        for variant in vcf:
            if variant.CHROM in ("chrX", "chrY"):
                continue  # skip X, Y chromosome variants
            ids.append(variant.ID)
            refs.append(variant.num_hom_ref)
            hets.append(variant.num_het)
            alts.append(variant.num_hom_alt)
            missing.append(variant.num_unknown)
    finally:
        # Release the file handle even if iteration fails part-way.
        vcf.close()

    # create dataframe with gt counts
    return pd.DataFrame({
        "SV": ids,
        "ref": refs,
        "het": hets,
        "alt": alts,
        "missing": missing,
    })
def count_gts_xchr(sample_list, sex, vcf_path=None):
    """Count genotype classes per chrX variant for one sex group.

    Args:
        sample_list: Samples to restrict the VCF to (handed to
            ``VCF.set_samples``).
        sex: ``"male"`` folds heterozygous calls into the alt count; any
            other value keeps het and alt separate.
        vcf_path: Path of the VCF to read. When omitted, the first
            command-line argument (``sys.argv[1]``) is used, which is the
            historical behaviour of this function.

    Returns:
        For ``sex == "male"``: a DataFrame with columns
        "SV", "ref", "alt", "missing", where "alt" is het + hom-alt.
        Otherwise: a DataFrame with columns "ref", "het", "alt", "missing".
        Note that the non-male result carries no "SV" column; that
        asymmetry is preserved because callers may rely on it.
    """
    if vcf_path is None:
        vcf_path = sys.argv[1]

    # load VCF and choose only samples of interest
    vcf = VCF(vcf_path)
    try:
        vcf.set_samples(sample_list)

        ids, refs, hets, alts, missing = [], [], [], [], []

        # count # of samples of each genotype for every variant
        for variant in vcf:
            if variant.CHROM != "chrX":
                continue  # only look at variants on X chromosome
            ids.append(variant.ID)
            refs.append(variant.num_hom_ref)
            hets.append(variant.num_het)
            alts.append(variant.num_hom_alt)
            missing.append(variant.num_unknown)
    finally:
        # Release the file handle even if iteration fails part-way.
        vcf.close()

    # create dataframe with gt counts
    df = pd.DataFrame({
        "SV": ids,
        "ref": refs,
        "het": hets,
        "alt": alts,
        "missing": missing,
    })

    if sex == "male":
        # incorporate het samples into alt allele calls
        # (presumably because males carry a single X -- confirm with caller)
        df["alt"] = df["het"] + df["alt"]
        return df[["SV", "ref", "alt", "missing"]]
    return df[["ref", "het", "alt", "missing"]]
def test_set_samples():
    """Restricting the reader to one sample narrows samples and genotypes."""
    reader = VCF(VCF_PATH)

    # The fixture is expected to hold 189 samples before any restriction.
    total = len(reader.samples)
    assert total == 189, total

    # Keep only the third sample of the file.
    chosen = reader.samples[2]
    reader.set_samples([chosen])
    assert len(reader.samples) == 1

    # Per-variant genotype arrays must shrink to the selected sample too.
    first_variant = next(reader)
    assert len(first_variant.gt_types) == 1
def get_call_rate_Ychr(sample_list):
    """Return the call rate of every chrY variant for the given samples.

    Reads the VCF at the module-level ``path_to_vcf``, restricted to
    ``sample_list``, and returns a DataFrame with the columns "SV"
    (variant ID) and "call_rate".
    """
    reader = VCF(path_to_vcf)
    reader.set_samples(sample_list)

    # Single pass over the file: keep (ID, call rate) for chrY records only.
    y_records = [
        (record.ID, record.call_rate)
        for record in reader
        if record.CHROM == "chrY"
    ]

    sv_ids = [sv_id for sv_id, _ in y_records]
    rates = [rate for _, rate in y_records]
    return pd.DataFrame({"SV": sv_ids, "call_rate": rates})
if self.freqs[base] != 0.0: print(self.sample + " Already has a a freq of " + str(self.freqs[base]) + " at " + self.chrom + ' ' + str(self.pos)) else: self.freqs[base] = freq def print_var(self): print(self.sample, self.chrom, self.pos, self.ref, self.depth, self.freqs) variants = {} for sample in samples: vcf = VCF(path) vcf.set_samples(sample) for var in vcf: dp = var.format("DP")[0] ad = var.format("AD")[0] alts = var.ALT bases = [var.REF] for alt in alts: bases.append(alt) name = sample + str(var.CHROM) + str(var.POS) if dp[0] != -2147483648: i = 0 while i < len(bases): if name not in variants: variants[name] = _var(sample, var.CHROM, var.POS, var.REF, dp[0]) variants[name].set_freq(bases[i], ad[i])
def vcf_to_fasta_markers(args):
    """Yield FASTA text (as bytes) for the reference and for each sample haplotype.

    The generator first yields every record of the input FASTA unchanged.
    It then re-reads the FASTA once per sample and per haplotype index
    (0 and 1) and yields a ``>sample.hap.name`` header followed by the
    sequence with the haplotype's non-zero alleles spliced in from the VCF.
    While doing so it calls ``write_marker`` on ``<args.out>.markers`` for
    each spliced variant and, if ``args.save_fasta`` is set, mirrors every
    yielded chunk into ``<args.out>.from_vcf.fa``.

    Attributes read from ``args``: ``samples``, ``nsamples``, ``vcf``,
    ``fasta``, ``save_fasta``, ``out``, ``bytes``, ``endian``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed listing; confirm the placement of ``pos += 1``,
    ``bcf.close()`` and ``fasta_in.seek(0)`` against the original file.
    """
    # Sample names: one per line from a file, or the (optionally truncated)
    # list reported by get_samples_from_bcf.
    if args.samples:
        samples = open(args.samples).read().strip().split("\n")
    elif args.nsamples:
        samples = get_samples_from_bcf(args.vcf)[:args.nsamples]
    else:
        samples = get_samples_from_bcf(args.vcf)
    if args.fasta == "-":
        fasta_in = sys.stdin
    else:
        fasta_in = open(args.fasta)
    # Running offset in the emitted text, passed to write_marker.
    pos = 0
    if args.save_fasta:
        fasta_out = open(args.out + ".from_vcf.fa", "w")
    marker_out = open(args.out + ".markers", "wb")
    # First, emit the reference sequence unchanged.
    for name, seq, qual in readfq(fasta_in):
        pos += len(seq)
        to_write = ">{}\n{}\n".format(name, seq)
        if args.save_fasta:
            fasta_out.write(to_write)
        yield to_write.encode()
    # Rewind so the FASTA can be read again for the haplotypes.
    # NOTE(review): this is also executed when fasta_in is sys.stdin;
    # a non-seekable stdin would presumably fail here -- confirm.
    fasta_in.seek(0)
    # Then emit each haplotype of each sample.
    for sample in samples:
        for i in range(2):  # process two haplotypes
            for name, seq, qual in readfq(fasta_in):
                # [prec_start, prec_end) is the reference span of the last
                # variant applied on this contig.
                prec_end = 0
                prec_start = 0
                to_write = ">{}.{}.{}\n".format(sample, i + 1, name)
                if args.save_fasta:
                    fasta_out.write(to_write)
                yield to_write.encode()
                # A fresh reader per contig, restricted to this sample.
                bcf = VCF(args.vcf)
                bcf.set_samples([sample])
                # Query the records of the contig called `name`.
                recs = bcf.__call__(name)
                for rec in recs:
                    gts = rec.genotypes[0]
                    # Assuming cyvcf2: a genotype entry is the alleles plus
                    # a trailing phased flag, so length 3 means two alleles.
                    if (len(gts) != 3):
                        die("error: number of genotypes for {} at marker {} is not 2!"
                            .format(sample, rec.ID))
                    # Only alleles with a truthy (non-zero) index are spliced.
                    # NOTE(review): a missing allele encoded as a negative
                    # number would also be truthy here -- confirm inputs.
                    if gts[i]:
                        # Skip records starting inside the previous variant's
                        # reference span.
                        if rec.start in range(prec_start, prec_end):
                            sys.stderr.write(
                                "warning: skipping {} at {}:{} because it overlaps a previous marker\n"
                                .format(rec.ID, rec.CHROM, rec.start))
                            continue
                        pos += rec.start - prec_end  # ffwd current position
                        if len(rec.REF) >= len(
                                rec.ALT[gts[i] - 1]):  # del and snp
                            # One marker, and the offset advances by one.
                            write_marker(marker_out, pos, rec.start, gts[i],
                                         args.bytes, args.endian)
                            pos += 1
                        elif len(rec.REF) < len(rec.ALT[gts[i] - 1]):  # insertion
                            # One marker per position counted by ins_size.
                            ins_size = len(rec.ALT[gts[i] - 1]) - len(
                                rec.REF) + 1
                            for j in range(ins_size):
                                write_marker(marker_out, pos, rec.start, gts[i],
                                             args.bytes, args.endian)
                                pos += 1
                        # Emit the untouched reference up to the variant,
                        # followed by the chosen ALT allele.
                        to_write = seq[prec_end:rec.start] + rec.ALT[gts[i] - 1]
                        if args.save_fasta:
                            fasta_out.write(to_write)
                        yield to_write.encode()
                        prec_start = rec.start
                        # prec_end skips over to position just after record
                        prec_end = rec.start + len(rec.REF)
                bcf.close()
                # Emit the remainder of the contig after the last variant.
                pos += len(seq) - prec_end
                to_write = seq[prec_end:] + "\n"
                if args.save_fasta:
                    fasta_out.write(to_write)
                yield to_write.encode()
            # Rewind the FASTA for the next haplotype / sample.
            fasta_in.seek(0)
    if args.save_fasta:
        fasta_out.close()
    marker_out.close()
    fasta_in.close()