def get_tract_snps(vcf, chrom, start, end):
    '''
    (str, str, int, int) -> int
    helper function that returns # of variants in given window

    vcf is handed straight to VCF(); the window is queried with the
    region string 'chrom:start-end' and every record the query yields
    is counted.
    '''
    region = '{chrom}:{start}-{end}'.format(chrom=chrom, start=start, end=end)
    reader = VCF(vcf)
    try:
        # count lazily: no need to hold every record in memory at once
        return sum(1 for _ in reader(region))
    finally:
        # release the underlying file handle even if the query raises
        reader.close()
def count_window(vcf, chrom, start, end):
    '''
    (str, str, int, int) -> int
    helper function for get_density that counts SNPs in a given window

    Opens vcf with VCF(), runs a region query for 'chrom:start-end' and
    returns how many records come back.
    '''
    region = '{chrom}:{start}-{end}'.format(chrom=chrom, start=start, end=end)
    reader = VCF(vcf)
    try:
        # tally records as they stream past rather than building a list
        snp_count = sum(1 for _ in reader(region))
    finally:
        # always give the file handle back, including on a failed query
        reader.close()
    return snp_count
def count_window_snps(vcf, start, read_length, insert_size, chrom):
    '''
    (str, int, int, int, str) -> int, int
    helper function for parse_windows below
    counts SNPs in both left and right read given
    start point + read_length + ins size

    The left window is start .. start + read_length; the right window
    begins insert_size after the end of the left window and is also
    read_length long. Returns (left_count, right_count).
    '''
    left_start = start
    left_end = left_start + read_length
    right_start = left_end + insert_size
    right_end = right_start + read_length

    range_str = '{0}:{1}-{2}'
    left_region = range_str.format(chrom, left_start, left_end)
    right_region = range_str.format(chrom, right_start, right_end)

    vcfin = VCF(vcf)
    try:
        # both windows are queried through the same open reader
        left_count = sum(1 for _ in vcfin(left_region))
        right_count = sum(1 for _ in vcfin(right_region))
    finally:
        # release the file handle even if one of the queries raises
        vcfin.close()
    return left_count, right_count
def vcf_to_fasta_markers(args):
    '''
    (namespace) -> generator of bytes
    streams the reference FASTA followed by one FASTA record per
    sample / haplotype / contig with that haplotype's ALT alleles spliced in

    Attributes read from args: samples, nsamples, vcf, fasta, save_fasta,
    out, bytes, endian.

    Side effects:
      * a marker is written through write_marker() to '<out>.markers' for
        every ALT allele that is applied
      * when args.save_fasta is set, everything that is yielded is also
        written (as text) to '<out>.from_vcf.fa'
      * overlapping records are skipped with a warning on stderr

    The FASTA input is rewound with seek(0) after every pass, so it has to
    be seekable (a pipe on stdin is not).

    All files opened here are closed when the generator finishes, is
    closed early, or raises.
    '''
    if args.samples:
        # one sample name per line; close the list file as soon as it is read
        with open(args.samples) as samples_file:
            samples = samples_file.read().strip().split("\n")
    elif args.nsamples:
        samples = get_samples_from_bcf(args.vcf)[:args.nsamples]
    else:
        samples = get_samples_from_bcf(args.vcf)

    fasta_in = sys.stdin if args.fasta == "-" else open(args.fasta)
    fasta_out = None
    marker_out = None
    try:
        if args.save_fasta:
            fasta_out = open(args.out + ".from_vcf.fa", "w")
        marker_out = open(args.out + ".markers", "wb")

        def emit(text):
            # mirror the text into the optional FASTA copy, then hand back
            # the bytes that go to the consumer of this generator
            if fasta_out is not None:
                fasta_out.write(text)
            return text.encode()

        # running count of sequence characters emitted so far (header lines
        # and newlines are not counted); passed to write_marker()
        pos = 0

        # First, feed the reference sequence to the thing:
        for name, seq, _ in readfq(fasta_in):
            pos += len(seq)
            yield emit(">{}\n{}\n".format(name, seq))
        fasta_in.seek(0)

        # Then feed in each haplotype
        for sample in samples:
            for hap in range(2):  # process two haplotypes
                for name, seq, _ in readfq(fasta_in):
                    # reference span [prec_start, prec_end) covered by the
                    # last record that was applied on this contig
                    prec_start = 0
                    prec_end = 0
                    yield emit(">{}.{}.{}\n".format(sample, hap + 1, name))

                    bcf = VCF(args.vcf)
                    try:
                        bcf.set_samples([sample])
                        for rec in bcf(name):
                            gts = rec.genotypes[0]
                            if len(gts) != 3:
                                die("error: number of genotypes for {} at marker {} is not 2!"
                                    .format(sample, rec.ID))
                            allele = gts[hap]
                            if allele <= 0:
                                # 0 is the reference allele; a negative
                                # value is a missing call and must not be
                                # used to index rec.ALT from the end
                                continue
                            if prec_start <= rec.start < prec_end:
                                sys.stderr.write(
                                    "warning: skipping {} at {}:{} because it overlaps a previous marker\n"
                                    .format(rec.ID, rec.CHROM, rec.start))
                                continue

                            alt = rec.ALT[allele - 1]
                            pos += rec.start - prec_end  # ffwd current position
                            if len(rec.REF) >= len(alt):  # del and snp
                                # NOTE(review): only one position is
                                # advanced here although the whole ALT
                                # string is emitted below; exact when ALT
                                # is a single base - confirm for longer ALT
                                write_marker(marker_out, pos, rec.start,
                                             allele, args.bytes, args.endian)
                                pos += 1
                            else:  # insertion
                                ins_size = len(alt) - len(rec.REF) + 1
                                for _ in range(ins_size):
                                    write_marker(marker_out, pos, rec.start,
                                                 allele, args.bytes,
                                                 args.endian)
                                    pos += 1

                            # untouched reference up to the record, then ALT
                            yield emit(seq[prec_end:rec.start] + alt)
                            prec_start = rec.start
                            # prec_end skips over to position just after record
                            prec_end = rec.start + len(rec.REF)
                    finally:
                        bcf.close()

                    # remainder of the contig after the last applied record
                    pos += len(seq) - prec_end
                    yield emit(seq[prec_end:] + "\n")
                fasta_in.seek(0)
    finally:
        if fasta_out is not None:
            fasta_out.close()
        if marker_out is not None:
            marker_out.close()
        fasta_in.close()