def get_tract_snps(vcf, chrom, start, end):
    ''' (str, str, int, int) -> int
    helper function that returns # of variants in given window

    vcf is handed to VCF(); chrom, start and end are formatted into the
    region query string 'chrom:start-end'.

    every record yielded for that region is counted - no filtering on
    variant type is applied here.
    '''
    region = '{chrom}:{start}-{end}'.format(chrom=chrom, start=start, end=end)
    v = VCF(vcf)
    try:
        # count lazily rather than building a throwaway list of records
        return sum(1 for _ in v(region))
    finally:
        # release the underlying file handle even if the query raises
        v.close()
Example #2
0
def count_window(vcf, chrom, start, end):
    ''' (str, str, int, int) -> int
    helper function for get_density that counts SNPs
    in a given window

    the window is queried as the region string 'chrom:start-end' and the
    return value is the number of records the query yields (records are
    not filtered by type before counting).
    '''
    region = '{chrom}:{start}-{end}'.format(chrom=chrom, start=start, end=end)
    reader = VCF(vcf)
    try:
        # tally records one at a time; nothing needs to be kept in memory
        snp_count = sum(1 for _ in reader(region))
    finally:
        # always close the reader, including when the region query fails
        reader.close()
    return snp_count
Example #3
0
def count_window_snps(vcf, start, read_length, insert_size, chrom):
    ''' (str, int, int, int, str) -> int, int
    helper function for parse_windows below
    counts SNPs in both left and right read given start point + read_length + ins size

    left window  : start .. start + read_length
    right window : begins insert_size past the end of the left window and
                   spans read_length as well

    returns (left_count, right_count), the number of records yielded by
    the region query for each window.
    '''
    left_end = start + read_length
    right_start = left_end + insert_size
    right_end = right_start + read_length

    range_str = '{0}:{1}-{2}'
    left_region = range_str.format(chrom, start, left_end)
    right_region = range_str.format(chrom, right_start, right_end)

    vcfin = VCF(vcf)
    try:
        # one reader serves both queries; records are only counted
        left_count = sum(1 for _ in vcfin(left_region))
        right_count = sum(1 for _ in vcfin(right_region))
    finally:
        # make sure the handle is released even if a query raises
        vcfin.close()

    return left_count, right_count
Example #4
0
def vcf_to_fasta_markers(args):
    """Yield encoded FASTA text for the reference, then for each sample haplotype.

    This is a generator. It first yields every sequence read from the input
    FASTA unchanged, then, for each sample and each of two haplotypes,
    re-reads the FASTA and yields each sequence with the haplotype's ALT
    alleles substituted in. For every substituted allele, ``write_marker``
    is called on the ``<args.out>.markers`` file. When ``args.save_fasta``
    is set, everything yielded is also written to ``<args.out>.from_vcf.fa``.

    Attributes read from ``args``:
        samples:    path of a file with one sample name per line (optional).
        nsamples:   if ``samples`` is unset, use only the first ``nsamples``
                    names returned by ``get_samples_from_bcf``.
        vcf:        variant file passed to ``VCF`` / ``get_samples_from_bcf``.
        fasta:      input FASTA path, or ``"-"`` for ``sys.stdin``.
        save_fasta: whether to also write the generated FASTA to disk.
        out:        prefix for the output files.
        bytes, endian: forwarded unchanged to ``write_marker``.

    Yields:
        bytes: encoded chunks of FASTA text (headers and sequence pieces).

    Alleles that are reference (0) or missing are left as reference; cyvcf2
    reports a missing allele as -1, which must not be used to index ALT.

    NOTE(review): ``fasta_in.seek(0)`` is called even when reading from
    stdin, which fails on a non-seekable stream - confirm how "-" is used.
    """
    if args.samples:
        # Close the sample list as soon as it has been read.
        with open(args.samples) as samples_file:
            samples = samples_file.read().strip().split("\n")
    elif args.nsamples:
        samples = get_samples_from_bcf(args.vcf)[:args.nsamples]
    else:
        samples = get_samples_from_bcf(args.vcf)

    fasta_in = sys.stdin if args.fasta == "-" else open(args.fasta)
    fasta_out = None
    marker_out = None
    # try/finally so the files are released even if the consumer stops
    # iterating early or an error is raised part-way through.
    try:
        if args.save_fasta:
            fasta_out = open(args.out + ".from_vcf.fa", "w")
        marker_out = open(args.out + ".markers", "wb")

        # Running position handed to write_marker; advanced by the length
        # of every sequence piece accounted for below.
        pos = 0

        # First, feed the reference sequence to the thing:
        for name, seq, _ in readfq(fasta_in):
            pos += len(seq)
            to_write = ">{}\n{}\n".format(name, seq)
            if fasta_out is not None:
                fasta_out.write(to_write)
            yield to_write.encode()
        fasta_in.seek(0)

        # Then feed in each haplotype
        for sample in samples:
            for i in range(2):  # process two haplotypes
                for name, seq, _ in readfq(fasta_in):
                    # [prec_start, prec_end) is the reference span covered
                    # by the last marker applied to this sequence.
                    prec_start = 0
                    prec_end = 0
                    to_write = ">{}.{}.{}\n".format(sample, i + 1, name)
                    if fasta_out is not None:
                        fasta_out.write(to_write)
                    yield to_write.encode()

                    bcf = VCF(args.vcf)
                    try:
                        bcf.set_samples([sample])
                        for rec in bcf(name):
                            gts = rec.genotypes[0]
                            # A diploid genotype is [allele, allele, phased].
                            if len(gts) != 3:
                                die("error: number of genotypes for {} at marker {} is not 2!"
                                    .format(sample, rec.ID))
                            allele = gts[i]
                            if allele <= 0:
                                # Reference (0) or missing (-1): nothing to
                                # substitute on this haplotype.
                                continue
                            if prec_start <= rec.start < prec_end:
                                sys.stderr.write(
                                    "warning: skipping {} at {}:{} because it overlaps a previous marker\n"
                                    .format(rec.ID, rec.CHROM, rec.start))
                                continue
                            alt = rec.ALT[allele - 1]
                            pos += rec.start - prec_end  # ffwd current position
                            if len(rec.REF) >= len(alt):  # del and snp
                                write_marker(marker_out, pos, rec.start, allele,
                                             args.bytes, args.endian)
                                pos += 1
                            else:
                                # insertion: one marker per extra base, plus one
                                ins_size = len(alt) - len(rec.REF) + 1
                                for _ in range(ins_size):
                                    write_marker(marker_out, pos, rec.start,
                                                 allele, args.bytes, args.endian)
                                    pos += 1
                            # Unmodified reference up to the record, then the ALT.
                            to_write = seq[prec_end:rec.start] + alt
                            if fasta_out is not None:
                                fasta_out.write(to_write)
                            yield to_write.encode()
                            prec_start = rec.start
                            # prec_end skips over to position just after record
                            prec_end = rec.start + len(rec.REF)
                    finally:
                        bcf.close()

                    # Remainder of the sequence after the last applied record.
                    pos += len(seq) - prec_end
                    to_write = seq[prec_end:] + "\n"
                    if fasta_out is not None:
                        fasta_out.write(to_write)
                    yield to_write.encode()
                fasta_in.seek(0)
    finally:
        if fasta_out is not None:
            fasta_out.close()
        if marker_out is not None:
            marker_out.close()
        fasta_in.close()