Example #1
0
def count_gts(sample_list):
    """Count genotype classes per variant, skipping chrX and chrY.

    The VCF path is taken from ``sys.argv[1]``. The reader is restricted to
    ``sample_list`` so the per-variant counts only reflect those samples.

    Args:
        sample_list: sample names handed to ``VCF.set_samples``.

    Returns:
        pandas.DataFrame with one row per retained variant and the columns
        "SV" (variant ID), "ref", "het", "alt" and "missing".
    """
    skipped_chroms = ("chrX", "chrY")

    ids = []
    hets = []
    refs = []
    alts = []
    missing = []

    vcf = VCF(sys.argv[1])
    try:
        vcf.set_samples(sample_list)

        # count # of samples of each genotype for every variant
        for variant in vcf:
            if variant.CHROM in skipped_chroms:
                continue
            ids.append(variant.ID)
            hets.append(variant.num_het)
            refs.append(variant.num_hom_ref)
            alts.append(variant.num_hom_alt)
            missing.append(variant.num_unknown)
    finally:
        # Release the underlying file handle even if iteration fails.
        vcf.close()

    return pd.DataFrame({
        "SV": ids,
        "ref": refs,
        "het": hets,
        "alt": alts,
        "missing": missing,
    })
Example #2
0
def count_gts_xchr(sample_list, sex):
    """Count genotype classes per chrX variant for the given samples.

    The VCF path is taken from ``sys.argv[1]``. Only variants whose CHROM is
    exactly "chrX" are counted.

    Args:
        sample_list: sample names handed to ``VCF.set_samples``.
        sex: when equal to "male", het counts are folded into the alt counts
            and the "het" column is dropped; any other value keeps the
            ref/het/alt split.

    Returns:
        pandas.DataFrame. For ``sex == "male"`` the columns are
        "SV", "ref", "alt", "missing"; otherwise they are
        "ref", "het", "alt", "missing".
    """
    ids = []
    hets = []
    refs = []
    alts = []
    missing = []

    vcf = VCF(sys.argv[1])
    try:
        vcf.set_samples(sample_list)

        # count # of samples of each genotype for every variant
        for variant in vcf:
            if variant.CHROM != "chrX":  # only look at variants on X chromosome
                continue
            ids.append(variant.ID)
            hets.append(variant.num_het)
            refs.append(variant.num_hom_ref)
            alts.append(variant.num_hom_alt)
            missing.append(variant.num_unknown)
    finally:
        # Release the underlying file handle even if iteration fails.
        vcf.close()

    df = pd.DataFrame({
        "SV": ids,
        "ref": refs,
        "het": hets,
        "alt": alts,
        "missing": missing,
    })

    if sex == "male":
        # incorporate het samples into alt allele calls
        df["alt"] = df["het"] + df["alt"]
        return df[["SV", "ref", "alt", "missing"]]

    # NOTE(review): unlike the male branch, this result has no "SV" column.
    # Kept as-is because callers may rely on it -- confirm whether intended.
    return df[["ref", "het", "alt", "missing"]]
Example #3
0
def test_set_samples():
    """Restricting the reader to one sample leaves one sample and one genotype."""
    reader = VCF(VCF_PATH)
    total = len(reader.samples)
    assert total == 189, total

    chosen = reader.samples[2]
    reader.set_samples([chosen])
    assert len(reader.samples) == 1

    first_variant = next(reader)
    assert len(first_variant.gt_types) == 1
Example #4
0
def test_set_samples():
    """Restricting the reader to one sample leaves one sample and one genotype."""
    reader = VCF(VCF_PATH)
    total = len(reader.samples)
    assert total == 189, total

    chosen = reader.samples[2]
    reader.set_samples([chosen])
    assert len(reader.samples) == 1

    first_variant = next(reader)
    assert len(first_variant.gt_types) == 1
Example #5
0
def get_call_rate_Ychr(sample_list):
    """Collect the call rate of every chrY variant for the given samples.

    The VCF is opened from the module-level ``path_to_vcf`` and restricted to
    ``sample_list`` before iterating.

    Args:
        sample_list: sample names handed to ``VCF.set_samples``.

    Returns:
        pandas.DataFrame with one row per chrY variant and the columns
        "SV" (variant ID) and "call_rate".
    """
    ids = []
    call_rate = []

    vcf = VCF(path_to_vcf)
    try:
        vcf.set_samples(sample_list)

        for variant in vcf:
            if variant.CHROM != "chrY":  # only look at variants on Y chromosome
                continue
            ids.append(variant.ID)
            call_rate.append(variant.call_rate)
    finally:
        # Release the underlying file handle even if iteration fails.
        vcf.close()

    return pd.DataFrame({
        "SV": ids,
        "call_rate": call_rate,
    })
Example #6
0
        if self.freqs[base] != 0.0:
            print(self.sample + " Already has a a freq of " +
                  str(self.freqs[base]) + " at " + self.chrom + ' ' +
                  str(self.pos))
        else:
            self.freqs[base] = freq

    def print_var(self):
        """Print sample, chrom, pos, ref, depth and freqs on one line."""
        fields = (
            self.sample,
            self.chrom,
            self.pos,
            self.ref,
            self.depth,
            self.freqs,
        )
        print(*fields)


# Maps "<sample><chrom><pos>" keys to _var records; filled in by the loop below.
variants = {}
for sample in samples:
    # The VCF is re-opened for every sample and restricted to that sample.
    vcf = VCF(path)
    vcf.set_samples(sample)
    for var in vcf:
        # First row of the DP / AD FORMAT fields
        # (presumably the single selected sample -- TODO confirm).
        dp = var.format("DP")[0]
        ad = var.format("AD")[0]
        alts = var.ALT
        # bases = REF followed by every ALT allele, in order.
        bases = [var.REF]
        for alt in alts:
            bases.append(alt)
        name = sample + str(var.CHROM) + str(var.POS)
        # -2147483648 is the minimum int32; presumably the missing-value
        # sentinel for DP -- TODO confirm.
        if dp[0] != -2147483648:
            i = 0
            # NOTE(review): `i` is never advanced in the visible code, so as
            # written this loop would not terminate; the snippet looks
            # truncated -- confirm against the full source.
            while i < len(bases):
                # Create the record on first sight of this sample/position and
                # store the AD value for bases[i] as its frequency.
                if name not in variants:
                    variants[name] = _var(sample, var.CHROM, var.POS, var.REF,
                                          dp[0])
                    variants[name].set_freq(bases[i], ad[i])
Example #7
0
def vcf_to_fasta_markers(args):
    """Yield FASTA text (as bytes) for the reference and each sample haplotype.

    This is a generator. It first yields every record of the input FASTA
    unchanged, then, for every sample and for haplotype indices 0 and 1,
    yields a header line followed by the sequence with the sample's non-zero
    alleles substituted in. While doing so it calls ``write_marker`` for each
    substituted position, writing to ``args.out + ".markers"``.

    Attributes read from ``args``: ``samples``, ``nsamples``, ``vcf``,
    ``fasta``, ``save_fasta``, ``out``, ``bytes``, ``endian``.

    Yields:
        bytes: encoded chunks of FASTA text (headers and sequence pieces).
    """
    # Sample list: one name per line of args.samples, or the first
    # args.nsamples / all samples reported by get_samples_from_bcf.
    if args.samples:
        samples = open(args.samples).read().strip().split("\n")
    elif args.nsamples:
        samples = get_samples_from_bcf(args.vcf)[:args.nsamples]
    else:
        samples = get_samples_from_bcf(args.vcf)
    # "-" selects standard input as the FASTA source.
    if args.fasta == "-":
        fasta_in = sys.stdin
    else:
        fasta_in = open(args.fasta)
    # Running position across everything emitted so far; passed to write_marker.
    pos = 0
    # fasta_out only exists when args.save_fasta is set; every later use is
    # guarded by the same flag.
    if args.save_fasta:
        fasta_out = open(args.out + ".from_vcf.fa", "w")
    marker_out = open(args.out + ".markers", "wb")
    # First, feed the reference sequence to the thing:
    for name, seq, qual in readfq(fasta_in):
        pos += len(seq)
        to_write = ">{}\n{}\n".format(name, seq)
        if args.save_fasta:
            fasta_out.write(to_write)
        yield to_write.encode()
    # Rewind so the FASTA can be read again for the haplotypes.
    # NOTE(review): when fasta_in is sys.stdin fed by a pipe, seek() may not
    # be supported -- confirm.
    fasta_in.seek(0)
    # Then feed in each haplotype
    for sample in samples:
        for i in range(2):  # process two haplotypes
            for name, seq, qual in readfq(fasta_in):
                # [prec_start, prec_end) is the span of the last applied record.
                prec_end = 0
                prec_start = 0
                to_write = ">{}.{}.{}\n".format(sample, i + 1, name)
                if args.save_fasta:
                    fasta_out.write(to_write)
                yield to_write.encode()
                # A fresh reader per (sample, haplotype, contig), restricted
                # to the current sample and queried by the FASTA record name.
                bcf = VCF(args.vcf)
                bcf.set_samples([sample])
                # recs = bcf.__call__(contig)
                recs = bcf.__call__(name)
                for rec in recs:
                    gts = rec.genotypes[0]
                    # Expects exactly three entries per genotype (presumably
                    # two alleles plus a phasing flag -- TODO confirm).
                    if (len(gts) != 3):
                        die("error: number of genotypes for {} at marker {} is not 2!"
                            .format(sample, rec.ID))
                    # Only act when this haplotype's allele index is non-zero.
                    # NOTE(review): a negative allele index would also be
                    # truthy here and would index rec.ALT from the end --
                    # confirm how missing genotypes are encoded.
                    if gts[i]:
                        # Skip records starting inside the previously applied one.
                        if rec.start in range(prec_start, prec_end):
                            sys.stderr.write(
                                "warning: skipping {} at {}:{} because it overlaps a previous marker\n"
                                .format(rec.ID, rec.CHROM, rec.start))
                            continue
                        pos += rec.start - prec_end  # ffwd current position
                        if len(rec.REF) >= len(
                                rec.ALT[gts[i] - 1]):  # del and snp
                            # One marker for a same-length or shorter ALT.
                            write_marker(marker_out, pos, rec.start, gts[i],
                                         args.bytes, args.endian)
                            pos += 1
                        elif len(rec.REF) < len(rec.ALT[gts[i] - 1]):
                            # insertion
                            # One marker per position of the length difference plus one.
                            ins_size = len(rec.ALT[gts[i] - 1]) - len(
                                rec.REF) + 1
                            for j in range(ins_size):
                                write_marker(marker_out, pos, rec.start,
                                             gts[i], args.bytes, args.endian)
                                pos += 1
                        # to_write = str(seq[prec_end:rec.start]) + str(rec.ALT[gts[i]-1])
                        # Emit the untouched sequence since the last record,
                        # followed by the selected ALT allele.
                        to_write = seq[prec_end:rec.start] + rec.ALT[gts[i] -
                                                                     1]
                        if args.save_fasta:
                            fasta_out.write(to_write)
                        yield to_write.encode()
                        prec_start = rec.start
                        # prec_end skips over to position just after record
                        prec_end = rec.start + len(rec.REF)
                bcf.close()
                # Emit whatever remains of the contig after the last record.
                pos += len(seq) - prec_end
                to_write = seq[prec_end:] + "\n"
                if args.save_fasta:
                    fasta_out.write(to_write)
                yield to_write.encode()
            # Rewind the FASTA for the next haplotype / sample.
            fasta_in.seek(0)
    # These closes only run if the generator is consumed to the end.
    if args.save_fasta:
        fasta_out.close()
    marker_out.close()
    fasta_in.close()