def get_filtered_phased_het_trio_variants(trio_vcf, trio_filtered_het_phased_vcf, sample_name):
    """Write the PASS, phased, heterozygous records of one sample to a new VCF.

    The input is subset to ``sample_name``. A record is kept when its first
    FILTER entry is ``PASS`` and the sample at index 0 is phased with
    different first and second GT alleles. Kept records get the sample's
    ``PS`` FORMAT value set to 1 before being written.

    Args:
        trio_vcf: Path of the input variant file.
        trio_filtered_het_phased_vcf: Path of the output variant file; it is
            created with the (subset) input header.
        sample_name: Name of the sample to keep.

    Returns:
        Always 0.
    """
    # Context managers guarantee the output is flushed and both handles are
    # closed even if a record fails to process.
    with VariantFile(trio_vcf) as vcf_in:
        vcf_in.subset_samples([sample_name])
        with VariantFile(trio_filtered_het_phased_vcf, 'w', header=vcf_in.header) as vcf_out:
            for rec in vcf_in.fetch():
                # A record with no FILTER entry has no keys; treat it as
                # "not PASS" instead of failing on keys()[0].
                filters = list(rec.filter.keys())
                if not filters or filters[0] != 'PASS':
                    continue

                rec_sample = rec.samples[0]
                if not rec_sample.phased:
                    continue

                gt = rec_sample['GT']
                # A single-allele (haploid) call cannot be heterozygous.
                if len(gt) < 2 or gt[0] == gt[1]:
                    continue

                # NOTE(review): presumably the input header declares a PS
                # FORMAT field -- confirm against the VCFs this is run on.
                rec_sample.update({'PS': 1})
                vcf_out.write(rec)
    return 0
def _classify_alleles(alleles):
    """Return "U" if <NON_REF> is present, "I" for any multi-base or "*" allele, else "S"."""
    if "<NON_REF>" in alleles:
        return "U"
    if "*" in alleles or any(len(allele) > 1 for allele in alleles):
        return "I"
    return "S"


def _depth_and_min_allele_fraction(sample_fields):
    """Return (DP, min(AD) / DP) for one sample call, using None for unavailable values."""
    try:
        dp = sample_fields["DP"]
    except KeyError:
        # Without DP neither column can be filled; return a None for each so
        # the caller keeps every column the same length.
        return None, None
    try:
        min_af = float(min(sample_fields["AD"]) / dp)
    except (KeyError, TypeError, ValueError, ZeroDivisionError):
        # AD absent, missing (None) values in AD or DP, empty AD, or DP == 0.
        min_af = None
    return dp, min_af


def read_vcf(vcf, sample):
    """Reads a VCF and fetches relevant information to a Pandas DataFrame.

    Only records where the sample's GT holds more than one distinct allele
    index are kept (records with a single distinct index are skipped as
    homozygous). Progress is printed every 200000 records.

    Args:
        vcf: Path of the variant file; the format is auto-detected.
        sample: Name of the sample whose calls are read.

    Returns:
        A DataFrame with one row per kept record and the columns
        CHROM, POS, TYPE, DP, MAF, GT, QUAL and ALS, where:
          * ALS is the list of record alleles whose index appears in GT;
          * TYPE is "U" when ALS contains ``<NON_REF>``, "I" when any allele
            in ALS is longer than one character or is ``*``, otherwise "S";
          * MAF is min(AD) / DP;
          * DP and MAF are None when they cannot be obtained.
    """
    columns = ("CHROM", "POS", "TYPE", "DP", "MAF", "GT", "QUAL", "ALS")
    sites = {name: [] for name in columns}

    with VariantFile(vcf) as vcf_in:  # auto-detect input format
        vcf_in.subset_samples([sample])
        for i, rec in enumerate(vcf_in):
            if i % 200000 == 0:
                print("Elapsed records: {}".format(i))

            sample_fields = rec.samples[sample]
            gt = sample_fields["GT"]
            if len(set(gt)) == 1:
                continue  # skip homozygous calls

            # Alleles of the record that are actually carried by the sample.
            als = [allele for n, allele in enumerate(rec.alleles) if n in gt]
            dp, min_af = _depth_and_min_allele_fraction(sample_fields)

            sites["CHROM"].append(rec.chrom)
            sites["POS"].append(rec.pos)
            sites["ALS"].append(als)
            sites["GT"].append(gt)
            sites["TYPE"].append(_classify_alleles(als))
            sites["DP"].append(dp)
            sites["MAF"].append(min_af)
            sites["QUAL"].append(rec.qual)

    return pd.DataFrame.from_dict(sites)
def get_hets(vcf, sample):
    """Extract heterozygous STRs and SNPs from the vcf.

    A record is yielded when the sample's first two GT alleles differ and
    either the two corresponding allele strings have equal length (treated
    here as a SNP) or the record ID starts with ``STR_``.

    Args:
        vcf: Path of the variant file.
        sample: Name of the sample whose genotype is inspected.

    Yields:
        The matching variant records, in file order.
    """
    with VariantFile(vcf) as vcf_in:
        vcf_in.subset_samples([sample])
        # iterate through each variant record in the VCF
        # TODO: make sure you're considering cases where the POS is duplicated
        for rec in vcf_in.fetch():
            variant = rec.samples[sample]
            gt = variant['GT']
            # Single-allele or (partially) missing calls cannot be classified
            # as heterozygous; skip them instead of failing on len(None).
            if len(gt) < 2 or gt[0] is None or gt[1] is None:
                continue
            if gt[0] == gt[1]:
                continue

            alleles = variant.alleles
            # Equal allele lengths: no net insertion/deletion between the two.
            is_snp = len(alleles[0]) == len(alleles[1])
            # Records without an ID expose it as None, which has no startswith.
            is_str = rec.id is not None and rec.id.startswith("STR_")

            # limit our analysis to only variants that are heterozygous
            # and they must be either STRs or SNPs
            if is_snp or is_str:
                yield rec
def main(varfile, keep, outprefix):
    """Write a SweepFinder2 allele frequency file for a subset of samples.

    The input VCF must contain no missing genotypes. Sites whose alt allele
    count is 0 are left out of the output. The result is written to
    ``<outprefix>.SF`` as tab-separated ``position``, ``x``, ``n`` and
    ``folded`` columns, with ``folded`` always 1.

    Args:
        varfile: Path of the input variant file.
        keep: Path of a text file with one sample name per line; blank lines
            are ignored.
        outprefix: Prefix of the output file name.
    """
    with VariantFile(varfile) as varin:
        with open(keep) as keep_file:
            # Skip blank lines so they are neither passed on as sample names
            # nor counted in the sample size.
            samples = [name for name in (line.strip() for line in keep_file) if name]
        varin.subset_samples(samples)
        print(f'keep samples:\n{samples}')

        ss = len(samples) * 2  # sample size; two allele copies per sample

        with open(f'{outprefix}.SF', 'w') as f:
            f.write('position\tx\tn\tfolded\n')
            for rec in varin.fetch():
                gts = np.array(
                    [s['GT'] for s in rec.samples.values()], dtype='int8'
                ).flatten()
                # NOTE(review): summing allele indices equals the alt allele
                # count only at biallelic sites -- confirm the input is
                # biallelic.
                ac = np.sum(gts)  # alt allele count
                if ac > 0:
                    f.write(f'{rec.pos}\t{ac}\t{ss}\t1\n')