def prune_allrefs(vcf, fout):
    """
    Write to fout only the records of vcf that show non-reference evidence

    Records for which is_biallelic() is true are kept when the first value of
    INFO:AC is greater than zero. All other records are kept when
    INFO:CN_NONREF_FREQ is greater than zero.
    """
    for record in vcf.fetch():
        # Pick the INFO field that matches the record type
        if is_biallelic(record):
            has_nonref = record.info['AC'][0] > 0
        else:
            has_nonref = record.info['CN_NONREF_FREQ'] > 0

        if has_nonref:
            fout.write(record)
def drop_nonref_gts(vcf, fout):
    """
    Write to fout each record of vcf that has at least one non-reference sample

    For records where is_biallelic() is true, a sample counts as non-reference
    when its FORMAT:GT is not one of the null/reference genotypes listed below.
    For all other records, a sample counts when its FORMAT:CN differs from 2.
    Each retained record is written exactly once.
    """
    # Genotypes that do not count as evidence for the variant
    null_gts = [(0, 0), (None, None), (0, ), (None, ), (None, 2)]
    sample_ids = list(vcf.header.samples)

    for record in vcf.fetch():
        for sample in sample_ids:
            if is_biallelic(record):
                is_carrier = record.samples[sample]['GT'] not in null_gts
            else:
                is_carrier = record.samples[sample]['CN'] != 2

            # A single carrier is sufficient; stop scanning once one is found
            if is_carrier:
                fout.write(record)
                break
def calc_allele_freq(record, samples, prefix=None, hemi=False):
    """
    Computes allele frequencies for a single record based on a list of samples

    Records for which svu.is_biallelic() is true are summarized from FORMAT:GT
    (AN, AC, AF plus genotype counts and frequencies). All other records are
    summarized from FORMAT:CN (CN_NUMBER, CN_COUNT, CN_FREQ, CN_NONREF_COUNT,
    CN_NONREF_FREQ). Every key is written to record.info and is prepended with
    prefix + '_' when a prefix is supplied.

    When hemi is set, AN is halved, AC becomes the number of genotypes with
    exactly one or two non-reference alleles, the HEMIREF/HEMIALT fields are
    added, and copy number 1 (rather than 2) is treated as the reference state.

    Returns the same record object after annotation.
    """
    # All INFO keys written by this call share one optional prefix
    tag = prefix + '_' if prefix else ''

    if svu.is_biallelic(record):
        # Biallelic sites: tally called alleles and genotype classes from GT
        genotypes = [record.samples[s]['GT'] for s in samples]

        allele_number = 0
        allele_count = 0
        n_homref = 0
        n_het = 0
        n_homalt = 0
        n_alt_gts = 0  # het + hom-alt genotypes; becomes AC at hemizygous sites

        for gt in genotypes:
            called = [a for a in gt if a != '.' and a is not None]
            n_ref = sum(1 for a in called if a == 0)
            n_alt = sum(1 for a in called if a != 0)

            allele_number += len(called)
            allele_count += n_alt

            if gt == (0, 0):
                n_homref += 1
            if n_ref == 1 and n_alt == 1:
                n_het += 1
                n_alt_gts += 1
            if n_alt == 2:
                n_homalt += 1
                n_alt_gts += 1

        # Adjust hemizygous allele number and allele count, if optioned
        if hemi:
            allele_number = allele_number / 2
            # AC must count non-reference *genotypes*, not alleles
            allele_count = n_alt_gts

        if allele_number > 0:
            allele_freq = round(allele_count / allele_number, 6)
        else:
            allele_freq = 0

        # Genotype frequencies over the hom-ref / het / hom-alt genotypes
        genotype_counts = (n_homref, n_het, n_homalt)
        n_bi_genos = sum(genotype_counts)
        if n_bi_genos > 0:
            freq_homref, freq_het, freq_homalt = [
                n / n_bi_genos for n in genotype_counts
            ]
        else:
            freq_homref = freq_het = freq_homalt = 0

        # Assemble INFO updates in the order they must be written
        updates = [
            ('AN', allele_number),
            ('AC', allele_count),
            ('AF', allele_freq),
            ('N_BI_GENOS', n_bi_genos),
        ]
        if hemi:
            updates += [
                ('N_HEMIREF', n_homref),
                ('N_HEMIALT', n_alt_gts),
                ('FREQ_HEMIREF', freq_homref),
                ('FREQ_HEMIALT', freq_het + freq_homalt),
            ]
        updates += [
            ('N_HOMREF', n_homref),
            ('N_HET', n_het),
            ('N_HOMALT', n_homalt),
            ('FREQ_HOMREF', freq_homref),
            ('FREQ_HET', freq_het),
            ('FREQ_HOMALT', freq_homalt),
        ]

    else:
        # Multiallelic sites reference FORMAT:CN rather than GT
        missing = ('.', 'NA')
        raw_cns = [record.samples[s]['CN'] for s in samples]
        copy_numbers = [
            cn for cn in raw_cns if cn is not None and cn not in missing
        ]

        if len(copy_numbers) == 0:
            n_called = n_nonref = nonref_freq = 0
            cn_dist = (0, )
            cn_freqs = (0, )
        else:
            per_cn = dict(Counter(copy_numbers))
            n_called = len(copy_numbers)

            # Enumerate counts/frequencies for every copy state from 0 up to
            # the largest one observed
            max_cn = max(int(cn) for cn in per_cn)
            cn_dist = [int(per_cn.get(cn, 0)) for cn in range(max_cn + 1)]
            cn_freqs = [round(n / n_called, 6) for n in cn_dist]

            # Everything except the reference copy state counts as non-ref
            ref_cn = 1 if hemi else 2
            n_nonref = sum(n for cn, n in enumerate(cn_dist) if cn != ref_cn)
            nonref_freq = round(n_nonref / n_called, 6)

        updates = [
            ('CN_NUMBER', n_called),
            ('CN_COUNT', tuple(cn_dist)),
            ('CN_FREQ', tuple(cn_freqs)),
            ('CN_NONREF_COUNT', n_nonref),
            ('CN_NONREF_FREQ', nonref_freq),
        ]

    # Add values to INFO field
    for field, value in updates:
        record.info[tag + field] = value

    return record
def gather_allele_freqs(record, samples, males_set, females_set, parbt,
                        pop_dict, pops, sex_chroms, no_combos=False):
    """
    Wrapper to compute allele frequencies for all sex & population pairings

    Annotates record in place by calling calc_allele_freq() for all samples,
    for males and females (when those sets are non-empty), for each population
    in pops, and, unless no_combos is set, for each population/sex pairing.
    Male subsets are computed with hemi=True when the record's chromosome is
    in sex_chroms and the record is not in a PAR according to
    in_par(record, parbt). On such records, update_sex_freqs() is also called
    when the record is biallelic and at least one sex set is non-empty. When
    pops is non-empty and the record is biallelic, INFO:POPMAX_AF is set to the
    largest per-population AF.

    Returns the same record object.
    """
    on_sex_chrom = record.chrom in sex_chroms

    # Add PAR annotation to record (if optioned)
    rec_in_par = bool(
        on_sex_chrom and len(parbt) > 0 and in_par(record, parbt))
    if rec_in_par:
        record.info['PAR'] = True

    # Males are hemizygous on sex chromosomes outside of PARs
    hemizygous = on_sex_chrom and not rec_in_par
    have_males = len(males_set) > 0
    have_females = len(females_set) > 0

    def needs_sex_adjustment():
        # Only biallelic, non-PAR sex-chromosome records with sex info
        return (hemizygous and svu.is_biallelic(record)
                and (have_males or have_females))

    # Get allele frequencies across all samples, then per sex
    calc_allele_freq(record, samples)
    if have_males:
        calc_allele_freq(record, males_set, prefix='MALE', hemi=hemizygous)
    if have_females:
        calc_allele_freq(record, females_set, prefix='FEMALE')

    # Adjust global allele frequencies on sex chromosomes, if famfile provided
    if needs_sex_adjustment():
        update_sex_freqs(record)

    # Get allele frequencies per population
    for pop in pops:
        pop_samps = [s for s in samples if pop_dict.get(s, None) == pop]
        calc_allele_freq(record, pop_samps, prefix=pop)

        if not no_combos:
            if have_males:
                pop_males = [s for s in pop_samps if s in males_set]
                calc_allele_freq(
                    record, pop_males, prefix=pop + '_MALE', hemi=hemizygous)
            if have_females:
                pop_females = [s for s in pop_samps if s in females_set]
                calc_allele_freq(record, pop_females, prefix=pop + '_FEMALE')

        # Adjust per-pop allele frequencies on sex chromosomes, if famfile provided
        if needs_sex_adjustment():
            update_sex_freqs(record, pop=pop)

    # Get POPMAX AF for biallelic sites only
    if len(pops) > 0 and svu.is_biallelic(record):
        record.info['POPMAX_AF'] = max(
            record.info['{0}_AF'.format(pop)][0] for pop in pops)

    return record
def cleanup_vcf(vcf, fout, callrates, min_callrate_global=0.85,
                min_callrate_smallDels=0.95):
    """
    Clean the FILTER, INFO and QUAL fields of each record in vcf and write
    the retained records to fout

    For every record:
      * each tag in filts_for_info that is present in FILTER is also set as a
        True INFO flag;
      * FILTER is rebuilt without the tags in filts_to_remove, falling back to
        PASS when nothing remains;
      * if the record ID has an entry in callrates, LOW_CALL_RATE is added
        when that call rate is below min_callrate_smallDels (DEL records with
        300 < SVLEN < 1000) or below min_callrate_global (everything else);
      * biallelic records get record.qual replaced by recal_qual_score()
        whenever that returns something other than None.

    Biallelic records are written only if at least one sample has a GT outside
    NULL_and_REF_GTs; all other records are always written.
    """
    for record in vcf:
        # Move several filters from FILTER to INFO
        for tag in filts_for_info:
            if tag in record.filter:
                record.info[tag] = True

        # Rebuild FILTER without the tags slated for removal
        retained = [tag for tag in record.filter if tag not in filts_to_remove]
        record.filter.clear()
        for tag in retained:
            record.filter.add(tag)
        if len(record.filter) == 0:
            record.filter.add('PASS')

        # Mark sites with a low call rate; small (300bp-1kb) deletions are
        # held to their own threshold, all other variants to the global one
        if record.id in callrates:
            callrate = callrates[record.id]
            is_small_del = (record.info['SVTYPE'] == 'DEL'
                            and 300 < record.info['SVLEN'] < 1000)
            if is_small_del:
                min_callrate = min_callrate_smallDels
            else:
                min_callrate = min_callrate_global
            if callrate < min_callrate:
                record.filter.add('LOW_CALL_RATE')

        # Recalibrate QUAL score for biallelic variants
        if is_biallelic(record):
            rescored = recal_qual_score(record)
            if rescored is not None:
                record.qual = rescored

        # Only biallelic variants need a non-empty genotype to be kept
        if not is_biallelic(record) or any(
                record.samples[s]['GT'] not in NULL_and_REF_GTs
                for s in record.samples):
            fout.write(record)