def test_GetHeterozygosity():
    """Check utils.GetHeterozygosity on a few allele-frequency dictionaries.

    Float results are compared with a tolerance (np.isclose) rather than
    ``==`` so the test does not depend on the exact order of floating-point
    operations inside the implementation.
    """
    # A single allele at frequency 1 gives heterozygosity 0.
    afreqs = {0: 1}
    assert np.isclose(utils.GetHeterozygosity(afreqs), 0)

    # Two equally frequent alleles.
    afreqs = {0: 0.5, 1: 0.5}
    assert np.isclose(utils.GetHeterozygosity(afreqs), 0.5)

    # Three alleles: 1 - (0.5**2 + 0.2**2 + 0.3**2) = 0.62
    afreqs = {0: 0.5, 1: 0.2, 2: 0.3}
    assert np.isclose(utils.GetHeterozygosity(afreqs), 0.62)

    # An empty frequency dictionary is expected to yield NaN.
    afreqs = {}
    assert np.isnan(utils.GetHeterozygosity(afreqs))
def __call__(self, record):
    """Evaluate the heterozygosity of ``record`` against ``self.threshold``.

    The record is harmonized using ``self.vcftype``, its allele frequencies
    are obtained with ``uselength=self.uselength``, and the heterozygosity
    is computed from them.

    Returns the heterozygosity when it is strictly greater than
    ``self.threshold``; otherwise returns None.
    """
    harmonized = trh.HarmonizeRecord(self.vcftype, record)
    freqs = harmonized.GetAlleleFreqs(uselength=self.uselength)
    het = utils.GetHeterozygosity(freqs)
    return het if het > self.threshold else None
def GetHet(trrecord, samplelists=None, uselength=True):
    r"""Compute heterozygosity of a locus

    Heterozygosity is defined as the probability
    that two randomly drawn allele are different.

    Parameters
    ----------
    trrecord: trh.TRRecord object
          The record that we are computing the statistic for
    samplelists: list of list of str, optional
          List of list of the samples that we include when compute the
          statistic. If omitted (None) or empty, a single value is computed
          by passing ``samplelist=None`` to ``trrecord.GetAlleleFreqs``
          (presumably meaning "all samples" - confirm against TRRecord).
          The list passed in is never modified.
    uselength: bool
          Whether we should collapse alleles by length

    Returns
    -------
    heterozygosity: list of float
          The heterozygosity of the locus. One value for each sample list.
          If the allele frequencies dictionary is invalid, return np.nan
    """
    # Build a local list instead of appending to the argument: the previous
    # implementation mutated both the shared default and caller-owned lists.
    if samplelists is None or len(samplelists) == 0:
        samplelists = [None]

    return [
        utils.GetHeterozygosity(
            trrecord.GetAlleleFreqs(samplelist=sl, uselength=uselength))
        for sl in samplelists
    ]
def main(args):
    """Filter a TR VCF and write the filtered VCF plus sample/locus logs.

    Steps, in order:

    1. Load the VCF named by ``args.vcf`` and validate the output location
       ``args.out``.
    2. Infer the VCF type, validate the filter arguments, and build the
       locus-level and call-level filters.
    3. Register the filters and the FORMAT/INFO fields this tool writes in
       the reader's header structures.
    4. For every record: apply call-level filters, apply locus-level
       filters, recompute the locus-level INFO fields (HRUN, HET, HWEP, AC,
       REFAC) and write the record to ``args.out + ".vcf"``.
    5. Write ``args.out + ".samplog.tab"`` and ``args.out + ".loclog.tab"``.

    Parameters
    ----------
    args : argparse-style namespace
        Attributes read here: vcf, out, vcftype, die_on_warning, verbose,
        num_records, drop_filtered, use_length. The filter builders receive
        the whole namespace and may read more.

    Returns
    -------
    int
        0 on success, 1 if any setup step fails or an unparsable record is
        met while ``args.die_on_warning`` is set.
    """
    # Load VCF file
    invcf = utils.LoadSingleReader(args.vcf, checkgz=False)
    if invcf is None:
        return 1

    # Validate the output location before doing any real work.
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        common.WARNING(
            "Error: The directory which contains the output location {} does"
            " not exist".format(args.out))
        return 1

    if os.path.isdir(args.out) and args.out.endswith(os.sep):
        common.WARNING("Error: The output location {} is a "
                       "directory".format(args.out))
        return 1

    # Set up record harmonizer and infer VCF type
    vcftype = trh.InferVCFType(invcf, args.vcftype)

    # Check filters all make sense
    if not CheckFilters(invcf, args, vcftype):
        return 1

    # Set up locus-level filter list (built exactly once; a ValueError from
    # the builder is treated as a setup failure).
    try:
        filter_list = BuildLocusFilters(args, vcftype)
    except ValueError:
        return 1

    invcf.filters = {}
    for f in filter_list:
        # Use the first line of the filter's docstring as its description.
        short_doc = f.__doc__ or ''
        short_doc = short_doc.split('\n')[0].lstrip()
        invcf.filters[f.filter_name()] = _Filter(f.filter_name(), short_doc)

    # Set up call-level filters
    call_filters = BuildCallFilters(args)

    # Add new FORMAT fields
    if "FILTER" not in invcf.formats:
        invcf.formats["FILTER"] = _Format("FILTER", 1, "String",
                                          "Call-level filter")

    # Add new INFO fields
    invcf.infos["AC"] = _Info("AC", -1, "Integer", "Alternate allele counts",
                              source=None, version=None)
    invcf.infos["REFAC"] = _Info("REFAC", 1, "Integer",
                                 "Reference allele count",
                                 source=None, version=None)
    invcf.infos["HET"] = _Info("HET", 1, "Float", "Heterozygosity",
                               source=None, version=None)
    invcf.infos["HWEP"] = _Info("HWEP", 1, "Float",
                                "HWE p-value for obs. vs. exp het rate",
                                source=None, version=None)
    invcf.infos["HRUN"] = _Info("HRUN", 1, "Integer",
                                "Length of longest homopolymer run",
                                source=None, version=None)

    # Set up output files (the output directory was already validated above)
    outvcf = MakeWriter(args.out + ".vcf", invcf, " ".join(sys.argv))
    if outvcf is None:
        return 1

    # Set up sample info: per-sample counters plus one counter per
    # call-level filter reason.
    all_reasons = GetAllCallFilters(call_filters)
    sample_info = {}
    for s in invcf.samples:
        sample_info[s] = {"numcalls": 0, "totaldp": 0}
        for r in all_reasons:
            sample_info[s][r] = 0

    # Set up locus info: totals plus one counter per locus-level filter.
    loc_info = {"totalcalls": 0, "PASS": 0}
    for filt in filter_list:
        loc_info[filt.filter_name()] = 0

    # Go through each record
    record_counter = 0
    while True:
        try:
            record = next(invcf)
        except IndexError:
            common.WARNING(
                "Skipping TR that couldn't be parsed by PyVCF. Check VCF format"
            )
            if args.die_on_warning:
                return 1
            # Actually skip: without this, the code below would run on the
            # previous record (or on an undefined name for the first one).
            continue
        except StopIteration:
            break

        if args.verbose:
            common.MSG("Processing %s:%s" % (record.CHROM, record.POS))
        record_counter += 1
        if args.num_records is not None and record_counter > args.num_records:
            break

        # Call-level filters
        record = ApplyCallFilters(record, invcf, call_filters, sample_info)

        # Locus-level filters. A filter returning None means the record is
        # not flagged by that filter.
        record.FILTER = None
        output_record = True
        for filt in filter_list:
            if filt(record) is None:
                continue
            if args.drop_filtered:
                output_record = False
                break
            record.add_filter(filt.filter_name())
            loc_info[filt.filter_name()] += 1
        if args.drop_filtered and record.call_rate == 0:
            output_record = False

        if output_record:
            trrecord = trh.HarmonizeRecord(vcftype, record)
            # Recalculate locus-level INFO fields
            record.INFO["HRUN"] = utils.GetHomopolymerRun(record.REF)
            if record.num_called > 0:
                allele_freqs = trrecord.GetAlleleFreqs(
                    uselength=args.use_length)
                genotype_counts = trrecord.GetGenotypeCounts(
                    uselength=args.use_length)
                record.INFO["HET"] = utils.GetHeterozygosity(allele_freqs)
                record.INFO["HWEP"] = utils.GetHardyWeinbergBinomialTest(
                    allele_freqs, genotype_counts)
                # Convert frequencies back to counts. Both AC and REFAC use
                # the same denominator, 2 * num_called (two alleles per
                # called sample). round() rather than int() so floating-point
                # error in freq * n cannot truncate a count down by one.
                num_alleles = 2 * record.num_called
                record.INFO["AC"] = [
                    round(item * num_alleles) for item in record.aaf
                ]
                record.INFO["REFAC"] = round(
                    (1 - sum(record.aaf)) * num_alleles)
            else:
                # No called genotypes: write sentinel values.
                record.INFO["HET"] = -1
                record.INFO["HWEP"] = -1
                record.INFO["AC"] = [0] * len(record.ALT)
                record.INFO["REFAC"] = 0
            # Recalc filter
            if record.FILTER is None and not args.drop_filtered:
                record.FILTER = "PASS"
                loc_info["PASS"] += 1
                loc_info["totalcalls"] += record.num_called
            # Output the record
            outvcf.write_record(record)

    # Output log info
    WriteSampLog(sample_info, all_reasons, args.out + ".samplog.tab")
    WriteLocLog(loc_info, args.out + ".loclog.tab")

    return 0
def load_strs(imputation_run_name: str,
              region: str,
              samples: np.ndarray,
              details: bool = True,
              var_subset: Optional[Set[int]] = None,
              hardcalls: bool = False):
    """
    Iterate over a region returning genotypes at STR loci.

    If details is True, the first yield is a tuple of names of the fields
    in locus_details. Every other yield is described in the yields section
    below.

    Parameters
    ----------
    imputation_run_name:
        which imputation run to load genotypes from?
    region:
        chr:start-end
    samples:
        A boolean array of length nsamples determining which samples are
        included (True) and which are not. slice(None) is also accepted and
        selects every sample.
    details:
        If True, yield the header tuple first and compute locus_details
        for every locus; if False, locus_details is None.
    var_subset:
        If not None, only records whose POS is in this set are returned.
    hardcalls:
        If True, yield hardcalled length genotypes instead of dosages.
        Requires var_subset to be given and details to be False (enforced
        with an assert).

    Yields
    ------
    dosages: Dict[float, np.ndarray]
        A dictionary from unique length alleles to 2D arrays of
        size (n_samples, 2) which contain the dosages of those alleles
        for each haplotype
        Length dosage are measured in number of repeats.

        None if locus_filtered is not None

        If hardcalls, then instead of that just an array nx2 of length alleles
    unique_alleles: np.ndarray
        Array of unique length alleles (measured in number of repeats),
        same length as the dosages dict
    chrom: str
        e.g. '13'
    pos: int
    locus_filtered:
        None if the locus is not filtered, otherwise
        a string explaining why.
        'MAC<20' if the minor allele dosage is less than 20
        after sample subsetting, per plink's standard.

        None if hardcalls.
    locus_details:
        tuple of strings with the same length as the first yield
        with the corresponding order.

        None if hardcalls (or if details is False).

    Notes
    -----
    Hardcalls mentioned in the locus details are phased hardcalls, and in some
    corner cases will not correspond to the maximum likelihood unphased allele.
    """

    if hardcalls:
        assert var_subset is not None and not details

    chrom, region_poses = region.split(':')
    # Only the region start is needed below; the end is discarded.
    region_start, _ = [int(pos) for pos in region_poses.split('-')]
    vcf_fname = (f'{ukb}/str_imputed/runs/{imputation_run_name}/'
                 f'vcfs/annotated_strs/chr{chrom}.vcf.gz')
    vcf = cyvcf2.VCF(vcf_fname)

    if details:
        # Header: names of the entries of each locus_details tuple, in order.
        yield ('motif', 'period', 'ref_len', 'total_per_allele_dosages',
               'total_hardcall_alleles', 'total_hardcall_genotypes',
               'subset_total_per_allele_dosages',
               'subset_total_hardcall_alleles',
               'subset_total_hardcall_genotypes', 'subset_het',
               'subset_entropy', 'subset_HWEP', 'subset_allele_dosage_r2')

    for record in vcf(region):
        if record.POS < region_start:
            # records that overlap this region but started before this region
            # should be considered part of the previous region and not returned here
            continue
        if record.INFO.get('PERIOD') is None:
            # there are a few duplicate loci which I didn't handle
            # properly, this identifies and removes them
            continue

        if var_subset is not None and record.POS not in var_subset:
            continue

        trrecord = trh.HarmonizeRecord(vcfrecord=record,
                                       vcftype='beagle-hipstr')

        # Allele lengths with the reference allele first, then the alts,
        # rounded so equal lengths compare equal as dict keys.
        len_alleles = [trrecord.ref_allele_length
                       ] + trrecord.alt_allele_lengths
        len_alleles = [
            round(allele_len, allele_len_precision)
            for allele_len in len_alleles
        ]

        if hardcalls:
            # [samples, :-1] drops the last column of the genotype array
            # (presumably the phasing flag - TODO confirm).
            yield (trrecord.GetLengthGenotypes()[samples, :-1],
                   np.unique(len_alleles), trrecord.chrom, trrecord.pos, None,
                   None)
            continue

        if details:
            # Per-length dosage totals over all samples (no subsetting).
            total_dosages = {_len: 0 for _len in np.unique(len_alleles)}
            for p in (1, 2):
                # AP1/AP2: presumably per-haplotype alt-allele probabilities,
                # one column per alt allele - verify against the VCF header.
                ap = trrecord.format[f'AP{p}']
                # Reference dosage is whatever the alt probabilities leave
                # over, clipped at 0.
                total_dosages[len_alleles[0]] += np.sum(
                    np.maximum(0, 1 - np.sum(ap, axis=1)))
                for i in range(ap.shape[1]):
                    total_dosages[len_alleles[i + 1]] += np.sum(ap[:, i])

            total_hardcall_alleles = clean_len_alleles(
                trrecord.GetAlleleCounts())
            total_hardcall_genotypes = clean_len_allele_pairs(
                trrecord.GetGenotypeCounts())

        if isinstance(samples, slice):
            # The only slice supported is "all samples".
            assert samples == slice(None)
            n_subset_samples = trrecord.GetNumSamples()
        else:
            n_subset_samples = int(np.sum(samples))

        # One (n_subset_samples, 2) array per unique length; column p-1
        # accumulates the dosage from AP{p}.
        subset_dosage_gts = {
            _len: np.zeros((n_subset_samples, 2))
            for _len in np.unique(len_alleles)
        }

        for p in (1, 2):
            # todo genotype dosages
            ap = trrecord.format[f'AP{p}']
            subset_dosage_gts[len_alleles[0]][:, (p-1)] += \
                    np.maximum(0, 1 - np.sum(ap[samples, :], axis=1))
            for i in range(ap.shape[1]):
                # Alleles sharing a rounded length accumulate into one entry.
                subset_dosage_gts[len_alleles[i + 1]][:, (p - 1)] += ap[samples, i]

        subset_total_dosages = {
            _len: np.sum(subset_dosage_gts[_len])
            for _len in subset_dosage_gts
        }

        if details:
            subset_total_hardcall_alleles = clean_len_alleles(
                trrecord.GetAlleleCounts(samples))
            subset_total_hardcall_genotypes = clean_len_allele_pairs(
                trrecord.GetGenotypeCounts(samples))
            subset_hardcall_allele_freqs = clean_len_alleles(
                trrecord.GetAlleleFreqs(samples))
            subset_het = utils.GetHeterozygosity(subset_hardcall_allele_freqs)
            subset_entropy = utils.GetEntropy(subset_hardcall_allele_freqs)
            subset_hwep = utils.GetHardyWeinbergBinomialTest(
                subset_hardcall_allele_freqs, subset_total_hardcall_genotypes)

            # https://www.cell.com/ajhg/fulltext/S0002-9297(09)00012-3#app1
            # Browning, Brian L., and Sharon R. Browning. "A unified approach to genotype imputation and haplotype-phase inference for large data sets of trios and unrelated individuals." The American Journal of Human Genetics 84.2 (2009): 210-223.
            # appendix 1
            subset_allele_dosage_r2 = {}

            subset_hardcalls = np.around(
                trrecord.GetLengthGenotypes()[samples, :-1],
                allele_len_precision)
            for length in len_alleles:
                # calculate allele dosage r**2 for this length
                if length in subset_allele_dosage_r2:
                    # already computed for an allele with the same length
                    continue

                # Boolean indicator of hardcalls equal to this length.
                calls = subset_hardcalls == length

                # Squared correlation between the flattened hardcall
                # indicator and the flattened dosages for this length.
                subset_allele_dosage_r2[length] = np.corrcoef(
                    calls.reshape(-1),
                    subset_dosage_gts[length].reshape(-1))[0, 1]**2

            # Order must match the header tuple yielded at the start.
            locus_details = (trrecord.motif, str(len(trrecord.motif)),
                             str(
                                 round(trrecord.ref_allele_length,
                                       allele_len_precision)),
                             dict_str(
                                 round_vals(total_dosages, dosage_precision)),
                             dict_str(total_hardcall_alleles),
                             dict_str(total_hardcall_genotypes),
                             dict_str(
                                 round_vals(subset_total_dosages,
                                            dosage_precision)),
                             dict_str(subset_total_hardcall_alleles),
                             dict_str(subset_total_hardcall_genotypes),
                             str(subset_het), str(subset_entropy),
                             str(subset_hwep),
                             dict_str(
                                 round_vals(subset_allele_dosage_r2,
                                            r2_precision)))
        else:
            locus_details = None

        # Total dosage of everything except the single most common length.
        mac = list(subset_total_dosages.values())
        mac.pop(np.argmax(mac))

        if np.sum(mac) < 20:
            # Filtered locus: no dosages are returned, only the reason.
            yield (None, np.unique(len_alleles), trrecord.chrom, trrecord.pos,
                   'MAC<20', locus_details)
            continue

        yield (subset_dosage_gts, np.unique(len_alleles), trrecord.chrom,
               trrecord.pos, None, locus_details)