def test_allele_freq_with_variations(self):
    """Check calc_allele_freq against the 'test.zarr' fixture.

    The frequencies are computed lazily, materialised with ``compute`` and
    compared row by row (one row per variation) with the expected values.
    A row of NaNs is expected for the fifth variation.
    """
    variations = load_zarr(TEST_DATA_DIR / 'test.zarr')

    # One more allele slot than the number of columns in the ALT field.
    num_alt_columns = variations[ALT_FIELD].shape[1]
    freq_task = calc_allele_freq(variations,
                                 max_alleles=num_alt_columns + 1,
                                 min_num_genotypes=0)
    result = compute(freq_task, silence_runtime_warnings=True)

    even_split = [0.5, 0.5, 0.0, 0.0]
    expected = np.array([
        even_split,
        even_split,
        even_split,
        [0.75, 0.25, 0.0, 0.0],
        [np.nan, np.nan, np.nan, np.nan],
        even_split,
        [0.25, 0.75, 0.0, 0.0],
    ])
    np.testing.assert_allclose(result, expected, equal_nan=True)
def test_allele_freq_in_memory(self):
    """Check calc_allele_freq on a small hand-built, in-memory data set.

    Three variations and four samples are used. The expected frequencies
    are the allele counts divided by the number of alleles that are not
    -1 in each variation (e.g. 3/5 and 2/5 for the first variation).
    """
    gts = np.array([[[0, 0], [1, 1], [0, -1], [-1, -1]],
                    [[0, -1], [0, 0], [0, -1], [-1, -1]],
                    [[0, 1], [0, 2], [0, 0], [-1, -1]]])
    samples = ['1', '2', '3', '4']

    variations = Variations(samples=np.array(samples))
    variations[GT_FIELD] = gts
    variations[ALT_FIELD] = np.zeros((3, 2))

    allele_freq = calc_allele_freq(variations, max_alleles=3,
                                   min_num_genotypes=0)

    expected = np.array([[0.6, 0.4, 0],
                         [1, 0, 0],
                         [4 / 6, 1 / 6, 1 / 6]])
    # Same tolerances as np.allclose's defaults, but a failure reports the
    # mismatching values and the check is not stripped when running with -O.
    np.testing.assert_allclose(allele_freq, expected, rtol=1e-5, atol=1e-8)
def _calc_allele_freq_and_unbiased_J_per_locus(variations, max_alleles,
                                               min_num_genotypes):
    """Return the allele frequencies and the per-locus corrected J value.

    The second value is ``(2n * sum(p**2) - 1) / (2n - 1)``, where ``p`` are
    the allele frequencies of each variation (summed along axis 1) and ``n``
    is the size of axis 1 of the GT_FIELD array (presumably the number of
    individuals -- confirm against the callers).

    Returns:
        A tuple ``(allele_freq, xUb_per_locus)``. Both items are None when
        calc_allele_freq raises ValueError or returns None.
    """
    try:
        freqs = calc_allele_freq(variations,
                                 max_alleles=max_alleles,
                                 min_num_genotypes=min_num_genotypes)
    except ValueError:
        freqs = None

    # Nothing to correct when the frequencies could not be obtained.
    if freqs is None:
        return None, None

    num_indis = variations[GT_FIELD].shape[1]
    sum_sq_freqs = va.sum(freqs**2, axis=1)
    unbiased_j = ((2 * num_indis * sum_sq_freqs) - 1) / (2 * num_indis - 1)
    return freqs, unbiased_j
def _calc_pairwise_dest(vars_for_pop1, vars_for_pop2, max_alleles,
                        min_call_dp_for_het, min_num_genotypes):
    """Compute the corrected Hs and Ht per variation for a pair of populations.

    Allele frequencies are always computed with ``min_num_genotypes=0``;
    the ``min_num_genotypes`` argument is only applied at the end, to set to
    NaN the variations in which either population has fewer called genotypes
    than that threshold.

    If ``hmean`` raises ValueError for the called-genotype counts, both
    results are filled with NaN for every variation.

    Returns:
        dict with the keys 'corrected_hs' and 'corrected_ht', each holding
        one value per variation.
    """
    n_pops = 2
    ploidy = vars_for_pop1.ploidy

    freqs1 = calc_allele_freq(vars_for_pop1, max_alleles=max_alleles,
                              min_num_genotypes=0)
    freqs2 = calc_allele_freq(vars_for_pop2, max_alleles=max_alleles,
                              min_num_genotypes=0)

    # Expected heterozygosity within each population, and their mean (Hs).
    within_het1 = 1 - va.sum(freqs1**ploidy, axis=1)
    within_het2 = 1 - va.sum(freqs2**ploidy, axis=1)
    hs_per_var = (within_het1 + within_het2) / 2

    # Expected heterozygosity from the averaged allele frequencies (Ht).
    pooled_freqs = (freqs1 + freqs2) / 2
    ht_per_var = 1 - va.sum(pooled_freqs**ploidy, axis=1)

    # Observed heterozygosity of each population.
    het_counts1, n_called1 = _calc_obs_het_counts(
        vars_for_pop1, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het1 = het_counts1 / n_called1
    het_counts2, n_called2 = _calc_obs_het_counts(
        vars_for_pop2, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het2 = het_counts2 / n_called2

    # Harmonic mean, across both populations, of the called genotype counts.
    n_called = va.stack([n_called1, n_called2], as_type_of=n_called1)
    try:
        n_called_hmean = hmean(n_called, axis=0)
    except ValueError:
        n_called_hmean = None

    if n_called_hmean is not None:
        obs_hets = va.stack([obs_het1, obs_het2], as_type_of=obs_het1)
        mean_obs_het = va.nanmean(obs_hets, axis=0)

        corrected_hs = (n_called_hmean / (n_called_hmean - 1)) * (
            hs_per_var - (mean_obs_het / (2 * n_called_hmean)))
        corrected_ht = ht_per_var + (
            corrected_hs / (n_called_hmean * n_pops)) - (
                mean_obs_het / (2 * n_called_hmean * n_pops))
    else:
        # No usable harmonic mean: report NaN for every variation.
        n_vars = vars_for_pop1.num_variations
        corrected_hs = va.full((n_vars, ), np.nan,
                               as_type_of=vars_for_pop1[GT_FIELD])
        corrected_ht = va.full((n_vars, ), np.nan,
                               as_type_of=vars_for_pop1[GT_FIELD])

    # Blank out variations with too few called genotypes in either population.
    too_few_gts = va.logical_or(n_called1 < min_num_genotypes,
                                n_called2 < min_num_genotypes)
    for corrected in (corrected_hs, corrected_ht):
        corrected[too_few_gts] = np.nan

    return {'corrected_hs': corrected_hs, 'corrected_ht': corrected_ht}