Example #1
0
    def test_allele_freq_with_variations(self):
        """Check allele frequencies computed from the zarr test fixture."""
        zarr_vars = load_zarr(TEST_DATA_DIR / 'test.zarr')

        # One frequency column per ALT column, plus one extra column
        # (presumably for the reference allele -- confirm against
        # calc_allele_freq).
        n_alleles = zarr_vars[ALT_FIELD].shape[1] + 1

        freqs = compute(
            calc_allele_freq(zarr_vars,
                             max_alleles=n_alleles,
                             min_num_genotypes=0),
            silence_runtime_warnings=True)

        balanced = [0.5, 0.5, 0.0, 0.0]
        all_nan = [np.nan] * 4
        expected = np.array([
            balanced,
            balanced,
            balanced,
            [0.75, 0.25, 0.0, 0.0],
            all_nan,
            balanced,
            [0.25, 0.75, 0.0, 0.0],
        ])
        np.testing.assert_allclose(freqs, expected, equal_nan=True)
Example #2
0
    def test_allele_freq_in_memory(self):
        """Check allele frequencies for a small in-memory genotype matrix.

        The genotype array holds 3 variations x 4 samples x 2 alleles.  The
        expected frequencies only count alleles >= 0, i.e. the -1 entries are
        treated as missing calls and excluded from the denominator (e.g. the
        first variation has three 0s and two 1s called -> 0.6 / 0.4).
        """
        gts = np.array([[[0, 0], [1, 1], [0, -1], [-1, -1]],
                        [[0, -1], [0, 0], [0, -1], [-1, -1]],
                        [[0, 1], [0, 2], [0, 0], [-1, -1]]])
        samples = ['1', '2', '3', '4']
        variations = Variations(samples=np.array(samples))
        variations[GT_FIELD] = gts
        variations[ALT_FIELD] = np.zeros((3, 2))

        allele_freq = calc_allele_freq(variations,
                                       max_alleles=3,
                                       min_num_genotypes=0)

        expected = np.array([[0.6, 0.4, 0], [1, 0, 0], [4 / 6, 1 / 6, 1 / 6]])
        # Same tolerances as np.allclose's defaults, but this reports the
        # mismatching values on failure, rejects shape mismatches and is not
        # stripped when Python runs with -O.
        np.testing.assert_allclose(allele_freq, expected,
                                   rtol=1e-5, atol=1e-8)
Example #3
0
def _calc_allele_freq_and_unbiased_J_per_locus(variations, max_alleles,
                                               min_num_genotypes):
    """Return the allele frequencies and a corrected sum of squares per locus.

    The second returned value is computed, for every variation, as::

        (2 * n * sum(freq ** 2) - 1) / (2 * n - 1)

    where ``n`` is the size of the second axis of the genotype field
    (presumably the number of individuals -- confirm against the GT layout)
    and the sum runs over the allele axis of the frequencies.

    Args:
        variations: container indexable by ``GT_FIELD``; passed unchanged to
            ``calc_allele_freq``.
        max_alleles: forwarded to ``calc_allele_freq``.
        min_num_genotypes: forwarded to ``calc_allele_freq``.

    Returns:
        A ``(allele_freq, xUb_per_locus)`` tuple.  Both items are ``None``
        when ``calc_allele_freq`` raises ``ValueError`` or returns ``None``.
    """
    try:
        allele_freq = calc_allele_freq(variations,
                                       max_alleles=max_alleles,
                                       min_num_genotypes=min_num_genotypes)
    except ValueError:
        return None, None

    # Previously this path left xUb_per_locus unbound and the final return
    # raised UnboundLocalError; report "no result" the same way as the
    # ValueError path instead.
    if allele_freq is None:
        return None, None

    n_indi = variations[GT_FIELD].shape[1]
    xUb_per_locus = ((2 * n_indi * va.sum(allele_freq**2, axis=1)) -
                     1) / (2 * n_indi - 1)

    return allele_freq, xUb_per_locus
Example #4
0
def _calc_pairwise_dest(vars_for_pop1, vars_for_pop2, max_alleles,
                        min_call_dp_for_het, min_num_genotypes):
    """Compute the corrected Hs and Ht per variation for a pair of populations.

    Allele frequencies are calculated for each population without any
    genotype-count filtering (``min_num_genotypes=0``); the
    ``min_num_genotypes`` threshold is applied only at the end, by setting to
    NaN the variations in which either population has fewer called genotypes.

    Args:
        vars_for_pop1: variations of the first population; must expose
            ``ploidy``, ``num_variations`` and be indexable by ``GT_FIELD``.
        vars_for_pop2: variations of the second population.
        max_alleles: forwarded to ``calc_allele_freq``.
        min_call_dp_for_het: forwarded to ``_calc_obs_het_counts`` as
            ``min_call_dp_for_het_call``.
        min_num_genotypes: minimum number of called genotypes required in
            both populations for a variation to keep its result.

    Returns:
        dict with the keys ``'corrected_hs'`` and ``'corrected_ht'``.  When
        ``hmean`` raises ``ValueError`` for the called-genotype counts, both
        values are all-NaN arrays with one item per variation.
    """
    num_pops = 2
    ploidy = vars_for_pop1.ploidy

    freqs1 = calc_allele_freq(vars_for_pop1,
                              max_alleles=max_alleles,
                              min_num_genotypes=0)
    freqs2 = calc_allele_freq(vars_for_pop2,
                              max_alleles=max_alleles,
                              min_num_genotypes=0)

    # Expected heterozygosity within each population and their plain average.
    exp_het1 = 1 - va.sum(freqs1**ploidy, axis=1)
    exp_het2 = 1 - va.sum(freqs2**ploidy, axis=1)
    hs_per_var = (exp_het1 + exp_het2) / 2

    # Expected heterozygosity using the averaged allele frequencies.
    pooled_freqs = (freqs1 + freqs2) / 2
    ht_per_var = 1 - va.sum(pooled_freqs**ploidy, axis=1)

    het_counts1, called_gts1 = _calc_obs_het_counts(
        vars_for_pop1, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het1 = het_counts1 / called_gts1
    het_counts2, called_gts2 = _calc_obs_het_counts(
        vars_for_pop2, axis=1, min_call_dp_for_het_call=min_call_dp_for_het)
    obs_het2 = het_counts2 / called_gts2

    called_per_pop = va.stack([called_gts1, called_gts2],
                              as_type_of=called_gts1)
    try:
        n_hmean = hmean(called_per_pop, axis=0)
    except ValueError:
        n_hmean = None

    if n_hmean is None:
        # No harmonic mean available: report NaN for every variation.
        num_vars = vars_for_pop1.num_variations
        return {
            key: va.full((num_vars, ),
                         np.nan,
                         as_type_of=vars_for_pop1[GT_FIELD])
            for key in ('corrected_hs', 'corrected_ht')
        }

    mean_obs_het = va.nanmean(va.stack([obs_het1, obs_het2],
                                       as_type_of=obs_het1),
                              axis=0)
    two_n = 2 * n_hmean

    sample_size_factor = n_hmean / (n_hmean - 1)
    corrected_hs = sample_size_factor * (hs_per_var - (mean_obs_het / two_n))

    hs_term = corrected_hs / (n_hmean * num_pops)
    obs_het_term = mean_obs_het / (two_n * num_pops)
    corrected_ht = ht_per_var + hs_term - obs_het_term

    # Blank out the variations without enough called genotypes in either
    # population.
    too_few_gts = va.logical_or(called_gts1 < min_num_genotypes,
                                called_gts2 < min_num_genotypes)
    corrected_hs[too_few_gts] = np.nan
    corrected_ht[too_few_gts] = np.nan

    return {'corrected_hs': corrected_hs, 'corrected_ht': corrected_ht}