def test_heterozygosity_expected(self):
    """Check ``heterozygosity_expected`` against hand-computed values and a
    small reference implementation, for diploid and triploid genotypes."""

    def refimpl(af, ploidy, fill=0):
        """Limited reference implementation for testing purposes.

        Only the first three allele columns of `af` are used. Rows whose
        frequencies sum to less than 1, or to NaN, are set to `fill`.
        """
        # row totals are used to detect rows without a full set of
        # frequencies
        totals = np.sum(af, axis=1)

        # assume three alleles
        p, q, r = af[:, 0], af[:, 1], af[:, 2]
        het = 1 - p**ploidy - q**ploidy - r**ploidy

        # comparisons against NaN are done with warnings silenced
        with ignore_invalid():
            incomplete = (totals < 1) | np.isnan(totals)
            het[incomplete] = fill

        return het

    # ---- diploid ----
    diploid = GenotypeArray(
        [
            [[0, 0], [0, 0]],
            [[1, 1], [1, 1]],
            [[1, 1], [2, 2]],
            [[0, 0], [0, 1]],
            [[0, 0], [0, 2]],
            [[1, 1], [1, 2]],
            [[0, 1], [0, 1]],
            [[0, 1], [1, 2]],
            [[0, 0], [-1, -1]],
            [[0, 1], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ],
        dtype='i1',
    )
    # hand-computed expectations; only the final (all-missing) variant
    # depends on the fill value
    by_hand_fill_minus_one = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, -1]
    by_hand_fill_zero = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, 0]

    af = diploid.count_alleles().to_frequencies()
    by_reference = refimpl(af, ploidy=diploid.ploidy, fill=-1)

    actual = allel.stats.heterozygosity_expected(
        af, ploidy=diploid.ploidy, fill=-1
    )
    assert_array_close(by_hand_fill_minus_one, actual)
    assert_array_close(by_reference, actual)

    actual = allel.stats.heterozygosity_expected(
        af, ploidy=diploid.ploidy, fill=0
    )
    assert_array_close(by_hand_fill_zero, actual)

    # ---- polyploid ----
    triploid = GenotypeArray(
        [
            [[0, 0, 0], [0, 0, 0]],
            [[1, 1, 1], [1, 1, 1]],
            [[1, 1, 1], [2, 2, 2]],
            [[0, 0, 0], [0, 0, 1]],
            [[0, 0, 0], [0, 0, 2]],
            [[1, 1, 1], [0, 1, 2]],
            [[0, 0, 1], [0, 1, 1]],
            [[0, 1, 1], [0, 1, 2]],
            [[0, 0, 0], [-1, -1, -1]],
            [[0, 0, 1], [-1, -1, -1]],
            [[-1, -1, -1], [-1, -1, -1]],
        ],
        dtype='i1',
    )
    af = triploid.count_alleles().to_frequencies()
    by_reference = refimpl(af, ploidy=triploid.ploidy, fill=-1)
    actual = allel.stats.heterozygosity_expected(
        af, ploidy=triploid.ploidy, fill=-1
    )
    assert_array_close(by_reference, actual)
def inbreeding_coefficient(g, fill=np.nan):
    """Compute the per-variant inbreeding coefficient.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Value to report for variants where the expected heterozygosity is
        zero.

    Returns
    -------
    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----
    For each variant the coefficient is *1 - (Ho/He)*, with *Ho* the
    observed heterozygosity and *He* the expected heterozygosity.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # wrap the input unless it already offers the genotype-counting methods
    looks_like_genotypes = (
        hasattr(g, 'count_het') and hasattr(g, 'count_called')
    )
    if not looks_like_genotypes:
        g = GenotypeArray(g, copy=False)

    # observed heterozygosity (Ho)
    het_obs = heterozygosity_observed(g)

    # expected heterozygosity (He) from allele frequencies; the ploidy is
    # the size of the last genotype dimension, and fill=0 is passed so
    # filled entries fail the `He > 0` test below
    frequencies = g.count_alleles().to_frequencies()
    het_exp = heterozygosity_expected(frequencies, ploidy=g.shape[-1], fill=0)

    # 1 - Ho/He where He is positive, `fill` everywhere else; the ratio is
    # still evaluated for every variant, hence the warning suppression
    with ignore_invalid():
        ratio = het_obs / het_exp
        f = np.where(het_exp > 0, 1 - ratio, fill)

    return f
def inbreeding_coefficient(g, fill=np.nan):
    """Compute the per-variant inbreeding coefficient.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Value to report for variants where the expected heterozygosity is
        zero.

    Returns
    -------
    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----
    For each variant the coefficient is *1 - (Ho/He)*, with *Ho* the
    observed heterozygosity and *He* the expected heterozygosity.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # wrap the input unless it already offers the genotype-counting methods
    looks_like_genotypes = (
        hasattr(g, 'count_het') and hasattr(g, 'count_called')
    )
    if not looks_like_genotypes:
        g = GenotypeArray(g, copy=False)

    # observed heterozygosity (Ho)
    het_obs = heterozygosity_observed(g)

    # expected heterozygosity (He) from allele frequencies; the ploidy is
    # the size of the last genotype dimension, and fill=0 is passed so
    # filled entries fail the `He > 0` test below
    frequencies = g.count_alleles().to_frequencies()
    het_exp = heterozygosity_expected(frequencies, ploidy=g.shape[-1], fill=0)

    # 1 - Ho/He where He is positive, `fill` everywhere else; the ratio is
    # still evaluated for every variant, hence the warning suppression
    with ignore_invalid():
        ratio = het_obs / het_exp
        f = np.where(het_exp > 0, 1 - ratio, fill)

    return f
def _weir_cockerham_fst(g, subpops, max_allele):
    """Compute the variance components `a`, `b` and `c` used for the
    Weir & Cockerham Fst estimator, per variant and per allele.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int
        Highest allele index to consider; ``max_allele + 1`` alleles are
        analysed.

    Returns
    -------
    a : ndarray, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Notes
    -----
    The number of individuals per population is taken as half the number
    of called alleles, i.e., two alleles per individual.

    """

    # check inputs (the genotype array must be three-dimensional)
    g = GenotypeArray(g, copy=False)
    n_variants, _, _ = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    n_populations = len(subpops)
    r = n_populations
    debug('r: %r', r)

    # shapes that the intermediate arrays are expected to have
    per_variant = (n_variants,)
    per_population = (n_variants, n_populations)
    per_allele = (n_variants, n_alleles)
    per_allele_and_population = (n_variants, n_alleles, n_populations)

    # allele counts within each subpopulation, stacked along a third axis
    ac = np.dstack([
        g.count_alleles(subpop=s, max_allele=max_allele)
        for s in subpops
    ])
    assert ac.shape == per_allele_and_population
    debug('ac: %s, %r', ac.shape, ac)

    # number of alleles called within each population (sum over alleles)
    an = np.sum(ac, axis=1)
    assert an.shape == per_population
    debug('an: %s, %r', an.shape, an)

    # number of individuals sampled from each population
    n = an // 2
    assert n.shape == per_population
    debug('n: %s, %r', n.shape, n)

    # total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == per_variant
    debug('n_total: %s, %r', n_total.shape, n_total)

    # average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == per_variant
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # the term n sub C, incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == per_variant
    debug('n_C: %s, %r', n_C.shape, n_C)

    # allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == per_allele_and_population
    debug('p: %s, %r', p.shape, p)

    # average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == per_allele
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # views with extra axes so the per-variant and per-population terms
    # broadcast against the per-allele arrays
    n_bar_col = n_bar[:, np.newaxis]
    n_C_col = n_C[:, np.newaxis]
    n_by_pop = n[:, np.newaxis, :]
    p_bar_by_pop = p_bar[:, :, np.newaxis]

    # sample variance of allele frequencies over populations
    weighted_sq_dev = np.sum(n_by_pop * ((p - p_bar_by_pop) ** 2), axis=2)
    s_squared = weighted_sq_dev / (n_bar_col * (r - 1))
    assert s_squared.shape == per_allele
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = np.column_stack([
        gs.count_het(allele=allele, axis=1) / n_total
        for allele in range(n_alleles)
    ])
    assert h_bar.shape == per_allele
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # sub-terms shared by the `a` and `b` components
    pq = p_bar * (1 - p_bar)
    s_squared_term = (r - 1) * s_squared / r

    # component of variance between populations
    a_inner = pq - s_squared_term - (h_bar / 4)
    a = (n_bar_col / n_C_col) * (
        s_squared - ((1 / (n_bar_col - 1)) * a_inner)
    )
    assert a.shape == per_allele

    # component of variance between individuals within populations
    b_inner = (
        pq - s_squared_term - (((2 * n_bar_col) - 1) * h_bar / (4 * n_bar_col))
    )
    b = (n_bar_col / (n_bar_col - 1)) * b_inner
    assert b.shape == per_allele

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == per_allele

    return a, b, c
def _weir_cockerham_fst(g, subpops, max_allele):
    """Compute the variance components `a`, `b` and `c` used for the
    Weir & Cockerham Fst estimator, per variant and per allele.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int
        Highest allele index to consider; ``max_allele + 1`` alleles are
        analysed.

    Returns
    -------
    a : ndarray, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Notes
    -----
    The number of individuals per population is taken as half the number
    of called alleles, i.e., two alleles per individual.

    """

    # check inputs (the genotype array must be three-dimensional)
    g = GenotypeArray(g, copy=False)
    n_variants, _, _ = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    n_populations = len(subpops)
    r = n_populations
    debug('r: %r', r)

    # shapes that the intermediate arrays are expected to have
    per_variant = (n_variants,)
    per_population = (n_variants, n_populations)
    per_allele = (n_variants, n_alleles)
    per_allele_and_population = (n_variants, n_alleles, n_populations)

    # allele counts within each subpopulation, stacked along a third axis
    ac = np.dstack([
        g.count_alleles(subpop=s, max_allele=max_allele)
        for s in subpops
    ])
    assert ac.shape == per_allele_and_population
    debug('ac: %s, %r', ac.shape, ac)

    # number of alleles called within each population (sum over alleles)
    an = np.sum(ac, axis=1)
    assert an.shape == per_population
    debug('an: %s, %r', an.shape, an)

    # number of individuals sampled from each population
    n = an // 2
    assert n.shape == per_population
    debug('n: %s, %r', n.shape, n)

    # total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == per_variant
    debug('n_total: %s, %r', n_total.shape, n_total)

    # average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == per_variant
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # the term n sub C, incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == per_variant
    debug('n_C: %s, %r', n_C.shape, n_C)

    # allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == per_allele_and_population
    debug('p: %s, %r', p.shape, p)

    # average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == per_allele
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # views with extra axes so the per-variant and per-population terms
    # broadcast against the per-allele arrays
    n_bar_col = n_bar[:, np.newaxis]
    n_C_col = n_C[:, np.newaxis]
    n_by_pop = n[:, np.newaxis, :]
    p_bar_by_pop = p_bar[:, :, np.newaxis]

    # sample variance of allele frequencies over populations
    weighted_sq_dev = np.sum(n_by_pop * ((p - p_bar_by_pop) ** 2), axis=2)
    s_squared = weighted_sq_dev / (n_bar_col * (r - 1))
    assert s_squared.shape == per_allele
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = np.column_stack([
        gs.count_het(allele=allele, axis=1) / n_total
        for allele in range(n_alleles)
    ])
    assert h_bar.shape == per_allele
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # sub-terms shared by the `a` and `b` components
    pq = p_bar * (1 - p_bar)
    s_squared_term = (r - 1) * s_squared / r

    # component of variance between populations
    a_inner = pq - s_squared_term - (h_bar / 4)
    a = (n_bar_col / n_C_col) * (
        s_squared - ((1 / (n_bar_col - 1)) * a_inner)
    )
    assert a.shape == per_allele

    # component of variance between individuals within populations
    b_inner = (
        pq - s_squared_term - (((2 * n_bar_col) - 1) * h_bar / (4 * n_bar_col))
    )
    b = (n_bar_col / (n_bar_col - 1)) * b_inner
    assert b.shape == per_allele

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == per_allele

    return a, b, c