def test_heterozygosity_expected(self):
    # Check allel.stats.heterozygosity_expected against hand-written values
    # and against a small reference implementation, for diploid and
    # triploid genotype calls.

    def refimpl(af, ploidy, fill=0):
        """Limited reference implementation for testing purposes."""
        # only the first three alleles are considered
        p, q, r = af[:, 0], af[:, 1], af[:, 2]
        out = 1 - p**ploidy - q**ploidy - r**ploidy
        # rows whose frequencies do not sum to 1 (or are undefined) take
        # the fill value
        af_sum = np.sum(af, axis=1)
        with ignore_invalid():
            out[(af_sum < 1) | np.isnan(af_sum)] = fill
        return out

    # diploid
    diploid = GenotypeArray([[[0, 0], [0, 0]],
                             [[1, 1], [1, 1]],
                             [[1, 1], [2, 2]],
                             [[0, 0], [0, 1]],
                             [[0, 0], [0, 2]],
                             [[1, 1], [1, 2]],
                             [[0, 1], [0, 1]],
                             [[0, 1], [1, 2]],
                             [[0, 0], [-1, -1]],
                             [[0, 1], [-1, -1]],
                             [[-1, -1], [-1, -1]]], dtype='i1')
    by_hand_fill_neg = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, -1]
    af = diploid.count_alleles().to_frequencies()
    by_ref_fill_neg = refimpl(af, ploidy=diploid.ploidy, fill=-1)
    actual = allel.stats.heterozygosity_expected(af, ploidy=diploid.ploidy,
                                                 fill=-1)
    assert_array_close(by_hand_fill_neg, actual)
    assert_array_close(by_ref_fill_neg, actual)

    # same data, but the all-missing variant is filled with 0
    by_hand_fill_zero = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, 0]
    actual = allel.stats.heterozygosity_expected(af, ploidy=diploid.ploidy,
                                                 fill=0)
    assert_array_close(by_hand_fill_zero, actual)

    # polyploid
    triploid = GenotypeArray([[[0, 0, 0], [0, 0, 0]],
                              [[1, 1, 1], [1, 1, 1]],
                              [[1, 1, 1], [2, 2, 2]],
                              [[0, 0, 0], [0, 0, 1]],
                              [[0, 0, 0], [0, 0, 2]],
                              [[1, 1, 1], [0, 1, 2]],
                              [[0, 0, 1], [0, 1, 1]],
                              [[0, 1, 1], [0, 1, 2]],
                              [[0, 0, 0], [-1, -1, -1]],
                              [[0, 0, 1], [-1, -1, -1]],
                              [[-1, -1, -1], [-1, -1, -1]]], dtype='i1')
    af = triploid.count_alleles().to_frequencies()
    by_ref = refimpl(af, ploidy=triploid.ploidy, fill=-1)
    actual = allel.stats.heterozygosity_expected(af, ploidy=triploid.ploidy,
                                                 fill=-1)
    assert_array_close(by_ref, actual)
def test_haploidify_samples(self):
    # haploidify_samples() should pick, for every call, one of the alleles
    # actually present in that call, whatever the ploidy.

    def check(g):
        # run the shared shape/dtype/membership checks and hand back the
        # haploidified array
        h = g.haploidify_samples()
        eq(2, h.ndim)
        eq(3, h.n_variants)
        eq(2, h.n_haplotypes)
        eq(np.int8, h.dtype)
        for i in range(g.n_variants):
            for j in range(g.n_samples):
                self.assertIn(h[i, j], set(g[i, j]))
        return h

    # diploid
    diploid = GenotypeArray([[[0, 1], [2, 3]],
                             [[4, 5], [6, 7]],
                             [[8, 9], [10, 11]]], dtype='i1')
    h = check(diploid)
    print(repr(h))

    # triploid
    triploid = GenotypeArray([[[0, 1, 2], [3, 4, 5]],
                              [[6, 7, 8], [9, 10, 11]],
                              [[12, 13, 14], [15, 16, 17]]], dtype='i1')
    check(triploid)
def inbreeding_coefficient(g, fill=np.nan):
    """Compute the per-variant inbreeding coefficient.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Value reported for variants whose expected heterozygosity is zero.

    Returns
    -------
    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----
    The coefficient is *1 - (Ho/He)*, with *Ho* the observed heterozygosity
    and *He* the expected heterozygosity.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # wrap the input unless it already exposes the counting methods
    if not (hasattr(g, 'count_het') and hasattr(g, 'count_called')):
        g = GenotypeArray(g, copy=False)

    # observed heterozygosity
    observed = heterozygosity_observed(g)

    # expected heterozygosity from allele frequencies; variants where it
    # cannot be computed are reported as 0 so they take `fill` below
    frequencies = g.count_alleles().to_frequencies()
    expected = heterozygosity_expected(frequencies, ploidy=g.shape[-1],
                                       fill=0)

    # the division is evaluated for every variant, so silence the warnings
    # raised where the expected heterozygosity is zero
    with ignore_invalid():
        return np.where(expected > 0, 1 - (observed / expected), fill)
def test_from_hdf5_condition(self):
    # GenotypeCArray.from_hdf5 should honour a boolean `condition` over
    # variants, both when given a (file path, node path) pair and when
    # given an open h5py dataset.
    import os

    # setup HDF5 file; delete=False is used so the file can be reopened by
    # name, which means this test is responsible for removing it
    node_path = 'test'
    tf = tempfile.NamedTemporaryFile(delete=False)
    file_path = tf.name
    tf.close()

    try:
        with h5py.File(file_path, mode='w') as h5f:
            h5f.create_dataset(node_path,
                               data=diploid_genotype_data,
                               chunks=(2, 3, 2))

        # selection
        condition = [False, True, False, True, False]

        # file and node path
        g = GenotypeCArray.from_hdf5(file_path, node_path,
                                     condition=condition)
        expect = GenotypeArray(diploid_genotype_data).compress(condition,
                                                               axis=0)
        aeq(expect, g)

        # dataset
        with h5py.File(file_path, mode='r') as h5f:
            dataset = h5f[node_path]
            g = GenotypeCArray.from_hdf5(dataset, condition=condition)
            aeq(expect, g)

    finally:
        # best-effort cleanup: a failure to delete the scratch file must
        # not mask the outcome of the assertions above
        try:
            os.remove(file_path)
        except OSError:
            pass
def test_slice_types(self):
    # Slicing whole rows/columns keeps the GenotypeArray wrapper, while
    # indexing that drops a dimension falls back to plain numpy types.
    g = GenotypeArray(diploid_genotype_data, dtype='i1')

    # row slice, col slice
    for key in (np.s_[1:], np.s_[:, 1:]):
        s = g[key]
        assert_is_instance(s, GenotypeArray)

    # row index, col index, ploidy index
    for key in (np.s_[0], np.s_[:, 0], np.s_[:, :, 0]):
        s = g[key]
        assert_is_instance(s, np.ndarray)
        assert_not_is_instance(s, GenotypeArray)

    # item
    item = g[0, 0, 0]
    assert_is_instance(item, np.int8)
    assert_not_is_instance(item, GenotypeArray)
def heterozygosity_observed(g, fill=np.nan):
    """Compute the observed heterozygosity rate for each variant.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Value reported for variants where every call is missing.

    Returns
    -------
    ho : ndarray, float, shape (n_variants,)
        Observed heterozygosity

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.heterozygosity_observed(g)
    array([ 0.        ,  0.33333333,  0.        ,  0.5       ])

    """

    # wrap the input unless it already exposes the counting methods
    if not (hasattr(g, 'count_het') and hasattr(g, 'count_called')):
        g = GenotypeArray(g, copy=False)

    # per-variant counts of heterozygous calls and of non-missing calls
    het_counts = np.asarray(g.count_het(axis=1))
    call_counts = np.asarray(g.count_called(axis=1))

    # the ratio is evaluated for every variant, so silence the warnings
    # raised where nothing was called; those variants take `fill`
    with ignore_invalid():
        return np.where(call_counts > 0, het_counts / call_counts, fill)
def phase_by_transmission(g, window_size, copy=True):
    """Phase the genotypes of a trio or cross using Mendelian transmission,
    wherever that is possible.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.
    window_size : int
        Number of previous heterozygous sites to include when phasing each
        parent. A number somewhere between 10 and 100 may be appropriate,
        depending on levels of heterozygosity and quality of data.
    copy : bool, optional
        If False, attempt to phase genotypes in-place. Note that this is
        only possible if the input array has int8 dtype, otherwise a copy is
        always made regardless of this parameter.

    Returns
    -------
    g : GenotypeArray
        Genotype array with progeny, then parents, phased where possible.
        Its `is_phased` attribute records the flags produced by the progeny
        phasing step.

    """

    # setup: coerce to int8 and wrap
    genotypes = GenotypeArray(np.asarray(g, dtype='i1'), copy=copy)
    genotypes._values = memoryview_safe(genotypes.values)
    check_ploidy(genotypes.ploidy, 2)
    check_min_samples(genotypes.n_samples, 3)

    # phase the progeny
    phased_flags = _opt_phase_progeny_by_transmission(genotypes.values)
    genotypes.is_phased = np.asarray(phased_flags).view(bool)

    # phase the parents; the flags from the progeny pass are handed on
    _opt_phase_parents_by_transmission(genotypes.values, phased_flags,
                                       window_size)

    return genotypes
def test_pairwise_distance_multidim(self):
    # pairwise_distance should accept multi-dimensional per-sample data
    # (allele counts per call) together with a user-supplied metric.
    genotypes = GenotypeArray([[[0, 0], [0, 0]],
                               [[1, 1], [1, 1]],
                               [[1, 1], [2, 2]],
                               [[0, 0], [0, 1]],
                               [[0, 0], [0, 2]],
                               [[1, 1], [1, 2]],
                               [[0, 1], [0, 1]],
                               [[0, 1], [1, 2]],
                               [[0, 0], [-1, -1]],
                               [[0, 1], [-1, -1]],
                               [[-1, -1], [-1, -1]]], dtype='i1')
    gac = genotypes.to_allele_counts()

    def metric(ac1, ac2):
        # total mean pairwise difference between two samples
        return allel.stats.mean_pairwise_difference_between(
            ac1, ac2, fill=0
        ).sum()

    # with two samples there is exactly one pair
    expect = [metric(gac[:, 0], gac[:, 1])]
    actual = allel.stats.pairwise_distance(gac, metric)
    aeq(expect, actual)
def test_constructor(self):
    # Invalid inputs must be rejected with TypeError; valid diploid and
    # polyploid data must round-trip with the requested dtype.

    # missing data arg
    with assert_raises(TypeError):
        # noinspection PyArgumentList
        GenotypeArray()

    rejected = (
        'foo bar',           # wrong dtype (string)
        [4., 5., 3.7],       # wrong dtype (float)
        [1, 2, 3],           # wrong dimensions (1-d)
        [[1, 2], [3, 4]],    # wrong dimensions (2-d; use HaplotypeArray)
    )
    for data in rejected:
        with assert_raises(TypeError):
            GenotypeArray(data)

    # diploid data (typed), then polyploid data (typed)
    for data in (diploid_genotype_data, triploid_genotype_data):
        g = GenotypeArray(data, dtype='i1')
        aeq(data, g)
        eq(np.int8, g.dtype)
def _weir_cockerham_fst(g, subpops, max_allele):
    """Compute the Weir & Cockerham variance components for one block of
    genotypes.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int
        The highest allele index to consider.

    Returns
    -------
    a, b, c : ndarray, float, shape (n_variants, n_alleles)
        Components of variance between populations, between individuals
        within populations, and between gametes within individuals.

    """

    # check inputs (unpacking also requires g to be three-dimensional)
    g = GenotypeArray(g, copy=False)
    n_variants, _, _ = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    n_populations = len(subpops)
    r = n_populations
    debug('r: %r', r)

    # allele counts within each subpopulation, stacked along a trailing
    # populations axis
    ac = np.dstack([
        g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops
    ])
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # number of alleles called within each population
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # number of individuals sampled from each population (two alleles per
    # individual)
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants,)
    debug('n_total: %s, %r', n_total.shape, n_total)

    # average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants,)
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # the term n sub C, incorporating the coefficient of variation in
    # sample sizes
    sum_n_squared = np.sum(n**2, axis=1)
    n_C = (n_total - (sum_n_squared / n_total)) / (r - 1)
    assert n_C.shape == (n_variants,)
    debug('n_C: %s, %r', n_C.shape, n_C)

    # allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # column-shaped views of the per-variant terms, for broadcasting against
    # (n_variants, n_alleles) arrays
    n_bar_col = n_bar[:, np.newaxis]
    n_C_col = n_C[:, np.newaxis]

    # sample variance of allele frequencies over populations
    weighted_sq_dev = (
        n[:, np.newaxis, :] * ((p - p_bar[:, :, np.newaxis]) ** 2)
    )
    s_squared = np.sum(weighted_sq_dev, axis=2) / (n_bar_col * (r - 1))
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = np.column_stack([
        gs.count_het(allele=allele, axis=1) / n_total
        for allele in range(n_alleles)
    ])
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # terms shared by the a and b components
    pq = p_bar * (1 - p_bar)
    s_term = (r - 1) * s_squared / r

    # component of variance between populations
    a = ((n_bar_col / n_C_col) *
         (s_squared -
          ((1 / (n_bar_col - 1)) *
           (pq - s_term - (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar_col / (n_bar_col - 1)) *
         (pq - s_term -
          (((2 * n_bar_col) - 1) * h_bar / (4 * n_bar_col))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c
def phase_progeny_by_transmission(g):
    """Use Mendelian transmission to phase the progeny genotypes of a trio
    or cross.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.

    Returns
    -------
    g : GenotypeArray, int8, shape (n_variants, n_samples, 2)
        Copy of the input with progeny phased where possible; the
        `is_phased` attribute marks the calls that were phased.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([
    ...     [[0, 0], [0, 0], [0, 0]],
    ...     [[1, 1], [1, 1], [1, 1]],
    ...     [[0, 0], [1, 1], [0, 1]],
    ...     [[1, 1], [0, 0], [0, 1]],
    ...     [[0, 0], [0, 1], [0, 0]],
    ...     [[0, 0], [0, 1], [0, 1]],
    ...     [[0, 1], [0, 0], [0, 1]],
    ...     [[0, 1], [0, 1], [0, 1]],
    ...     [[0, 1], [1, 2], [0, 1]],
    ...     [[1, 2], [0, 1], [1, 2]],
    ...     [[0, 1], [2, 3], [0, 2]],
    ...     [[2, 3], [0, 1], [1, 3]],
    ...     [[0, 0], [0, 0], [-1, -1]],
    ...     [[0, 0], [0, 0], [1, 1]],
    ... ], dtype='i1')
    >>> g = allel.phase_progeny_by_transmission(g)
    >>> print(g.to_str(row_threshold=None))
    0/0 0/0 0|0
    1/1 1/1 1|1
    0/0 1/1 0|1
    1/1 0/0 1|0
    0/0 0/1 0|0
    0/0 0/1 0|1
    0/1 0/0 1|0
    0/1 0/1 0/1
    0/1 1/2 0|1
    1/2 0/1 2|1
    0/1 2/3 0|2
    2/3 0/1 3|1
    0/0 0/0 ./.
    0/0 0/0 1/1
    >>> g.is_phased
    array([[False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False, False],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False, False],
           [False, False, False]])

    """

    # setup: always work on an int8 copy of the input
    genotypes = GenotypeArray(g, dtype='i1', copy=True)
    check_ploidy(genotypes.ploidy, 2)
    check_min_samples(genotypes.n_samples, 3)

    # run the phasing
    # N.B., a copy has already been made, so no need to make memoryview safe
    phased_flags = _opt_phase_progeny_by_transmission(genotypes.values)
    genotypes.is_phased = np.asarray(phased_flags).view(bool)

    # outputs
    return genotypes
def paint_transmission(parent_haplotypes, progeny_haplotypes):
    """Classify ("paint") each progeny allele by how it relates to the two
    haplotypes of a single diploid parent.

    Parameters
    ----------
    parent_haplotypes : array_like, int, shape (n_variants, 2)
        Both haplotypes from a single diploid parent.
    progeny_haplotypes : array_like, int, shape (n_variants, n_progeny)
        Haplotypes found in progeny of the given parent, inherited from the
        given parent. I.e., haplotypes from gametes of the given parent.

    Returns
    -------
    painting : ndarray, uint8, shape (n_variants, n_progeny)
        An array of integers coded as follows: 1 = allele inherited from
        first parental haplotype; 2 = allele inherited from second parental
        haplotype; 3 = reference allele, also carried by both parental
        haplotypes; 4 = non-reference allele, also carried by both parental
        haplotypes; 5 = non-parental allele; 6 = either or both parental
        alleles missing; 7 = missing allele; 0 = undetermined.

    Raises
    ------
    ValueError
        If `parent_haplotypes` does not hold exactly two haplotypes.

    Examples
    --------
    >>> import allel
    >>> haplotypes = allel.HaplotypeArray([
    ...     [0, 0, 0, 1, 2, -1],
    ...     [0, 1, 0, 1, 2, -1],
    ...     [1, 0, 0, 1, 2, -1],
    ...     [1, 1, 0, 1, 2, -1],
    ...     [0, 2, 0, 1, 2, -1],
    ...     [0, -1, 0, 1, 2, -1],
    ...     [-1, 1, 0, 1, 2, -1],
    ...     [-1, -1, 0, 1, 2, -1],
    ... ], dtype='i1')
    >>> painting = allel.paint_transmission(haplotypes[:, :2],
    ...                                     haplotypes[:, 2:])
    >>> painting
    array([[3, 5, 5, 7],
           [1, 2, 5, 7],
           [2, 1, 5, 7],
           [5, 4, 5, 7],
           [1, 5, 2, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7]], dtype=uint8)

    """

    # check inputs
    parents = HaplotypeArray(parent_haplotypes)
    progeny = HaplotypeArray(progeny_haplotypes)
    if parents.n_haplotypes != 2:
        raise ValueError('exactly two parental haplotypes should be provided')

    # each parental haplotype as a column, to broadcast across progeny
    hap1 = parents[:, 0, np.newaxis]
    hap2 = parents[:, 1, np.newaxis]

    # missingness; the per-variant parent mask is kept one-dimensional
    # because it is also used below to paint whole rows
    progeny_missing = progeny < 0
    parent_missing_rows = np.any(parents < 0, axis=1)

    # zygosity of the parent, viewed as a single diploid genotype call
    diplotype = GenotypeArray(parents[:, np.newaxis, :])
    parent_hom_ref = diplotype.is_hom_ref()
    parent_het = diplotype.is_het()
    parent_hom_alt = diplotype.is_hom_alt()

    # allele calls where inheritance can be determined
    callable_call = ~progeny_missing & ~parent_missing_rows[:, np.newaxis]

    # which parental haplotype each progeny allele matches
    matches_hap1 = progeny == hap1
    matches_hap2 = progeny == hap2

    # inheritance states, applied in this order: later entries overwrite
    # earlier ones where masks overlap
    states = [
        (callable_call & parent_het & matches_hap1, INHERIT_PARENT1),
        (callable_call & parent_het & matches_hap2, INHERIT_PARENT2),
        (callable_call & parent_hom_ref & matches_hap1, INHERIT_NONSEG_REF),
        (callable_call & parent_hom_alt & matches_hap1, INHERIT_NONSEG_ALT),
        (callable_call & ~(matches_hap1 | matches_hap2),
         INHERIT_NONPARENTAL),
        (parent_missing_rows, INHERIT_PARENT_MISSING),
        (progeny_missing, INHERIT_MISSING),
    ]

    # record inheritance states
    painting = np.zeros(progeny.shape, dtype='u1')
    for mask, code in states:
        painting[mask] = code

    return painting
def mendel_errors(parent_genotypes, progeny_genotypes):
    """Count, for each progeny genotype call, the errors that make it
    inconsistent with Mendelian transmission of alleles from the parents.

    Parameters
    ----------
    parent_genotypes : array_like, int, shape (n_variants, 2, 2)
        Genotype calls for the two parents.
    progeny_genotypes : array_like, int, shape (n_variants, n_progeny, 2)
        Genotype calls for the progeny.

    Returns
    -------
    me : ndarray, int, shape (n_variants, n_progeny)
        Count of Mendel errors for each progeny genotype call.

    Examples
    --------
    The following are all consistent with Mendelian transmission. Note that a
    value of 0 is returned for missing calls::

        >>> import allel
        >>> import numpy as np
        >>> genotypes = np.array([
        ...     # aa x aa -> aa
        ...     [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [1, 1], [1, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[2, 2], [2, 2], [2, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x ab -> aa or ab
        ...     [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [0, 2], [0, 0], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 1], [1, 1], [0, 1], [-1, -1], [-1, -1]],
        ...     # aa x bb -> ab
        ...     [[0, 0], [1, 1], [0, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [2, 2], [0, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [2, 2], [1, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x bc -> ab or ac
        ...     [[0, 0], [1, 2], [0, 1], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 2], [0, 1], [1, 2], [-1, -1], [-1, -1]],
        ...     # ab x ab -> aa or ab or bb
        ...     [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[1, 2], [1, 2], [1, 1], [1, 2], [2, 2], [-1, -1]],
        ...     [[0, 2], [0, 2], [0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     # ab x bc -> ab or ac or bb or bc
        ...     [[0, 1], [1, 2], [0, 1], [0, 2], [1, 1], [1, 2]],
        ...     [[0, 1], [0, 2], [0, 0], [0, 1], [0, 1], [1, 2]],
        ...     # ab x cd -> ac or ad or bc or bd
        ...     [[0, 1], [2, 3], [0, 2], [0, 3], [1, 2], [1, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0]])

    The following are cases of 'non-parental' inheritance where one or two
    alleles are found in the progeny that are not present in either parent.
    Note that the number of errors may be 1 or 2 depending on the number of
    non-parental alleles::

        >>> genotypes = np.array([
        ...     # aa x aa -> ab or ac or bb or cc
        ...     [[0, 0], [0, 0], [0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[1, 1], [1, 1], [0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[2, 2], [2, 2], [0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # aa x ab -> ac or bc or cc
        ...     [[0, 0], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [0, 1], [1, 2], [0, 2], [2, 2], [2, 2]],
        ...     # aa x bb -> ac or bc or cc
        ...     [[0, 0], [1, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [2, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [2, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x ab -> ac or bc or cc
        ...     [[0, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 2], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 2], [1, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x bc -> ad or bd or cd or dd
        ...     [[0, 1], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 1], [0, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 2], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     # ab x cd -> ae or be or ce or de
        ...     [[0, 1], [2, 3], [0, 4], [1, 4], [2, 4], [3, 4]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 1]])

    The following are cases of 'hemi-parental' inheritance, where progeny
    appear to have inherited two copies of an allele found only once in one
    of the parents::

        >>> genotypes = np.array([
        ...     # aa x ab -> bb
        ...     [[0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     [[1, 1], [0, 1], [0, 0], [-1, -1]],
        ...     # ab x bc -> aa or cc
        ...     [[0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # ab x cd -> aa or bb or cc or dd
        ...     [[0, 1], [2, 3], [0, 0], [1, 1]],
        ...     [[0, 1], [2, 3], [2, 2], [3, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 0],
               [1, 0],
               [1, 0],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    The following are cases of 'uni-parental' inheritance, where progeny
    appear to have inherited both alleles from a single parent::

        >>> genotypes = np.array([
        ...     # aa x bb -> aa or bb
        ...     [[0, 0], [1, 1], [0, 0], [1, 1]],
        ...     [[0, 0], [2, 2], [0, 0], [2, 2]],
        ...     [[1, 1], [2, 2], [1, 1], [2, 2]],
        ...     # aa x bc -> aa or bc
        ...     [[0, 0], [1, 2], [0, 0], [1, 2]],
        ...     [[1, 1], [0, 2], [1, 1], [0, 2]],
        ...     # ab x cd -> ab or cd
        ...     [[0, 1], [2, 3], [0, 1], [2, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    """

    # setup
    parents = GenotypeArray(parent_genotypes)
    progeny = GenotypeArray(progeny_genotypes)
    check_ploidy(parents.ploidy, 2)
    check_ploidy(progeny.ploidy, 2)

    # transform into per-call allele counts over a common allele range
    max_allele = max(parents.max(), progeny.max())
    parent_gc = parents.to_allele_counts(max_allele=max_allele, dtype='i1')
    progeny_gc = progeny.to_allele_counts(max_allele=max_allele, dtype='i1')

    # non-parental and hemi-parental inheritance: each parent can pass on at
    # most one copy of an allele it carries, so any progeny allele count
    # above that ceiling is an error
    transmissible = parent_gc.clip(max=1).sum(axis=1)[:, np.newaxis, :]
    excess = progeny_gc - transmissible
    me = excess.clip(min=0).sum(axis=2)

    # uni-parental inheritance: where the parents share no allele, a progeny
    # call identical to either parent must have come from that parent alone
    p1_gc = parent_gc[:, 0, np.newaxis, :]
    p2_gc = parent_gc[:, 1, np.newaxis, :]
    shared = (p1_gc > 0) & (p2_gc > 0)
    parents_disjoint = ~np.any(shared, axis=2)
    same_as_p1 = np.all(progeny_gc == p1_gc, axis=2)
    same_as_p2 = np.all(progeny_gc == p2_gc, axis=2)
    me[parents_disjoint & (same_as_p1 | same_as_p2)] = 1

    # retrofit where either or both parent has a missing call
    parent_missing = np.any(parents.is_missing(), axis=1)
    me[parent_missing] = 0

    return me
def weir_cockerham_fst(g, subpops, max_allele=None, blen=None):
    """Compute the Weir and Cockerham (1984) variance components from the
    analysis of variance of allele frequencies.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider. Defaults to the largest value
        found in `g`.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Raises
    ------
    ValueError
        If `g` does not have three dimensions.
    NotImplementedError
        If the genotypes are not diploid.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> import numpy as np
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[0.        , 0.        , 0.        ],
               [0.5       , 0.5       , 0.        ],
               [0.        , 0.        , 0.        ],
               [0.125     , 0.25      , 0.125     ],
               [0.16666667, 0.16666667, 0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.36809058868914e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs; anything without array attributes gets wrapped
    if not (hasattr(g, 'shape') and hasattr(g, 'ndim')):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]
    shape = (n_variants, max_allele + 1)
    a, b, c = (np.zeros(shape, dtype='f8') for _ in range(3))
    for start in range(0, n_variants, blen):
        block = slice(start, min(n_variants, start + blen))
        a[block], b[block], c[block] = _weir_cockerham_fst(
            g[block], subpops, max_allele
        )

    return a, b, c
def _weir_cockerham_fst(g, subpops, max_allele):
    """Compute the Weir & Cockerham variance components for one block of
    genotypes.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int
        The highest allele index to consider.

    Returns
    -------
    a, b, c : ndarray, float, shape (n_variants, n_alleles)
        Components of variance between populations, between individuals
        within populations, and between gametes within individuals.

    """

    # check inputs (unpacking also requires g to be three-dimensional)
    g = GenotypeArray(g, copy=False)
    n_variants, _, _ = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    n_populations = len(subpops)
    r = n_populations
    debug('r: %r', r)

    # allele counts within each subpopulation, stacked along a trailing
    # populations axis
    ac = np.dstack([
        g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops
    ])
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # number of alleles called within each population
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # number of individuals sampled from each population (two alleles per
    # individual)
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants, )
    debug('n_total: %s, %r', n_total.shape, n_total)

    # average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants, )
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # the term n sub C, incorporating the coefficient of variation in
    # sample sizes
    sum_n_squared = np.sum(n**2, axis=1)
    n_C = (n_total - (sum_n_squared / n_total)) / (r - 1)
    assert n_C.shape == (n_variants, )
    debug('n_C: %s, %r', n_C.shape, n_C)

    # allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # column-shaped views of the per-variant terms, for broadcasting against
    # (n_variants, n_alleles) arrays
    n_bar_col = n_bar[:, np.newaxis]
    n_C_col = n_C[:, np.newaxis]

    # sample variance of allele frequencies over populations
    weighted_sq_dev = (
        n[:, np.newaxis, :] * ((p - p_bar[:, :, np.newaxis])**2)
    )
    s_squared = np.sum(weighted_sq_dev, axis=2) / (n_bar_col * (r - 1))
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = np.column_stack([
        gs.count_het(allele=allele, axis=1) / n_total
        for allele in range(n_alleles)
    ])
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # terms shared by the a and b components
    pq = p_bar * (1 - p_bar)
    s_term = (r - 1) * s_squared / r

    # component of variance between populations
    a = ((n_bar_col / n_C_col) *
         (s_squared -
          ((1 / (n_bar_col - 1)) *
           (pq - s_term - (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar_col / (n_bar_col - 1)) *
         (pq - s_term -
          (((2 * n_bar_col) - 1) * h_bar / (4 * n_bar_col))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c
def weir_cockerham_fst(g, subpops, max_allele=None, chunked=False, blen=None):
    """Compute the Weir and Cockerham (1984) variance components from the
    analysis of variance of allele frequencies.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider. Defaults to the largest value
        found in `g`.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Raises
    ------
    ValueError
        If `g` does not have three dimensions.
    NotImplementedError
        If the genotypes are not diploid.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> import numpy as np
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs; anything without array attributes gets wrapped
    if not (hasattr(g, 'shape') and hasattr(g, 'ndim')):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    if not chunked:
        # whole-array computation in one call
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)
        return a, b, c

    # block-wise implementation: fill pre-allocated outputs one block of
    # variants at a time
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]
    shape = (n_variants, max_allele + 1)
    a, b, c = (np.zeros(shape, dtype='f8') for _ in range(3))
    for start in range(0, n_variants, blen):
        block = slice(start, min(n_variants, start + blen))
        a[block], b[block], c[block] = _weir_cockerham_fst(
            g[block], subpops, max_allele
        )

    return a, b, c
def setup_instance(self, data):
    """Return `data` wrapped in a :class:`GenotypeArray`."""
    # NOTE(review): presumably a hook that shared test code calls to build
    # the array under test -- confirm against the enclosing class.
    return GenotypeArray(data)