def weir_cockerham_fst(g, subpops, max_allele=None, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984), working
    through the variants one block at a time.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array. Objects lacking ``shape`` or ``ndim`` attributes are
        wrapped in a ``GenotypeArray`` first.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider. When omitted, ``g.max()`` is
        used.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Raises
    ------
    ValueError
        If `g` does not have exactly three dimensions.
    NotImplementedError
        If the ploidy (size of the last dimension of `g`) is not 2.

    Notes
    -----
    The output arrays are allocated with ``max_allele + 1`` columns.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> import numpy as np
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[0.        , 0.        , 0.        ],
               [0.5       , 0.5       , 0.        ],
               [0.        , 0.        , 0.        ],
               [0.125     , 0.25      , 0.125     ],
               [0.16666667, 0.16666667, 0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.36809058868914e-17

    Note that estimated Fst values may be negative.

    """

    # only wrap inputs that do not already expose the array attributes
    # inspected below
    if not (hasattr(g, 'shape') and hasattr(g, 'ndim')):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # default to the largest allele index found in the data
    if max_allele is None:
        max_allele = g.max()

    # block length is resolved by the helper
    # NOTE(review): presumably picks a default when blen is None - confirm
    blen = get_blen_array(g, blen)

    # one output row per variant, one column per allele index 0..max_allele
    n_variants = g.shape[0]
    out_shape = (n_variants, max_allele + 1)
    a, b, c = (np.zeros(out_shape, dtype='f8') for _ in range(3))

    # process one block of variants at a time so that only a slice of g is
    # handed to the core routine on each pass
    for start in range(0, n_variants, blen):
        block = slice(start, min(n_variants, start + blen))
        a[block], b[block], c[block] = _weir_cockerham_fst(
            g[block], subpops, max_allele
        )

    return a, b, c
def mendel_errors(parent_genotypes, progeny_genotypes):
    """Locate genotype calls not consistent with Mendelian transmission of
    alleles.

    Parameters
    ----------
    parent_genotypes : array_like, int, shape (n_variants, 2, 2)
        Genotype calls for the two parents.
    progeny_genotypes : array_like, int, shape (n_variants, n_progeny, 2)
        Genotype calls for the progeny.

    Returns
    -------
    me : ndarray, int, shape (n_variants, n_progeny)
        Count of Mendel errors for each progeny genotype call.

    Examples
    --------
    The following are all consistent with Mendelian transmission. Note that a
    value of 0 is returned for missing calls::

        >>> import allel
        >>> import numpy as np
        >>> genotypes = np.array([
        ...     # aa x aa -> aa
        ...     [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [1, 1], [1, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[2, 2], [2, 2], [2, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x ab -> aa or ab
        ...     [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [0, 2], [0, 0], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 1], [1, 1], [0, 1], [-1, -1], [-1, -1]],
        ...     # aa x bb -> ab
        ...     [[0, 0], [1, 1], [0, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [2, 2], [0, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [2, 2], [1, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x bc -> ab or ac
        ...     [[0, 0], [1, 2], [0, 1], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 2], [0, 1], [1, 2], [-1, -1], [-1, -1]],
        ...     # ab x ab -> aa or ab or bb
        ...     [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[1, 2], [1, 2], [1, 1], [1, 2], [2, 2], [-1, -1]],
        ...     [[0, 2], [0, 2], [0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     # ab x bc -> ab or ac or bb or bc
        ...     [[0, 1], [1, 2], [0, 1], [0, 2], [1, 1], [1, 2]],
        ...     [[0, 1], [0, 2], [0, 0], [0, 1], [0, 1], [1, 2]],
        ...     # ab x cd -> ac or ad or bc or bd
        ...     [[0, 1], [2, 3], [0, 2], [0, 3], [1, 2], [1, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0]])

    The following are cases of 'non-parental' inheritance where one or two
    alleles are found in the progeny that are not present in either parent.
    Note that the number of errors may be 1 or 2 depending on the number of
    non-parental alleles::

        >>> genotypes = np.array([
        ...     # aa x aa -> ab or ac or bb or cc
        ...     [[0, 0], [0, 0], [0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[1, 1], [1, 1], [0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[2, 2], [2, 2], [0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # aa x ab -> ac or bc or cc
        ...     [[0, 0], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [0, 1], [1, 2], [0, 2], [2, 2], [2, 2]],
        ...     # aa x bb -> ac or bc or cc
        ...     [[0, 0], [1, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [2, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [2, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x ab -> ac or bc or cc
        ...     [[0, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 2], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 2], [1, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x bc -> ad or bd or cd or dd
        ...     [[0, 1], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 1], [0, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 2], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     # ab x cd -> ae or be or ce or de
        ...     [[0, 1], [2, 3], [0, 4], [1, 4], [2, 4], [3, 4]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 1]])

    The following are cases of 'hemi-parental' inheritance, where progeny
    appear to have inherited two copies of an allele found only once in one
    of the parents::

        >>> genotypes = np.array([
        ...     # aa x ab -> bb
        ...     [[0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     [[1, 1], [0, 1], [0, 0], [-1, -1]],
        ...     # ab x bc -> aa or cc
        ...     [[0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # ab x cd -> aa or bb or cc or dd
        ...     [[0, 1], [2, 3], [0, 0], [1, 1]],
        ...     [[0, 1], [2, 3], [2, 2], [3, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 0],
               [1, 0],
               [1, 0],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    The following are cases of 'uni-parental' inheritance, where progeny
    appear to have inherited both alleles from a single parent::

        >>> genotypes = np.array([
        ...     # aa x bb -> aa or bb
        ...     [[0, 0], [1, 1], [0, 0], [1, 1]],
        ...     [[0, 0], [2, 2], [0, 0], [2, 2]],
        ...     [[1, 1], [2, 2], [1, 1], [2, 2]],
        ...     # aa x bc -> aa or bc
        ...     [[0, 0], [1, 2], [0, 0], [1, 2]],
        ...     [[1, 1], [0, 2], [1, 1], [0, 2]],
        ...     # ab x cd -> ab or cd
        ...     [[0, 1], [2, 3], [0, 1], [2, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    """

    # coerce both inputs to genotype arrays, then insist on diploid calls
    parents = GenotypeArray(parent_genotypes)
    progeny = GenotypeArray(progeny_genotypes)
    check_ploidy(parents.ploidy, 2)
    check_ploidy(progeny.ploidy, 2)

    # express every call as per-allele counts, using one allele range shared
    # by parents and progeny so the count arrays line up on the last axis
    highest_allele = max(parents.max(), progeny.max())
    parent_counts = parents.to_allele_counts(max_allele=highest_allele,
                                             dtype='i1')
    progeny_counts = progeny.to_allele_counts(max_allele=highest_allele,
                                              dtype='i1')

    # non-parental and hemi-parental inheritance: cap each parent's count of
    # an allele at one, add the two parents together, and count how far each
    # progeny call exceeds that per-allele ceiling
    ceiling = parent_counts.clip(max=1).sum(axis=1)[:, np.newaxis, :]
    me = (progeny_counts - ceiling).clip(min=0).sum(axis=2)

    # uni-parental inheritance: only relevant where the two parents have no
    # allele in common; there, a progeny call whose allele counts equal those
    # of either parent is scored as a single error
    first_parent = parent_counts[:, 0, np.newaxis, :]
    second_parent = parent_counts[:, 1, np.newaxis, :]
    shared = (first_parent > 0) & (second_parent > 0)
    parents_disjoint = ~np.any(shared, axis=2)
    same_as_first = np.all(progeny_counts == first_parent, axis=2)
    same_as_second = np.all(progeny_counts == second_parent, axis=2)
    uniparental = parents_disjoint & (same_as_first | same_as_second)
    me[uniparental] = 1

    # nothing can be concluded for variants where either parent is missing,
    # so those rows are reset to zero
    parent_missing = np.any(parents.is_missing(), axis=1)
    me[parent_missing] = 0

    return me
def weir_cockerham_fst(g, subpops, max_allele=None, chunked=False, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array. Objects lacking ``shape`` or ``ndim`` attributes are
        wrapped in a ``GenotypeArray`` first.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider. When omitted, ``g.max()`` is
        used.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory.
    blen : int, optional
        Block length to use for chunked implementation. Only consulted when
        `chunked` is True.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Raises
    ------
    ValueError
        If `g` does not have exactly three dimensions.
    NotImplementedError
        If the ploidy (size of the last dimension of `g`) is not 2.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> import numpy as np
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """

    # only wrap inputs that do not already expose the array attributes
    # inspected below
    if not (hasattr(g, 'shape') and hasattr(g, 'ndim')):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # default to the largest allele index found in the data
    if max_allele is None:
        max_allele = g.max()

    # simple path: hand the whole array to the core routine in one call
    if not chunked:
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)
        return a, b, c

    # block-wise path: preallocate the outputs (one column per allele index
    # 0..max_allele) and fill them one block of variants at a time
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]
    out_shape = (n_variants, max_allele + 1)
    a = np.zeros(out_shape, dtype='f8')
    b = np.zeros(out_shape, dtype='f8')
    c = np.zeros(out_shape, dtype='f8')

    for start in range(0, n_variants, blen):
        stop = min(n_variants, start + blen)
        block_a, block_b, block_c = _weir_cockerham_fst(
            g[start:stop], subpops, max_allele
        )
        for target, values in ((a, block_a), (b, block_b), (c, block_c)):
            target[start:stop] = values

    return a, b, c