def locate_unlinked(gn, size=100, step=20, threshold=.1, chunked=False,
                    blen=None):
    """Flag variants that are in approximate linkage equilibrium, i.e.,
    variants for which r**2 stays below the given `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Window size (number of variants).
    step : int
        Number of variants to advance to the next window.
    threshold : float
        Maximum value of r**2 to include variants.
    chunked : bool, optional
        Accepted for signature compatibility; this implementation never
        reads it and always works block by block.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array where True items locate variants in approximate
        linkage equilibrium.

    Raises
    ------
    ValueError
        If `gn` does not have exactly two dimensions.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """
    from allel.opt.stats import gn_locate_unlinked_int8

    # only coerce inputs that do not already look like an array; objects
    # exposing both `shape` and `dtype` are used as given
    looks_like_array = hasattr(gn, 'shape') and hasattr(gn, 'dtype')
    if not looks_like_array:
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    n_variants = gn.shape[0]

    # one flag per variant, all initially set; the compiled routine is
    # handed views of this array (its return value is not used)
    flags = np.ones(n_variants, dtype='u1')

    # work block by block to avoid loading big arrays into memory, but never
    # let a block shrink below ten windows
    block_len = max(get_blen_array(gn, blen), 10 * size)

    for start in range(0, n_variants, block_len):
        # N.B., extend each block by one window so it overlaps the next one
        stop = min(n_variants, start + block_len + size)
        block = np.asarray(gn[start:stop], dtype='i1')
        gn_locate_unlinked_int8(block, flags[start:stop], size, step,
                                threshold)

    return flags.astype('b1')
def locate_unlinked(gn, size=100, step=20, threshold=.1, blen=None):
    """Flag variants that are in approximate linkage equilibrium, i.e.,
    variants for which r**2 stays below the given `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Window size (number of variants).
    step : int
        Number of variants to advance to the next window.
    threshold : float
        Maximum value of r**2 to include variants.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array where True items locate variants in approximate
        linkage equilibrium.

    Raises
    ------
    ValueError
        If `gn` does not have exactly two dimensions.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """
    # only coerce inputs that do not already look like an array; objects
    # exposing both `shape` and `dtype` are used as given
    looks_like_array = hasattr(gn, 'shape') and hasattr(gn, 'dtype')
    if not looks_like_array:
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    n_variants = gn.shape[0]

    # one flag per variant, all initially set; the compiled routine is
    # handed views of this array (its return value is not used)
    flags = np.ones(n_variants, dtype='u1')

    # work block by block to avoid loading big arrays into memory, but never
    # let a block shrink below ten windows
    block_len = max(get_blen_array(gn, blen), 10 * size)

    for start in range(0, n_variants, block_len):
        # N.B., extend each block by one window so it overlaps the next one
        stop = min(n_variants, start + block_len + size)
        block = memoryview_safe(np.asarray(gn[start:stop], dtype='i1'))
        gn_locate_unlinked_int8(block, flags[start:stop], size, step,
                                threshold)

    return flags.astype('b1')
def pairwise_distance(x, metric, chunked=False, blen=None):
    """Compute the distance between every pair of individuals (e.g.,
    samples or haplotypes).

    Parameters
    ----------
    x : array_like, shape (n, m, ...)
        Array of m observations (e.g., samples or haplotypes) in a space
        with n dimensions (e.g., variants). Note that the order of the first
        two dimensions is **swapped** compared to what is expected by
        scipy.spatial.distance.pdist.
    metric : string or function
        Distance metric. See documentation for the function
        :func:`scipy.spatial.distance.pdist` for a list of built-in
        distance metrics.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory. This means that a distance matrix will be
        calculated for each block of the input array, and the results will
        be summed to produce the final output. For some distance metrics
        this will return a different result from the standard
        implementation.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    dist : ndarray, shape (m * (m - 1) / 2,)
        Distance matrix in condensed form.

    Raises
    ------
    ValueError
        If `x` has fewer than two dimensions.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 1], [1, 1], [1, 2]],
    ...                          [[0, 2], [2, 2], [-1, -1]]])
    >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock')
    >>> d
    array([ 3.,  4.,  3.])
    >>> import scipy.spatial
    >>> scipy.spatial.distance.squareform(d)
    array([[ 0.,  3.,  4.],
           [ 3.,  0.,  3.],
           [ 4.,  3.,  0.]])

    """
    import scipy.spatial

    # check inputs
    if not hasattr(x, "ndim"):
        x = np.asarray(x)
    if x.ndim < 2:
        raise ValueError("array with at least 2 dimensions expected")

    # plain 2-D input goes to scipy; anything with more dimensions needs
    # our own pdist, which handles multidimensional observations
    delegate_to_scipy = x.ndim == 2

    def block_distance(block):
        if delegate_to_scipy:
            # scipy expects (m, n) for m observations in an n-dimensional
            # space, hence the transpose
            return scipy.spatial.distance.pdist(block.T, metric=metric)
        return pdist(block, metric=metric)

    if not chunked:
        # standard implementation, whole array in one go
        return block_distance(x)

    # block-wise implementation: the first block's result becomes the
    # accumulator and every later block is added onto it in place
    blen = get_blen_array(x, blen)
    n_rows = x.shape[0]
    dist = None
    for start in range(0, n_rows, blen):
        stop = min(n_rows, start + blen)
        partial = block_distance(x[start:stop])
        if dist is None:
            dist = partial
        else:
            dist += partial
    return dist
def weir_cockerham_fst(g, subpops, max_allele=None, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider. If not given, the maximum
        value found in `g` is used.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Raises
    ------
    ValueError
        If `g` does not have exactly three dimensions.
    NotImplementedError
        If the ploidy (length of the third dimension) is not 2.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[0.        , 0.        , 0.        ],
               [0.5       , 0.5       , 0.        ],
               [0.        , 0.        , 0.        ],
               [0.125     , 0.25      , 0.125     ],
               [0.16666667, 0.16666667, 0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant and each
    allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.36809058868914e-17

    Note that estimated Fst values may be negative.

    """
    # check inputs; anything lacking `shape` or `ndim` gets wrapped
    if not (hasattr(g, 'shape') and hasattr(g, 'ndim')):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]

    # one output row per variant, one column per allele index 0..max_allele
    out_shape = (n_variants, max_allele + 1)
    a, b, c = (np.zeros(out_shape, dtype='f8') for _ in range(3))

    for start in range(0, n_variants, blen):
        rows = slice(start, min(n_variants, start + blen))
        a[rows], b[rows], c[rows] = _weir_cockerham_fst(g[rows], subpops,
                                                        max_allele)

    return a, b, c
def pairwise_distance(x, metric, chunked=False, blen=None):
    """Compute the distance between every pair of individuals (e.g.,
    samples or haplotypes).

    Parameters
    ----------
    x : array_like, shape (n, m, ...)
        Array of m observations (e.g., samples or haplotypes) in a space
        with n dimensions (e.g., variants). Note that the order of the first
        two dimensions is **swapped** compared to what is expected by
        scipy.spatial.distance.pdist.
    metric : string or function
        Distance metric. See documentation for the function
        :func:`scipy.spatial.distance.pdist` for a list of built-in
        distance metrics.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory. This means that a distance matrix will be
        calculated for each block of the input array, and the results will
        be summed to produce the final output. For some distance metrics
        this will return a different result from the standard
        implementation.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    dist : ndarray, shape (m * (m - 1) / 2,)
        Distance matrix in condensed form.

    Raises
    ------
    ValueError
        If `x` has fewer than two dimensions.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 1], [1, 1], [1, 2]],
    ...                          [[0, 2], [2, 2], [-1, -1]]])
    >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock')
    >>> d
    array([ 3.,  4.,  3.])
    >>> import scipy.spatial
    >>> scipy.spatial.distance.squareform(d)
    array([[ 0.,  3.,  4.],
           [ 3.,  0.,  3.],
           [ 4.,  3.,  0.]])

    """
    import scipy.spatial

    # check inputs
    if not hasattr(x, 'ndim'):
        x = np.asarray(x)
    if x.ndim < 2:
        raise ValueError('array with at least 2 dimensions expected')

    # plain 2-D input goes to scipy; anything with more dimensions needs
    # our own pdist, which handles multidimensional observations
    delegate_to_scipy = x.ndim == 2

    def block_distance(block):
        if delegate_to_scipy:
            # scipy expects (m, n) for m observations in an n-dimensional
            # space, hence the transpose
            return scipy.spatial.distance.pdist(block.T, metric=metric)
        return pdist(block, metric=metric)

    if not chunked:
        # standard implementation, whole array in one go
        return block_distance(x)

    # block-wise implementation: the first block's result becomes the
    # accumulator and every later block is added onto it in place
    blen = get_blen_array(x, blen)
    n_rows = x.shape[0]
    dist = None
    for start in range(0, n_rows, blen):
        stop = min(n_rows, start + blen)
        partial = block_distance(x[start:stop])
        if dist is None:
            dist = partial
        else:
            dist += partial
    return dist
def weir_cockerham_fst(g, subpops, max_allele=None, chunked=False, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider. If not given, the maximum
        value found in `g` is used.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Raises
    ------
    ValueError
        If `g` does not have exactly three dimensions.
    NotImplementedError
        If the ploidy (length of the third dimension) is not 2.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant and each
    allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """
    # check inputs; anything lacking `shape` or `ndim` gets wrapped
    if not (hasattr(g, 'shape') and hasattr(g, 'ndim')):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    if not chunked:
        # standard implementation, whole array in one go
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)
        return a, b, c

    # block-wise implementation: preallocate the outputs and fill them one
    # block of variants at a time
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]

    # one output row per variant, one column per allele index 0..max_allele
    out_shape = (n_variants, max_allele + 1)
    a, b, c = (np.zeros(out_shape, dtype='f8') for _ in range(3))

    for start in range(0, n_variants, blen):
        rows = slice(start, min(n_variants, start + blen))
        a[rows], b[rows], c[rows] = _weir_cockerham_fst(g[rows], subpops,
                                                        max_allele)

    return a, b, c