def rogers_huff_r_between(gna, gnb, fill=np.nan):
    """Estimate the linkage disequilibrium parameter *r* for every pairing of
    a variant from `gna` with a variant from `gnb`, using the method of
    Rogers and Huff (2008).

    Parameters
    ----------
    gna, gnb : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    fill : float, optional
        Forwarded unchanged to the compiled correlation routine
        (presumably the value used where *r* cannot be computed -- confirm
        against ``allel.opt.stats``).

    Returns
    -------
    r : ndarray, float, shape (m_variants, n_variants)
        Matrix in rectangular form. When the result holds exactly one
        element, that element is returned instead of the matrix.

    """

    # check inputs (2 dimensions, int8 requested from asarray_ndim)
    gna = asarray_ndim(gna, 2, dtype='i1')
    gnb = asarray_ndim(gnb, 2, dtype='i1')

    # the compiled extension is imported lazily, only when actually needed
    from allel.opt.stats import gn_pairwise2_corrcoef_int8
    r = gn_pairwise2_corrcoef_int8(gna, gnb, fill)

    # convenience for singletons: unwrap a 1x1 result
    return r[0, 0] if r.size == 1 else r
def joint_sfs(dac1, dac2):
    """Compute the joint site frequency spectrum between two populations.

    Parameters
    ----------
    dac1 : array_like, int, shape (n_variants,)
        Derived allele counts for the first population.
    dac2 : array_like, int, shape (n_variants,)
        Derived allele counts for the second population.

    Returns
    -------
    joint_sfs : ndarray, int, shape (m_chromosomes, n_chromosomes)
        Array where the (i, j)th element is the number of variant sites with i
        derived alleles in the first population and j derived alleles in the
        second population. The shape is ``(max(dac1) + 1, max(dac2) + 1)``.

    Raises
    ------
    TypeError
        If the counts cannot be safely cast to the platform integer type
        (e.g. floating point input).
    ValueError
        If the inputs are empty or contain negative flat indices.

    """

    # check inputs
    dac1 = asarray_ndim(dac1, 1)
    dac2 = asarray_ndim(dac2, 1)

    # Work in the platform integer type before building the flat index.
    # np.bincount needs that type anyway, and multiplying narrow dtypes
    # (e.g. int8 counts) would silently wrap around. casting='safe' keeps
    # rejecting the same inputs np.bincount itself rejects.
    dac1 = np.asarray(dac1).astype(np.intp, casting='safe', copy=False)
    dac2 = np.asarray(dac2).astype(np.intp, casting='safe', copy=False)

    # compute site frequency spectrum
    n = np.max(dac1) + 1
    m = np.max(dac2) + 1

    # minlength guarantees exactly n * m bins, so a plain reshape suffices and
    # the in-place resize (which can fail on reference checks) is not needed
    s = np.bincount(dac1 * m + dac2, minlength=n * m)
    return s.reshape(n, m)
def joint_sfs_folded(ac1, ac2):
    """Compute the joint folded site frequency spectrum between two
    populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, 2)
        Allele counts for the first population.
    ac2 : array_like, int, shape (n_variants, 2)
        Allele counts for the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (m_chromosomes//2, n_chromosomes//2)
        Array where the (i, j)th element is the number of variant sites with a
        minor allele count of i in the first population and j in the second
        population. The shape is ``(max(mac1) + 1, max(mac2) + 1)`` where
        ``mac`` is the per-variant minimum of the two allele counts.

    Raises
    ------
    AssertionError
        If either input does not have exactly two columns.
    TypeError
        If the counts cannot be safely cast to the platform integer type.

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    assert ac1.shape[1] == ac2.shape[1] == 2, "only biallelic variants are supported"

    # compute minor allele counts, promoted to the platform integer so the
    # flat index below cannot overflow a narrow input dtype such as int8
    mac1 = np.amin(ac1, axis=1).astype(np.intp, casting='safe', copy=False)
    mac2 = np.amin(ac2, axis=1).astype(np.intp, casting='safe', copy=False)

    # compute site frequency spectrum
    m = np.max(mac1) + 1
    n = np.max(mac2) + 1

    # minlength yields exactly m * n bins, so reshape replaces the fragile
    # in-place resize
    s = np.bincount(mac1 * n + mac2, minlength=m * n)
    return s.reshape(m, n)
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum as an image.

    Rows of `s` run down the y axis (population 1) and columns run along the
    x axis (population 2); the x axis ticks and label are placed on top.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes_pop1, n_chromosomes_pop2)
        Joint site frequency spectrum.
    ax : axes, optional
        Axes on which to draw. If not provided, a new square figure is
        created using the width from the matplotlib rc figure size.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ax.imshow(). Defaults
        are filled in for 'cmap', 'interpolation', 'aspect' and 'norm' when
        absent. The object passed in is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams['figure.figsize'][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that the caller's dict is
    # not polluted with our defaults (and can be safely reused)
    imshow_kwargs = dict(imshow_kwargs) if imshow_kwargs is not None else dict()
    imshow_kwargs.setdefault('cmap', 'jet')
    imshow_kwargs.setdefault('interpolation', 'none')
    imshow_kwargs.setdefault('aspect', 'auto')
    imshow_kwargs.setdefault('norm', LogNorm())

    # plot data
    ax.imshow(s, **imshow_kwargs)

    # tidy
    ax.xaxis.tick_top()
    ax.set_ylabel('derived allele count (population 1)')
    ax.set_xlabel('derived allele count (population 2)')
    ax.xaxis.set_label_position('top')

    return ax
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum as an image.

    Rows of `s` run down the y axis (population 1) and columns run along the
    x axis (population 2); the x axis ticks and label are placed on top.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes_pop1, n_chromosomes_pop2)
        Joint site frequency spectrum.
    ax : axes, optional
        Axes on which to draw. If not provided, a new square figure is
        created using the width from the matplotlib rc figure size.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ax.imshow(). Defaults
        are filled in for "cmap", "interpolation", "aspect" and "norm" when
        absent. The object passed in is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams["figure.figsize"][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that the caller's dict is
    # not polluted with our defaults (and can be safely reused)
    imshow_kwargs = dict(imshow_kwargs) if imshow_kwargs is not None else dict()
    imshow_kwargs.setdefault("cmap", "jet")
    imshow_kwargs.setdefault("interpolation", "none")
    imshow_kwargs.setdefault("aspect", "auto")
    imshow_kwargs.setdefault("norm", LogNorm())

    # plot data
    ax.imshow(s, **imshow_kwargs)

    # tidy
    ax.xaxis.tick_top()
    ax.set_ylabel("derived allele count (population 1)")
    ax.set_xlabel("derived allele count (population 2)")
    ax.xaxis.set_label_position("top")

    return ax
def sfs_folded(ac):
    """Compute the folded site frequency spectrum from reference and
    alternate allele counts at a set of biallelic variants.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, 2)
        Allele counts array.

    Returns
    -------
    sfs_folded : ndarray, int, shape (n_chromosomes//2,)
        Array where the kth element is the number of variant sites with a
        minor allele count of k.

    """

    # check input
    ac = asarray_ndim(ac, 2)
    assert ac.shape[1] == 2, 'only biallelic variants are supported'

    # the minor allele count is the smaller of the two counts in each row
    minor_counts = np.min(ac, axis=1)

    # tally how many variants fall on each minor allele count
    return np.bincount(minor_counts)
def h_hat(ac):
    """Unbiased estimator for h, where 2*h is the heterozygosity of the
    population.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, 2)
        Allele counts array for a single population.

    Returns
    -------
    h_hat : ndarray, float, shape (n_variants,)
        For each variant, ``n0 * n1 / (an * (an - 1))`` where ``n0`` and
        ``n1`` are the two allele counts and ``an`` is their sum.

    Notes
    -----
    Used in Patterson (2012) for calculation of various statistics.

    """

    # check inputs
    ac = asarray_ndim(ac, 2)
    assert ac.shape[1] == 2, 'only biallelic variants supported'

    # total number of alleles observed per variant
    an = ac.sum(axis=1)

    # product of the two allele counts over the number of ordered pairs
    numerator = ac[:, 0] * ac[:, 1]
    denominator = an * (an - 1)
    return numerator / denominator
def fold_sfs(s, n):
    """Fold a site frequency spectrum.

    The spectrum is zero-padded to ``n + 1`` entries if shorter, then the
    first ``(n + 1) // 2`` entries are added to the following
    ``(n + 1) // 2`` entries taken in reverse order.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes,)
        Site frequency spectrum
    n : int
        Total number of chromosomes called.

    Returns
    -------
    sfs_folded : ndarray, int
        Folded site frequency spectrum, with ``(n + 1) // 2`` entries.

    """

    # check inputs
    s = asarray_ndim(s, 1)
    n_entries = s.shape[0]
    assert n_entries <= n + 1, 'invalid number of chromosomes'

    # make sure the spectrum has an entry for every count up to n
    if n_entries < n + 1:
        padded = np.zeros(n + 1, dtype=s.dtype)
        padded[:n_entries] = s
        s = padded

    # fold: pair the lower half with the mirrored upper half
    half = (n + 1) // 2
    lower = s[:half]
    upper = s[half:half * 2]
    return lower + upper[::-1]
def fit(self, gn):
    """Record the mean of each row of `gn` for later centering.

    Parameters
    ----------
    gn : array_like, shape (n_variants, n_samples)
        2-dimensional input array (shape naming follows the convention used
        elsewhere in this file -- confirm against callers).

    Returns
    -------
    self
        The fitted object, with ``mean_`` set to the row means as a column
        (dimensions kept).

    """

    # check input
    gn = asarray_ndim(gn, 2)

    # mean along axis 1, kept 2-dimensional so it broadcasts against gn
    row_means = np.mean(gn, axis=1, keepdims=True)
    self.mean_ = row_means

    return self
def rogers_huff_r(gn, fill=np.nan):
    """Estimate the linkage disequilibrium parameter *r* for each pair of
    variants using the method of Rogers and Huff (2008).

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    fill : float, optional
        Forwarded unchanged to the compiled correlation routine
        (presumably the value used where *r* cannot be computed -- confirm
        against ``allel.opt.stats``).

    Returns
    -------
    r : ndarray, float, shape (n_variants * (n_variants - 1) // 2,)
        Matrix in condensed form. When the result holds exactly one element,
        that element is returned instead of the array.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [1, 1], [0, 0]],
    ...                          [[0, 0], [1, 1], [0, 0]],
    ...                          [[1, 1], [0, 0], [1, 1]],
    ...                          [[0, 0], [0, 1], [-1, -1]]], dtype='i1')
    >>> gn = g.to_n_alt(fill=-1)
    >>> gn
    array([[ 0,  2,  0],
           [ 0,  2,  0],
           [ 2,  0,  2],
           [ 0,  1, -1]], dtype=int8)
    >>> r = allel.stats.rogers_huff_r(gn)
    >>> r
    array([ 1.        , -1.00000012,  1.        , -1.00000012,  1.        , -1.        ], dtype=float32)
    >>> r ** 2
    array([ 1.        ,  1.00000024,  1.        ,  1.00000024,  1.        ,  1.        ], dtype=float32)
    >>> from scipy.spatial.distance import squareform
    >>> squareform(r ** 2)
    array([[ 0.        ,  1.        ,  1.00000024,  1.        ],
           [ 1.        ,  0.        ,  1.00000024,  1.        ],
           [ 1.00000024,  1.00000024,  0.        ,  1.        ],
           [ 1.        ,  1.        ,  1.        ,  0.        ]])

    """  # flake8: noqa

    # check inputs (2 dimensions, int8 requested from asarray_ndim)
    gn = asarray_ndim(gn, 2, dtype='i1')

    # the compiled extension is imported lazily, only when actually needed
    from allel.opt.stats import gn_pairwise_corrcoef_int8
    r = gn_pairwise_corrcoef_int8(gn, fill)

    # convenience for singletons: unwrap a one-element result
    return r[0] if r.size == 1 else r
def plot_sfs(s, yscale='log', bins=None, n=None, clip_endpoints=True,
             label=None, plot_kwargs=None, ax=None):
    """Plot a site frequency spectrum.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes,)
        Site frequency spectrum.
    yscale : string, optional
        Y axis scale, passed to ax.set_yscale().
    bins : int or array_like, int, optional
        If given, the spectrum is summed within bins (passed through to
        scipy.stats.binned_statistic) and plotted at the bin midpoints.
    n : int, optional
        If given (and non-zero), x values are divided by `n` and the x axis
        is labelled as allele frequency rather than allele count.
    clip_endpoints : bool, optional
        If True, drop the first and last entries of `s` before plotting.
    label : string, optional
        Label for the data series, passed to ax.plot().
    plot_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ax.plot().
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure will be created.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    # import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is available as an attribute
    import scipy.stats

    # check inputs
    s = asarray_ndim(s, 1)

    # setup axes
    if ax is None:
        _, ax = plt.subplots()

    # setup data: allele counts on x, number of sites on y
    if clip_endpoints:
        x = np.arange(1, s.shape[0] - 1)
        y = s[1:-1]
    else:
        x = np.arange(s.shape[0])
        y = s

    if bins is not None:
        # sum the spectrum within each bin
        y, b, _ = scipy.stats.binned_statistic(
            x, values=y, bins=bins, statistic='sum')
        # use bin midpoints for plotting
        x = (b[:-1] + b[1:]) / 2

    if n:
        # convert allele counts to allele frequencies
        x = x / n
        ax.set_xlabel('derived allele frequency')
    else:
        ax.set_xlabel('derived allele count')

    # do plotting
    if plot_kwargs is None:
        plot_kwargs = dict()
    ax.plot(x, y, label=label, **plot_kwargs)

    # tidy
    ax.set_yscale(yscale)
    ax.set_ylabel('site frequency')
    ax.autoscale(axis='x', tight=True)

    return ax
def fit(self, gn):
    """Record the mean and standard deviation of each row of `gn`.

    Parameters
    ----------
    gn : array_like, shape (n_variants, n_samples)
        2-dimensional input array (shape naming follows the convention used
        elsewhere in this file -- confirm against callers).

    Returns
    -------
    self
        The fitted object, with ``mean_`` and ``std_`` set to the row-wise
        mean and standard deviation as columns (dimensions kept).

    """

    # check input
    gn = asarray_ndim(gn, 2)

    # both statistics are taken along axis 1 and kept 2-dimensional so they
    # broadcast against the input
    reduce_opts = dict(axis=1, keepdims=True)

    # find mean
    self.mean_ = np.mean(gn, **reduce_opts)

    # find scaling factor
    self.std_ = np.std(gn, **reduce_opts)

    return self
def transform(self, gn, copy=None):
    """Center `gn` by subtracting the fitted row means.

    Parameters
    ----------
    gn : array_like, shape (n_variants, n_samples)
        2-dimensional input array.
    copy : bool, optional
        Forwarded to the input check; when None, ``self.copy`` is used.

    Returns
    -------
    gn : ndarray, float
        The centered data. Input that is not already floating point is
        converted to dtype 'f2' before centering.

    """

    # check inputs
    if copy is None:
        copy = self.copy
    gn = asarray_ndim(gn, 2, copy=copy)

    # the in-place subtraction below needs a floating point array
    if gn.dtype.kind != 'f':
        gn = gn.astype('f2')

    # center
    gn -= self.mean_

    return gn
def fit(self, gn):
    """Record the row means of `gn` and a binomial-style scaling factor.

    The scaling factor is ``sqrt(p * (1 - p))`` where ``p`` is the row mean
    divided by ``self.ploidy``.

    Parameters
    ----------
    gn : array_like, shape (n_variants, n_samples)
        2-dimensional input array (shape naming follows the convention used
        elsewhere in this file -- confirm against callers).

    Returns
    -------
    self
        The fitted object, with ``mean_`` and ``std_`` set.

    """

    # check input
    gn = asarray_ndim(gn, 2)

    # find mean
    row_means = np.mean(gn, axis=1, keepdims=True)
    self.mean_ = row_means

    # find scaling factor from the mean expressed as a per-allele frequency
    freq = row_means / self.ploidy
    self.std_ = np.sqrt(freq * (1 - freq))

    return self
def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None):
    """Convenience function to calculate a pairwise distance matrix using
    nucleotide divergence (a.k.a. Dxy) as the distance metric.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions.
    gac : array_like, int, shape (n_variants, n_samples, n_alleles)
        Per-genotype allele counts.
    start : int, optional
        Start position of region to use.
    stop : int, optional
        Stop position of region to use.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------
    dist : ndarray
        Distance matrix in condensed form, one entry per unordered pair of
        samples in the order produced by ``itertools.combinations``.

    See Also
    --------
    allel.model.ndarray.GenotypeArray.to_allele_counts

    """

    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    gac = asarray_ndim(gac, 3)

    # allele number per sample, computed once up front to avoid repeated
    # evaluation for every pair
    gan = np.sum(gac, axis=2)

    n_samples = gac.shape[1]
    sample_pairs = itertools.combinations(range(n_samples), 2)

    # one divergence value per pair of samples
    dist = [
        sequence_divergence(pos, gac[:, i, ...], gac[:, j, ...],
                            an1=gan[:, i], an2=gan[:, j],
                            start=start, stop=stop,
                            is_accessible=is_accessible)
        for i, j in sample_pairs
    ]

    return np.array(dist)
def fold_joint_sfs(s, m, n):
    """Fold a joint site frequency spectrum.

    The spectrum is zero-padded to shape ``(m + 1, n + 1)`` if smaller, then
    four quadrants of size ``((m + 1) // 2, (n + 1) // 2)`` are mirrored onto
    the top left quadrant and summed.

    Parameters
    ----------
    s : array_like, int, shape (m_chromosomes, n_chromosomes)
        Joint site frequency spectrum.
    m : int
        Number of chromosomes called in the first population.
    n : int
        Number of chromosomes called in the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int
        Folded joint site frequency spectrum.

    """

    # check inputs
    s = asarray_ndim(s, 2)
    assert s.shape[0] <= m + 1, "invalid number of chromosomes"
    assert s.shape[1] <= n + 1, "invalid number of chromosomes"

    # pad rows so that s has all entries up to m
    if s.shape[0] < m + 1:
        row_padded = np.zeros((m + 1, s.shape[1]), dtype=s.dtype)
        row_padded[: s.shape[0]] = s
        s = row_padded

    # pad columns so that s has all entries up to n
    if s.shape[1] < n + 1:
        col_padded = np.zeros((s.shape[0], n + 1), dtype=s.dtype)
        col_padded[:, : s.shape[1]] = s
        s = col_padded

    # quadrant size and the exclusive end of the region being folded
    mf = (m + 1) // 2
    nf = (n + 1) // 2
    row_end = mf * 2
    col_end = nf * 2

    # each quadrant is flipped so that it lines up with the top left one
    top_left = s[:mf, :nf]
    bottom_left = s[mf:row_end, :nf][::-1]
    top_right = s[:mf, nf:col_end][:, ::-1]
    bottom_right = s[mf:row_end, nf:col_end][::-1, ::-1]

    return top_left + bottom_left + top_right + bottom_right
def joint_sfs_folded(ac1, ac2):
    """Compute the joint folded site frequency spectrum between two
    populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, 2)
        Allele counts for the first population.
    ac2 : array_like, int, shape (n_variants, 2)
        Allele counts for the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (m_chromosomes//2, n_chromosomes//2)
        Array where the (i, j)th element is the number of variant sites with a
        minor allele count of i in the first population and j in the second
        population. The shape is ``(max(mac1) + 1, max(mac2) + 1)`` where
        ``mac`` is the per-variant minimum of the two allele counts.

    Raises
    ------
    AssertionError
        If either input does not have exactly two columns.
    TypeError
        If the counts cannot be safely cast to the platform integer type.

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    assert ac1.shape[1] == ac2.shape[1] == 2, \
        'only biallelic variants are supported'

    # compute minor allele counts, promoted to the platform integer so the
    # flat index below cannot overflow a narrow input dtype such as int8
    mac1 = np.amin(ac1, axis=1).astype(np.intp, casting='safe', copy=False)
    mac2 = np.amin(ac2, axis=1).astype(np.intp, casting='safe', copy=False)

    # compute site frequency spectrum
    m = np.max(mac1) + 1
    n = np.max(mac2) + 1

    # minlength yields exactly m * n bins, so reshape replaces the fragile
    # in-place resize
    s = np.bincount(mac1 * n + mac2, minlength=m * n)
    return s.reshape(m, n)
def transform(self, gn, copy=None):
    """Center and scale `gn` using the fitted row means and scaling factors.

    Parameters
    ----------
    gn : array_like, shape (n_variants, n_samples)
        2-dimensional input array.
    copy : bool, optional
        Forwarded to the input check; when None, ``self.copy`` is used.

    Returns
    -------
    gn : ndarray, float
        The data after subtracting ``self.mean_`` and dividing by
        ``self.std_``. Input that is not already floating point is converted
        to dtype 'f2' first.

    """

    # check inputs
    if copy is None:
        copy = self.copy
    gn = asarray_ndim(gn, 2, copy=copy)

    # the in-place arithmetic below needs a floating point array
    if gn.dtype.kind != 'f':
        gn = gn.astype('f2')

    # center
    gn -= self.mean_

    # scale
    gn /= self.std_

    return gn
def maxFDA(pos, ac, start=None, stop=None, is_accessible=None):
    """Return the largest per-variant frequency of the allele in column 1.

    For each row of `ac` the value ``ac[i, 1] / (ac[i, 0] + ac[i, 1])`` is
    computed, and the maximum over all rows is returned.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array; only the first two columns are used.
    start : int, optional
        Start position of region to use.
    stop : int, optional
        Stop position of region to use.
    is_accessible : array_like, bool, optional
        Checked to be 1-dimensional (or None) but not otherwise used here.

    Returns
    -------
    float
        The maximum frequency found.

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # deal with subregion
    if not (start is None and stop is None):
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac = ac[loc]
        if start is None:
            start = pos[0]
        if stop is None:
            stop = pos[-1]

    # frequency of the column-1 allele among the first two columns, per row
    dafs = [
        ac[i, 1] / float(ac[i, 1] + ac[i, 0])
        for i in range(len(ac))
    ]

    return max(dafs)
def fold_joint_sfs(s, m, n):
    """Fold a joint site frequency spectrum.

    The spectrum is zero-padded to shape ``(m + 1, n + 1)`` if smaller, then
    four quadrants of size ``((m + 1) // 2, (n + 1) // 2)`` are mirrored onto
    the top left quadrant and summed.

    Parameters
    ----------
    s : array_like, int, shape (m_chromosomes, n_chromosomes)
        Joint site frequency spectrum.
    m : int
        Number of chromosomes called in the first population.
    n : int
        Number of chromosomes called in the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int
        Folded joint site frequency spectrum.

    """

    # check inputs
    s = asarray_ndim(s, 2)
    assert s.shape[0] <= m + 1, 'invalid number of chromosomes'
    assert s.shape[1] <= n + 1, 'invalid number of chromosomes'

    # pad rows so that s has all entries up to m
    if s.shape[0] < m + 1:
        row_padded = np.zeros((m + 1, s.shape[1]), dtype=s.dtype)
        row_padded[:s.shape[0]] = s
        s = row_padded

    # pad columns so that s has all entries up to n
    if s.shape[1] < n + 1:
        col_padded = np.zeros((s.shape[0], n + 1), dtype=s.dtype)
        col_padded[:, :s.shape[1]] = s
        s = col_padded

    # quadrant size and the exclusive end of the region being folded
    mf = (m + 1) // 2
    nf = (n + 1) // 2
    row_end = mf * 2
    col_end = nf * 2

    # each quadrant is flipped so that it lines up with the top left one
    top_left = s[:mf, :nf]
    bottom_left = s[mf:row_end, :nf][::-1]
    top_right = s[:mf, nf:col_end][:, ::-1]
    bottom_right = s[mf:row_end, nf:col_end][::-1, ::-1]

    return top_left + bottom_left + top_right + bottom_right
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum.

    The spectrum is drawn transposed with the y axis inverted, so that
    population 1 runs along the x axis and population 2 up the y axis.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes_pop1, n_chromosomes_pop2)
        Joint site frequency spectrum.
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure will be created.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ax.imshow(). Defaults
        are filled in for 'cmap', 'interpolation', 'aspect' and 'norm' when
        absent. The object passed in is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams['figure.figsize'][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that the caller's dict is
    # not polluted with our defaults (and can be safely reused)
    imshow_kwargs = dict(imshow_kwargs) if imshow_kwargs is not None else dict()
    imshow_kwargs.setdefault('cmap', 'jet')
    imshow_kwargs.setdefault('interpolation', 'none')
    imshow_kwargs.setdefault('aspect', 'auto')
    imshow_kwargs.setdefault('norm', LogNorm())

    # plot data
    ax.imshow(s.T, **imshow_kwargs)

    # tidy
    ax.invert_yaxis()
    ax.set_xlabel('derived allele count (population 1)')
    ax.set_ylabel('derived allele count (population 2)')

    return ax
def heterozygosity_expected(af, ploidy, fill=np.nan):
    """Calculate the expected rate of heterozygosity for each variant under
    Hardy-Weinberg equilibrium.

    Parameters
    ----------
    af : array_like, float, shape (n_variants, n_alleles)
        Allele frequencies array.
    ploidy : int
        Sample ploidy.
    fill : float, optional
        Use this value for variants where allele frequencies do not sum to 1.

    Returns
    -------
    he : ndarray, float, shape (n_variants,)
        Expected heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> af = g.count_alleles().to_frequencies()
    >>> allel.stats.heterozygosity_expected(af, ploidy=2)
    array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])

    """

    # check inputs
    af = asarray_ndim(af, 2)

    # expected heterozygosity is one minus the expected homozygosity, i.e.,
    # the sum over alleles of frequency raised to the ploidy
    homozygosity = np.sum(np.power(af, ploidy), axis=1)
    out = 1 - homozygosity

    # fill values where allele frequencies could not be calculated
    af_sum = np.sum(af, axis=1)
    with ignore_invalid():
        unusable = (af_sum < 1) | np.isnan(af_sum)
        out[unusable] = fill

    return out
def fold_joint_sfs(s, n1, n2):
    """Fold a joint site frequency spectrum.

    The spectrum is zero-padded to shape ``(n1 + 1, n2 + 1)`` if smaller,
    then four quadrants of size ``((n1 + 1) // 2, (n2 + 1) // 2)`` are
    mirrored onto the top left quadrant and summed.

    Parameters
    ----------
    s : array_like, int, shape (m_chromosomes, n_chromosomes)
        Joint site frequency spectrum.
    n1, n2 : int
        The total number of chromosomes called in each population.

    Returns
    -------
    joint_sfs_folded : ndarray, int
        Folded joint site frequency spectrum.

    """

    # check inputs
    s = asarray_ndim(s, 2)
    assert s.shape[0] <= n1 + 1, 'invalid number of chromosomes'
    assert s.shape[1] <= n2 + 1, 'invalid number of chromosomes'

    # pad rows so that s has all entries up to n1
    if s.shape[0] < n1 + 1:
        row_padded = np.zeros((n1 + 1, s.shape[1]), dtype=s.dtype)
        row_padded[:s.shape[0]] = s
        s = row_padded

    # pad columns so that s has all entries up to n2
    if s.shape[1] < n2 + 1:
        col_padded = np.zeros((s.shape[0], n2 + 1), dtype=s.dtype)
        col_padded[:, :s.shape[1]] = s
        s = col_padded

    # quadrant size and the exclusive end of the region being folded
    mf = (n1 + 1) // 2
    nf = (n2 + 1) // 2
    row_end = mf * 2
    col_end = nf * 2

    # each quadrant is flipped so that it lines up with the top left one
    top_left = s[:mf, :nf]
    bottom_left = s[mf:row_end, :nf][::-1]
    top_right = s[:mf, nf:col_end][:, ::-1]
    bottom_right = s[mf:row_end, nf:col_end][::-1, ::-1]

    return top_left + bottom_left + top_right + bottom_right
def heterozygosity_expected(af, ploidy, fill=np.nan):
    """Calculate the expected rate of heterozygosity for each variant under
    Hardy-Weinberg equilibrium.

    Parameters
    ----------
    af : array_like, float, shape (n_variants, n_alleles)
        Allele frequencies array.
    ploidy : int
        Exponent applied to each allele frequency when computing the
        expected homozygosity.
    fill : float, optional
        Use this value for variants where allele frequencies do not sum to 1.

    Returns
    -------
    he : ndarray, float, shape (n_variants,)
        Expected heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> af = g.count_alleles().to_frequencies()
    >>> allel.stats.heterozygosity_expected(af, ploidy=2)
    array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])

    """

    # check inputs
    af = asarray_ndim(af, 2)

    # expected heterozygosity is one minus the expected homozygosity, i.e.,
    # the sum over alleles of frequency raised to the ploidy
    homozygosity = np.sum(np.power(af, ploidy), axis=1)
    out = 1 - homozygosity

    # fill values where allele frequencies could not be calculated
    af_sum = np.sum(af, axis=1)
    with ignore_invalid():
        unusable = (af_sum < 1) | np.isnan(af_sum)
        out[unusable] = fill

    return out
def count_alleles(self, max_allele=None, subpop=None):
    """Count alleles per variant, block by block, and wrap the result.

    Parameters
    ----------
    max_allele : int, optional
        Highest allele index to count. If not given, it is obtained by
        computing the maximum of this array.
    subpop : array_like, int, optional
        Indices along axis 1 selecting the items to include. If not given,
        all are used.

    Returns
    -------
    AlleleCountsDaskArray
        Wrapper around the lazily evaluated counts, which have
        ``max_allele + 1`` columns and dtype 'i4'.

    """

    # if max_allele not specified, count all alleles
    # NOTE: this triggers an eager compute of the array maximum
    if max_allele is None:
        max_allele = self.max().compute()[()]

    # deal with subpop
    subpop = asarray_ndim(subpop, 1, allow_none=True, dtype=np.int64)
    if subpop is not None:
        gd = self.take(subpop, axis=1).values
    else:
        gd = self.values

    # determine output chunks - preserve axis0; change axis1, axis2
    # (each input block along axis 1 contributes a single slot, later summed)
    chunks = (gd.chunks[0], (1, ) * len(gd.chunks[1]), (max_allele + 1, ))

    if self.mask is None:

        # simple case, no mask
        def f(block):
            gb = GenotypeArray(block)
            # insert a length-1 axis so per-block counts can be summed
            return gb.count_alleles(max_allele=max_allele)[:, None, :]

        # map blocks and reduce
        out = da.map_blocks(f, gd, chunks=chunks).sum(axis=1, dtype='i4')

    else:

        # map with mask
        def f(block, bmask):
            g = GenotypeArray(block)
            # drop the trailing axis that was added to the mask below
            g.mask = bmask[:, :, 0]
            return g.count_alleles(max_allele=max_allele)[:, None, :]

        # give the mask a third axis so it is passed blockwise alongside gd
        md = self.mask[:, :, None]
        out = da.map_blocks(f, gd, md, chunks=chunks).sum(axis=1, dtype='i4')

    return AlleleCountsDaskArray(out)
def sfs(dac):
    """Compute the site frequency spectrum given derived allele counts at
    a set of biallelic variants.

    Parameters
    ----------
    dac : array_like, int, shape (n_variants,)
        Array of derived allele counts.

    Returns
    -------
    sfs : ndarray, int, shape (n_chromosomes,)
        Array where the kth element is the number of variant sites with k
        derived alleles.

    """

    # check input
    counts = asarray_ndim(dac, 1)

    # the spectrum is simply a tally of variants per derived allele count
    return np.bincount(counts)
def windowed_df(pos, ac1, ac2, size=None, start=None, stop=None, step=None,
                windows=None, is_accessible=None, fill=np.nan):
    """Calculate the density of fixed differences between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------
    allel.model.locate_fixed_differences

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # boolean flag per variant marking fixed differences
    loc_df = locate_fixed_differences(ac1, ac2)

    # count flagged variants in each window; empty windows count as zero
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    n_df, windows, counts = windowed_statistic(
        pos, values=loc_df, statistic=np.count_nonzero, fill=0, **window_spec
    )

    # convert counts per window into a value per base
    df, n_bases = per_base(n_df, windows, is_accessible=is_accessible,
                           fill=fill)

    return df, windows, n_bases, counts
def hudson_fst(ac1, ac2, fill=np.nan):
    """Calculate the numerator and denominator for Fst estimation using the
    method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Divergence between the two populations minus average
        of diversity within each population.
    den : ndarray, float, shape (n_variants,)
        Divergence between the two populations.

    Examples
    --------
    Calculate numerator and denominator for Fst estimation::

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...                          [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...                          [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...                          [[0, 0], [1, 1], [0, 1], [-1, -1]]])
        >>> subpops = [[0, 1], [2, 3]]
        >>> ac1 = g.count_alleles(subpop=subpops[0])
        >>> ac2 = g.count_alleles(subpop=subpops[1])
        >>> num, den = allel.hudson_fst(ac1, ac2)
        >>> num
        array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333])
        >>> den
        array([1.   , 0.5  , 0.   , 0.625, 0.5  ])

    Estimate Fst for each variant individually::

        >>> fst = num / den
        >>> fst
        array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

    Estimate Fst averaging over variants::

        >>> fst = np.sum(num) / np.sum(den)
        >>> fst
        0.1428571428571429

    """  # flake8: noqa

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # allele numbers are needed by several calls below, so compute them once
    an1 = np.sum(ac1, axis=1)
    an2 = np.sum(ac2, axis=1)

    # average diversity (a.k.a. heterozygosity) within each population
    within1 = mean_pairwise_difference(ac1, an1, fill=fill)
    within2 = mean_pairwise_difference(ac2, an2, fill=fill)
    within = (within1 + within2) / 2

    # divergence (a.k.a. heterozygosity) between the two populations
    between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill)

    # numerator and denominator for Fst calculations
    return between - within, between
def windowed_watterson_theta(pos, ac, size=None, start=None, stop=None,
                             step=None, windows=None, is_accessible=None,
                             fill=np.nan):
    """Calculate the value of Watterson's estimator in windows over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([0.10909091, 0.16363636, 0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # flag segregating variants, then count them within each window
    is_seg = ac.is_segregating()
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    S, windows, counts = windowed_statistic(
        pos, is_seg, statistic=np.count_nonzero, fill=0, **window_spec
    )

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = np.sum(1 / np.arange(1, n))

    # absolute value of Watterson's theta, then theta per base
    theta_hat_w_abs = S / a1
    theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows,
                                    is_accessible=is_accessible, fill=fill)

    return theta_hat_w, windows, n_bases, counts
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Calculate the value of Watterson's estimator over a given region.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------
    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # deal with subregion
    if not (start is None and stop is None):
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac = ac[loc]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # count segregating variants
    S = ac.count_segregating()

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = np.sum(1 / np.arange(1, n))

    # size of the region: every base, or only those flagged accessible
    # (positions are 1-based, hence the shift when slicing)
    n_bases = (
        stop - start + 1
        if is_accessible is None
        else np.count_nonzero(is_accessible[start - 1:stop])
    )

    # absolute value scaled to a per-base value
    return (S / a1) / n_bases
def xpnsl(h1, h2, use_threads=True):
    """Cross-population version of the NSL statistic.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    use_threads : bool, optional
        If True use multiple threads to compute (only when more than one CPU
        is reported).

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPNSL scores.

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    check_dim0_aligned(h1, h2)

    # the four scans to run: forward for each population, then backward
    # (i.e., over the variants in reverse order) for each population
    scan_inputs = (h1, h2, h1[::-1], h2[::-1])

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads: submit all scans, then wait for completion
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))
        pending = [pool.apply_async(nsl_scan, args=(h, ))
                   for h in scan_inputs]
        pool.close()
        pool.join()

        # obtain results
        nsl1_fwd, nsl2_fwd, nsl1_rev, nsl2_rev = [res.get()
                                                  for res in pending]

        # cleanup
        pool.terminate()

    else:
        # compute without threads, one scan after another
        nsl1_fwd, nsl2_fwd, nsl1_rev, nsl2_rev = [nsl_scan(h)
                                                  for h in scan_inputs]

    # put the backward scans into the original variant order and combine
    nsl1 = nsl1_fwd + nsl1_rev[::-1]
    nsl2 = nsl2_fwd + nsl2_rev[::-1]

    # compute unstandardized score
    return np.log(nsl1 / nsl2)
def tabulate_state_blocks(x, states, pos=None):
    """Build a dataframe with one row per contiguous block of a state.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be
        ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2})
    >>> df
       state  support  start_lidx  ...  size_min  size_max  is_marginal
    0      1        4          -1  ...         5        -1         True
    1      2        3           4  ...         4         4        False
    2      1        2           8  ...         2        -1         True
    [3 rows x 9 columns]
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos)
    >>> df
       state  support  start_lidx  ...  stop_rpos  length_min  length_max
    0      1        4          -1  ...         14           9          -1
    1      2        3           4  ...         30          15          19
    2      1        2           8  ...         -1           2          -1
    [3 rows x 15 columns]

    """

    # validate the state array
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # locate the transitions; each block sits between two consecutive ones
    switch_points, transitions, observations = state_transitions(x, states)
    opening = switch_points[:-1]
    closing = switch_points[1:]
    start_lidx, start_ridx = opening[:, 0], opening[:, 1]
    stop_lidx, stop_ridx = closing[:, 0], closing[:, 1]

    # a negative index on either outer edge marks a block with an open side
    is_marginal = (start_lidx < 0) | (stop_ridx < 0)

    # block size bounds in number of items; the upper bound is reported as
    # -1 for marginal blocks
    size_min = stop_lidx - start_ridx + 1
    size_max = stop_ridx - start_lidx - 1
    size_max[is_marginal] = -1

    # assemble columns in output order
    columns = OrderedDict()
    columns['state'] = transitions[1:, 0]
    columns['support'] = observations[1:]
    columns['start_lidx'] = start_lidx
    columns['start_ridx'] = start_ridx
    columns['stop_lidx'] = stop_lidx
    columns['stop_ridx'] = stop_ridx
    columns['size_min'] = size_min
    columns['size_max'] = size_max
    columns['is_marginal'] = is_marginal

    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # translate switch indices into positions, then overwrite the two
        # boundary entries with -1
        switch_positions = np.take(pos, switch_points)
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        opening_pos = switch_positions[:-1]
        closing_pos = switch_positions[1:]

        # block length bounds in position units
        length_min = closing_pos[:, 0] - opening_pos[:, 1] + 1
        length_max = closing_pos[:, 1] - opening_pos[:, 0] - 1
        length_max[is_marginal] = -1

        columns['start_lpos'] = opening_pos[:, 0]
        columns['start_rpos'] = opening_pos[:, 1]
        columns['stop_lpos'] = closing_pos[:, 0]
        columns['stop_rpos'] = closing_pos[:, 1]
        columns['length_min'] = length_min
        columns['length_max'] = length_max

    import pandas
    return pandas.DataFrame.from_dict(columns)
def windowed_statistic(pos, values, statistic, size=None, start=None,
                       stop=None, step=None, windows=None, fill=np.nan):
    """Apply a statistic to the items falling in each window over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates.
    values : array_like, int, shape (n_items,)
        The values to summarise. May also be a tuple of values arrays,
        in which case each array will be sliced and passed through to the
        statistic function as separate arguments.
    statistic : function
        The statistic to compute.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where a window is empty, i.e., contains no items.

    Returns
    -------
    out : ndarray, shape (n_windows,)
        The value of the statistic for each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.

    Notes
    -----
    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------
    Count non-zero (i.e., True) items in non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> values = [True, False, True, False, False]
        >>> nnz, windows, counts = allel.stats.windowed_statistic(
        ...     pos, values, statistic=np.count_nonzero, size=10
        ... )
        >>> nnz
        array([1, 1, 0])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])
        >>> counts
        array([2, 2, 1])

    Compute a sum over items in half-overlapping windows::

        >>> values = [3, 4, 2, 6, 9]
        >>> x, windows, counts = allel.stats.windowed_statistic(
        ...     pos, values, statistic=np.sum, size=10, step=5, fill=0
        ... )
        >>> x
        array([ 7, 12,  8,  0,  9])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])
        >>> counts
        array([2, 3, 2, 0, 1])

    """

    # positions are assumed to be sorted already
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # a tuple means several parallel value arrays, anything else is one array
    has_multiple = isinstance(values, tuple)
    if has_multiple:
        check_equal_length(pos, *values)
    else:
        check_equal_length(pos, values)

    # explicit windows take precedence over size/start/stop/step
    if windows is not None:
        windows = asarray_ndim(windows, 2)
    else:
        windows = position_windows(pos, size, start, stop, step)

    results = []
    n_items = []
    for lo, hi in window_locations(pos, windows):
        n = hi - lo
        n_items.append(n)
        if n == 0:
            # nothing to summarise in this window
            results.append(fill)
        elif has_multiple:
            results.append(statistic(*[v[lo:hi] for v in values]))
        else:
            results.append(statistic(values[lo:hi]))

    return np.asarray(results), windows, np.asarray(n_items)
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Compute, per variant, the mean number of pairwise differences between
    chromosomes sampled from within a single population.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----
    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    The calculation generalises to any number of alleles.

    Examples
    --------
    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.mean_pairwise_difference(ac)
    array([0.        , 0.5       , 0.66666667, 0.5       , 0.        ,
           0.83333333, 0.83333333, 1.        ])

    See Also
    --------
    sequence_diversity, windowed_diversity

    """

    ac = asarray_ndim(ac, 2)

    # number of called alleles per variant, either supplied or derived
    if an is not None:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)
    else:
        an = np.sum(ac, axis=1)

    # all pairs: (an choose 2)
    total_pairs = an * (an - 1) / 2

    # pairs carrying the same allele: sum over alleles of (ac choose 2)
    identical_pairs = np.sum(ac * (ac - 1) / 2, axis=1)

    differing_pairs = total_pairs - identical_pairs

    # variants with no pairs at all receive the fill value
    with ignore_invalid():
        mpd = np.where(total_pairs > 0, differing_pairs / total_pairs, fill)

    return mpd
def roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transition=1e-6,
             min_roh=0, is_accessible=None, contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Multinomial HMM model. There are 3
    observable states at each position in a chromosome/contig: 0 = Hom, 1 = Het,
    2 = inaccessible (i.e., unobserved).

    The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one
    or more probabilities of observing a het in a non-ROH, as this probability may not be
    constant across the genome (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos: array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh: float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh: tuple of floats, optional
        One or more probabilities of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity within
        the population, but also on mutation rate and genotype error rate.
        A single float is also accepted.
    transition: float, optional
        Probability of moving between states.
    min_roh: integer, optional
        Minimum size (bp) to consider as a ROH. Will depend on contig size
        and recombination rate.
    is_accessible: array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether accessible
        or not.
    contig_size: int, optional
        If is_accessible not known/not provided, allows specification of
        total length of contig. Ignored when `is_accessible` is given, in
        which case the size of that array is used.

    Returns
    -------
    df_roh: DataFrame
        Data frame where each row describes a run of homozygosity. Columns are 'start',
        'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive.
    froh: float
        Proportion of genome in a ROH.

    Raises
    ------
    ValueError
        If neither `is_accessible` nor `contig_size` is provided.

    Notes
    -----
    This function requires `hmmlearn <http://hmmlearn.readthedocs.io/en/latest/>`_ to be
    installed.

    This function currently requires around 4GB memory for a contig size of ~50Mbp.

    """

    from hmmlearn import hmm

    # setup inputs
    if isinstance(phet_nonroh, float):
        # allow a bare float in place of a 1-tuple
        phet_nonroh = phet_nonroh,
    gv = GenotypeVector(gv)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(gv, pos)
    # is_accessible is optional, so None must pass through untouched for the
    # `is None` branch below to be reachable
    is_accessible = asarray_ndim(is_accessible, 1, dtype=bool, allow_none=True)

    # heterozygote probabilities; index 0 is the ROH state
    het_px = np.concatenate([(phet_roh, ), phet_nonroh])

    # start probabilities (all equal)
    start_prob = np.repeat(1 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    # probability of inaccessible
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessibile argument is not provided, "
                "you must provide contig_size"
            )
        p_accessible = 1.0
    else:
        p_accessible = is_accessible.mean()
        contig_size = is_accessible.size

    emission_mx = _mhmm_derive_emission_matrix(het_px, p_accessible)

    # initialize HMM
    roh_hmm = hmm.MultinomialHMM(n_components=het_px.size)
    roh_hmm.n_symbols_ = 3
    roh_hmm.startprob_ = start_prob
    roh_hmm.transmat_ = transition_mx
    roh_hmm.emissionprob_ = emission_mx

    # locate heterozygous calls
    is_het = gv.is_het()

    # predict ROH state; the second return value is not needed here
    pred, _ = _mhmm_predict_roh_state(roh_hmm, is_het, pos, is_accessible, contig_size)

    # find ROH windows; state 0 is the ROH state
    df_blocks = tabulate_state_blocks(pred, states=list(range(len(het_px))))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

    # adapt the dataframe for ROH
    for col in 'state', 'support', 'start_lidx', 'stop_ridx', 'size_max':
        del df_roh[col]
    df_roh.rename(columns={'start_ridx': 'start',
                           'stop_lidx': 'stop',
                           'size_min': 'length'},
                  inplace=True)

    # make coordinates 1-based
    df_roh['start'] = df_roh['start'] + 1
    df_roh['stop'] = df_roh['stop'] + 1

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH from the ROH that survived the size filter
    froh = df_roh.length.sum() / contig_size

    return df_roh, froh
def windowed_count(pos, size=None, start=None, stop=None, step=None,
                   windows=None):
    """Count how many items fall in each window over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.

    Returns
    -------
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.

    Notes
    -----
    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------
    Non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> counts, windows = allel.stats.windowed_count(pos, size=10)
        >>> counts
        array([2, 2, 1])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])

    Half-overlapping windows::

        >>> counts, windows = allel.stats.windowed_count(pos, size=10, step=5)
        >>> counts
        array([2, 3, 2, 0, 1])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])

    """

    # positions are assumed to be sorted already
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # explicit windows take precedence over size/start/stop/step
    if windows is not None:
        windows = asarray_ndim(windows, 2)
    else:
        windows = position_windows(pos, size, start, stop, step)

    # each window maps to a (start index, stop index) pair; the difference
    # between the two is the number of items it contains
    index_bounds = window_locations(pos, windows)
    per_window = np.diff(index_bounds, axis=1)
    counts = per_window.reshape(-1)

    return counts, windows
def standardize_by_allele_count(score, aac, bins=None, n_bins=None,
                                diagnostics=True):
    """Standardize `score` within allele frequency bins.

    Parameters
    ----------
    score : array_like, float
        The score to be standardized, e.g., IHS or NSL.
    aac : array_like, int
        An array of alternate allele counts.
    bins : array_like, int, optional
        Allele count bins, overrides `n_bins`.
    n_bins : int, optional
        Number of allele count bins to use. If neither `bins` nor `n_bins`
        is given, defaults to half the maximum alternate allele count.
    diagnostics : bool, optional
        If True, plot some diagnostic information about the standardization.

    Returns
    -------
    score_standardized : ndarray, float
        Standardized scores.
    bins : ndarray, int
        Allele count bins used for standardization.

    Notes
    -----
    The mean and standard deviation for each bin are computed from the
    non-NaN scores only. When applying them, the first bin is open-ended on
    the left and the last bin is open-ended on the right, so every variant is
    assigned to exactly one bin, including when only a single bin is used.

    """

    from scipy.stats import binned_statistic

    # check inputs
    score = asarray_ndim(score, 1)
    aac = asarray_ndim(aac, 1)
    check_dim0_aligned(score, aac)

    # remove nans
    nonan = ~np.isnan(score)
    score_nonan = score[nonan]
    aac_nonan = aac[nonan]

    if bins is None:
        # make our own similar sized bins

        # how many bins to make?
        if n_bins is None:
            # something vaguely reasonable
            n_bins = np.max(aac) // 2

        # make bins
        bins = make_similar_sized_bins(aac_nonan, n_bins)

    else:
        # user-provided bins
        bins = asarray_ndim(bins, 1)

    mean_score, _, _ = binned_statistic(aac_nonan, score_nonan,
                                        statistic=np.mean,
                                        bins=bins)
    std_score, _, _ = binned_statistic(aac_nonan, score_nonan,
                                       statistic=np.std,
                                       bins=bins)

    if diagnostics:
        import matplotlib.pyplot as plt
        x = (bins[:-1] + bins[1:]) / 2
        plt.figure()
        plt.fill_between(x,
                         mean_score - std_score,
                         mean_score + std_score,
                         alpha=.5,
                         label='std')
        plt.plot(x, mean_score, marker='o', label='mean')
        plt.grid(axis='y')
        plt.xlabel('Alternate allele count')
        plt.ylabel('Unstandardized score')
        plt.title('Standardization diagnostics')
        plt.legend()

    # apply standardization
    score_standardized = np.empty_like(score)
    n_intervals = len(bins) - 1
    for i in range(n_intervals):
        # Build the membership mask one edge at a time. The lower edge is
        # dropped for the first bin and the upper edge for the last bin, so
        # a lone bin (which is both first and last) captures every variant
        # rather than leaving part of the output uninitialised.
        loc = np.ones(aac.shape, dtype=bool)
        if i > 0:
            loc &= (aac >= bins[i])
        if i < n_intervals - 1:
            loc &= (aac < bins[i + 1])
        m = mean_score[i]
        s = std_score[i]
        score_standardized[loc] = (score[loc] - m) / s

    return score_standardized, bins
def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None,
                        stop=None, is_accessible=None):
    """Estimate nucleotide divergence between two populations within a
    given region, which is the average proportion of sites (including
    monomorphic sites not present in the data) that differ between randomly
    chosen pairs of chromosomes, one from each population.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    start : int, optional
        The position at which to start (1-based). Defaults to the first position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------
    Dxy : float
        Nucleotide divergence over the region.

    Examples
    --------
    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    if an1 is not None:
        an1 = asarray_ndim(an1, 1)
    if an2 is not None:
        an2 = asarray_ndim(an2, 1)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict everything to the requested region when a bound is given
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac1, ac2 = ac1[region], ac2[region]
        if an1 is not None:
            an1 = an1[region]
        if an2 is not None:
            an2 = an2[region]

    # a missing bound falls back to the outermost remaining variant
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # per-variant mean pairwise difference between the populations; variants
    # with no pairs contribute zero to the total
    per_variant = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2,
                                                   fill=0)
    total_diff = np.sum(per_variant)

    # size of the region in bases; positions are 1-based, hence start - 1
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
def compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible):
    """Work out the spacing between consecutive variants used when
    integrating haplotype homozygosity.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance). If None, physical
        positions are used instead.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
        Only applied when `is_accessible` is not provided.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs. Such gaps are returned as -1.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.

    Returns
    -------
    gaps : ndarray, float, shape (n_variants - 1,)

    """

    # without a genetic map, integrate over physical distance
    if map_pos is not None:
        map_pos = asarray_ndim(map_pos, 1)
        check_dim0_aligned(pos, map_pos)
    else:
        map_pos = pos

    # physical spacing drives the scaling; map spacing is what gets returned
    physical_gaps = np.diff(pos)
    gaps = np.diff(map_pos).astype('f8')

    if is_accessible is not None:

        is_accessible = asarray_ndim(is_accessible, 1)
        assert is_accessible.shape[0] > pos[-1], \
            'accessibility array too short'

        # count accessible bases between each pair of neighbouring variants,
        # N.B., expect pos is 1-based
        accessible_gaps = np.zeros_like(physical_gaps)
        for k in range(len(pos) - 1):
            left = pos[k] - 1
            right = pos[k + 1] - 1
            accessible_gaps[k] = np.count_nonzero(is_accessible[left:right])

        # shrink each gap by its accessible fraction
        gaps = gaps * (accessible_gaps / physical_gaps)

    elif gap_scale is not None and gap_scale > 0:

        # gaps wider than gap_scale are scaled down proportionally, all
        # other gaps keep a factor of one
        too_wide = physical_gaps > gap_scale
        factor = np.ones(gaps.shape, dtype='f8')
        factor[too_wide] = gap_scale / physical_gaps[too_wide]
        gaps = gaps * factor

    if max_gap is not None and max_gap > 0:
        # flag very large gaps
        gaps[physical_gaps > max_gap] = -1

    return gaps
def windowed_divergence(pos, ac1, ac2, size=None, start=None, stop=None,
                        step=None, windows=None, is_accessible=None,
                        fill=np.nan):
    """Estimate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------
    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([0.15 , 0.225, 0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant divergence; variants with no pairs contribute zero
    per_variant = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # total divergence per window, with empty windows summing to zero
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=per_variant, statistic=np.sum, fill=0, **window_spec
    )

    # convert the per-window totals into per-base values
    dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible,
                            fill=fill)

    return dxy, windows, n_bases, counts
def tabulate_state_transitions(x, states, pos=None):
    """Build a dataframe with one row per state transition.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Notes
    -----
    The resulting dataframe includes one row at the start representing the first state
    observation and one row at the end representing the last state observation.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2})
    >>> df
       lstate  rstate  lidx  ridx
    0      -1       1    -1     0
    1       1       2     4     5
    2       2       1     8     9
    3       1      -1    10    -1
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos)
    >>> df
       lstate  rstate  lidx  ridx  lpos  rpos
    0      -1       1    -1     0    -1     2
    1       1       2     4     5    10    14
    2       2       1     8     9    28    30
    3       1      -1    10    -1    31    -1

    """

    # validate the state array
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # locate the transitions; the observation counts are not needed here
    switch_points, transitions, _ = state_transitions(x, states)

    # assemble columns in output order
    columns = OrderedDict()
    columns['lstate'] = transitions[:, 0]
    columns['rstate'] = transitions[:, 1]
    columns['lidx'] = switch_points[:, 0]
    columns['ridx'] = switch_points[:, 1]

    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # translate switch indices into positions, then overwrite the two
        # boundary entries with -1
        switch_positions = np.take(pos, switch_points)
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        columns['lpos'] = switch_positions[:, 0]
        columns['rpos'] = switch_positions[:, 1]

    import pandas
    return pandas.DataFrame.from_dict(columns)
def windowed_watterson_theta(
    pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Compute Watterson's estimator in windows over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Notes
    -----
    The number of chromosomes sampled is assumed to be constant across
    variants and is taken as the largest per-variant allele count total.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.stats.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([ 0.10909091,  0.16363636,  0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # number of segregating variants in each window
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    S, windows, counts = windowed_statistic(
        pos, ac.is_segregating(), statistic=np.count_nonzero, fill=0,
        **window_spec
    )

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n))

    # absolute theta per window, then scaled to a per-base value
    theta_abs = S / harmonic
    theta_hat_w, n_bases = per_base(theta_abs, windows=windows,
                                    is_accessible=is_accessible, fill=fill)

    return theta_hat_w, windows, n_bases, counts
def xpehh(h1, h2, pos, map_pos=None, min_ehh=0.05, include_edges=False,
          gap_scale=20000, max_gap=200000, is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    # forward scans first, then scans over the reversed arrays; the gaps are
    # reversed together with the haplotypes so they stay aligned
    scan_inputs = (
        (h1, gaps),
        (h2, gaps),
        (h1[::-1], gaps[::-1]),
        (h2[::-1], gaps[::-1]),
    )

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads, at most one per scan
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))
        try:
            pending = [pool.apply_async(ihh_scan, args, kwargs)
                       for args in scan_inputs]

            # wait for all four scans to finish
            pool.close()
            pool.join()

            # obtain results; get() re-raises any exception from a worker
            ihh1_fwd, ihh2_fwd, ihh1_rev, ihh2_rev = [
                res.get() for res in pending
            ]
        finally:
            # always release the pool, even if a scan failed or the wait
            # was interrupted
            pool.terminate()

    else:
        # compute without threads
        ihh1_fwd, ihh2_fwd, ihh1_rev, ihh2_rev = [
            ihh_scan(h, g, **kwargs) for h, g in scan_inputs
        ]

    # reverse scans ran on flipped input, so flip the results back to line
    # up with the forward scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None,
                                     fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from two different populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----
    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac1 = h.count_alleles(subpop=[0, 1])
    >>> ac2 = h.count_alleles(subpop=[2, 3])
    >>> allel.stats.mean_pairwise_difference_between(ac1, ac2)
    array([ 0. , 0.5 , 1. , 0.5 , 0. , 1. , 0.75, nan])

    See Also
    --------
    sequence_divergence, windowed_divergence

    """

    # Works for any number of alleles: a pair of haplotypes (one from each
    # population) is "the same" when both carry the same allele.

    # normalise the allele counts and align their allele dimension
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # haplotypes sampled per variant in the first population
    if an1 is not None:
        an1 = asarray_ndim(an1, 1)
        check_dim0_aligned(ac1, an1)
    else:
        an1 = np.sum(ac1, axis=1)

    # haplotypes sampled per variant in the second population
    if an2 is not None:
        an2 = asarray_ndim(an2, 1)
        check_dim0_aligned(ac2, an2)
    else:
        an2 = np.sum(ac2, axis=1)

    # every haplotype in population 1 is compared with every haplotype in
    # population 2
    total_pairs = an1 * an2

    # pairs carrying the same allele, summed over alleles
    identical_pairs = np.sum(ac1 * ac2, axis=1)

    differing_pairs = total_pairs - identical_pairs

    # variants without any pairs receive the fill value
    with ignore_invalid():
        mpd = np.where(total_pairs > 0, differing_pairs / total_pairs, fill)

    return mpd
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate nucleotide diversity within a given region, which is the
    average proportion of sites (including monomorphic sites not present in
    the data) that differ between randomly chosen pairs of chromosomes.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig.

    Returns
    -------
    pi : float
        Nucleotide diversity (a single value for the whole region).

    Notes
    -----
    If start and/or stop are not provided, uses the difference between the
    last and the first position as a proxy for the total number of sites,
    which can overestimate the sequence diversity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict to the requested subregion, if any bound was given
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]

    # missing bounds fall back to the outermost remaining positions
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # total of the per-variant mean pairwise differences
    total_diff = np.sum(mean_pairwise_difference(ac, fill=0))

    # denominator: accessible bases if known, else the region length
    # (positions are 1-based, hence the start - 1 slice offset)
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
def plot_sfs(s, yscale='log', bins=None, n=None, clip_endpoints=True,
             label=None, plot_kwargs=None, ax=None):
    """Plot a site frequency spectrum.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes,)
        Site frequency spectrum.
    yscale : string, optional
        Y axis scale.
    bins : int or array_like, int, optional
        Allele count bins. If given, values are summed within bins and
        plotted at the bin midpoints.
    n : int, optional
        Number of chromosomes sampled. If provided, X axis will be plotted as
        allele frequency, otherwise as allele count.
    clip_endpoints : bool, optional
        If True, do not plot first and last values from frequency spectrum.
    label : string, optional
        Label for data series in plot.
    plot_kwargs : dict-like
        Additional keyword arguments, passed through to ax.plot().
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure will be created.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is available as an attribute.
    import scipy.stats

    # check inputs
    s = asarray_ndim(s, 1)

    # setup axes
    if ax is None:
        _, ax = plt.subplots()

    # select the allele counts and the matching spectrum values to plot
    if clip_endpoints:
        allele_counts = np.arange(1, s.shape[0] - 1)
        values = s[1:-1]
    else:
        allele_counts = np.arange(s.shape[0])
        values = s

    if bins is None:
        x = allele_counts
        y = values
    else:
        # sum the spectrum within each allele count bin
        y, b, _ = scipy.stats.binned_statistic(
            allele_counts, values=values, bins=bins, statistic='sum')
        # use bin midpoints for plotting
        x = (b[:-1] + b[1:]) / 2

    if n:
        # convert allele counts to allele frequencies
        x = x / n
        ax.set_xlabel('derived allele frequency')
    else:
        ax.set_xlabel('derived allele count')

    # do plotting
    if plot_kwargs is None:
        plot_kwargs = dict()
    ax.plot(x, y, label=label, **plot_kwargs)

    # tidy
    ax.set_yscale(yscale)
    ax.set_ylabel('site frequency')
    ax.autoscale(axis='x', tight=True)

    return ax
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from within a single population.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----
    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.stats.mean_pairwise_difference(ac)
    array([ 0. , 0.5 , 0.66666667, 0.5 , 0. , 0.83333333, 0.83333333, 1. ])

    See Also
    --------
    sequence_diversity, windowed_diversity

    """

    # Works for any number of alleles: a pair of haplotypes is "the same"
    # when both carry the same allele.

    ac = asarray_ndim(ac, 2)

    # haplotypes sampled per variant
    if an is not None:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)
    else:
        an = np.sum(ac, axis=1)

    # (an choose 2): all unordered pairs of haplotypes
    total_pairs = an * (an - 1) / 2

    # sum over alleles of (ac choose 2): pairs carrying the same allele
    identical_pairs = np.sum(ac * (ac - 1) / 2, axis=1)

    differing_pairs = total_pairs - identical_pairs

    # variants without any pairs receive the fill value
    with ignore_invalid():
        mpd = np.where(total_pairs > 0, differing_pairs / total_pairs, fill)

    return mpd
def standardize(score):
    """Centre scores on their mean and scale to unit variance.

    The mean and standard deviation are computed with `np.nanmean` and
    `np.nanstd`, so NaN entries do not contribute to either statistic.

    Parameters
    ----------
    score : array_like, shape (n_variants,)
        Scores to standardize.

    Returns
    -------
    ndarray
        Standardized scores, same length as the input.

    """
    score = asarray_ndim(score, 1)
    centred = score - np.nanmean(score)
    return centred / np.nanstd(score)
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate nucleotide diversity within a given region.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). If omitted, the first
        position in the (sub)region is used.
    stop : int, optional
        The position at which to stop (1-based). If omitted, the last
        position in the (sub)region is used.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig.

    Returns
    -------
    pi : float
        Nucleotide diversity (a single value for the whole region).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.stats.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # narrow positions and counts to the requested subregion
    has_bounds = (start is not None) or (stop is not None)
    if has_bounds:
        selection = pos.locate_range(start, stop)
        pos, ac = pos[selection], ac[selection]

    # any bound still missing defaults to the outermost position
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # per-variant mean pairwise differences, summed over the region
    per_variant = mean_pairwise_difference(ac, fill=0)
    summed = np.sum(per_variant)

    # number of bases to normalise by (positions are 1-based)
    if is_accessible is None:
        n_bases = stop - start + 1
    else:
        accessible_region = is_accessible[start - 1:stop]
        n_bases = np.count_nonzero(accessible_region)

    pi = summed / n_bases
    return pi
def fig_voight_painting(h, index=None, palette='colorblind',
                        height_factor=0.01, fig=None):
    """Make a figure of shared haplotype prefixes for both left and right
    flanks, centred on some variant of choice.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    index : int, optional
        Index of the variant within the haplotype array to centre on. If not
        provided, the middle variant will be used.
    palette : string, optional
        A Seaborn palette name.
    height_factor : float, optional
        If no figure is provided, the painting height is the number of
        haplotypes multiplied by this number.
    fig : figure
        The figure on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    fig : figure

    Notes
    -----
    N.B., the ordering of haplotypes on the left and right flanks will be
    different. This means that haplotypes on the right flank **will not**
    correspond to haplotypes on the left flank at the same vertical position.

    """

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec
    import seaborn as sns

    # check inputs
    h = asarray_ndim(h, 2)
    if index is None:
        # default to the middle variant
        index = h.shape[0] // 2

    # left flank runs backwards from the focal variant, right flank forwards;
    # both include the focal variant itself
    flank_left = h[:index + 1][::-1]
    flank_right = h[index:]

    # paint both flanks
    painting_left, _ = voight_painting(flank_left)
    painting_right, _ = voight_painting(flank_right)

    # EHH decay for both flanks
    ehh_left = ehh_decay(flank_left, truncate=False)
    ehh_right = ehh_decay(flank_right, truncate=False)

    # figure geometry: fixed height for the EHH row, variable for paintings
    default_width, default_height = plt.rcParams['figure.figsize'][:2]
    ehh_height = default_height // 3
    painting_height = height_factor * h.shape[1]
    if fig is None:
        fig = plt.figure(figsize=(default_width,
                                  ehh_height + painting_height))

    # 2x2 grid: paintings on top, EHH below; columns sized by flank length
    gs = GridSpec(2, 2,
                  width_ratios=[flank_left.shape[0], flank_right.shape[0]],
                  height_ratios=[painting_height, ehh_height])

    # left painting
    ax_paint_left = fig.add_subplot(gs[0, 0])
    sns.despine(ax=ax_paint_left, left=True, bottom=True)
    plot_voight_painting(painting_left, palette=palette, flank='left',
                         ax=ax_paint_left)

    # right painting
    ax_paint_right = fig.add_subplot(gs[0, 1])
    sns.despine(ax=ax_paint_right, left=True, bottom=True)
    plot_voight_painting(painting_right, palette=palette, flank='right',
                         ax=ax_paint_right)

    # left EHH decay, x axis inverted so it reads away from the centre
    ax_ehh_left = fig.add_subplot(gs[1, 0])
    sns.despine(ax=ax_ehh_left, offset=3)
    ax_ehh_left.fill_between(np.arange(ehh_left.shape[0]), 0, ehh_left)
    ax_ehh_left.set_ylim(0, 1)
    ax_ehh_left.set_yticks([0, 1])
    ax_ehh_left.set_ylabel('EHH')
    ax_ehh_left.invert_xaxis()

    # right EHH decay, y ticks on the right-hand side
    ax_ehh_right = fig.add_subplot(gs[1, 1])
    sns.despine(ax=ax_ehh_right, left=True, right=False, offset=3)
    ax_ehh_right.yaxis.tick_right()
    ax_ehh_right.set_ylim(0, 1)
    ax_ehh_right.set_yticks([0, 1])
    ax_ehh_right.fill_between(np.arange(ehh_right.shape[0]), 0, ehh_right)

    # tidy up
    fig.tight_layout()

    return fig
def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None,
                        stop=None, is_accessible=None):
    """Estimate nucleotide divergence between two populations within a
    given region.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population, passed through to
        `mean_pairwise_difference_between`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population, passed through to
        `mean_pairwise_difference_between`.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig.

    Returns
    -------
    Dxy : float
        Nucleotide divergence (a single value for the whole region).

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    if an1 is not None:
        an1 = asarray_ndim(an1, 1)
    if an2 is not None:
        an2 = asarray_ndim(an2, 1)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # narrow every per-variant input to the requested subregion
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac1 = ac1[region]
        ac2 = ac2[region]
        if an1 is not None:
            an1 = an1[region]
        if an2 is not None:
            an2 = an2[region]

    # any bound still missing defaults to the outermost position
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # per-variant mean pairwise differences between the populations, summed
    per_variant = mean_pairwise_difference_between(ac1, ac2, an1=an1,
                                                   an2=an2, fill=0)
    total_diff = np.sum(per_variant)

    # number of bases to normalise by; N.B., pos is expected to be 1-based
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
def ihs(h, pos, map_pos=None, min_ehh=0.05, min_maf=0.05,
        include_edges=False, gap_scale=20000, max_gap=200000,
        is_accessible=None, use_threads=True):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozogysity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----
    `min_ehh`, `min_maf` and `include_edges` are forwarded unchanged to the
    forward and backward scans.

    This function computes IHS comparing the reference and alternate alleles.
    These can be polarised by switching the sign for any variant where the
    reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # check inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # keyword arguments shared by both scans
    kwargs = dict(min_ehh=min_ehh, min_maf=min_maf,
                  include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # run with threads, one per scan direction
        pool = ThreadPool(2)
        try:
            # scan forward
            result_fwd = pool.apply_async(ihh01_scan, (h, gaps), kwargs)

            # scan backward
            result_rev = pool.apply_async(ihh01_scan,
                                          (h[::-1], gaps[::-1]), kwargs)

            # wait for both to finish
            pool.close()
            pool.join()

            # obtain results; get() re-raises any exception from a worker
            ihh0_fwd, ihh1_fwd = result_fwd.get()
            ihh0_rev, ihh1_rev = result_rev.get()
        finally:
            # always release the pool, even when a scan failed
            pool.terminate()

    else:
        # run without threads

        # scan forward
        ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **kwargs)

        # scan backward
        ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **kwargs)

    # the reverse scan ran on reversed input, so flip it back into variant
    # order
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # compute unstandardized score
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    score = np.log(ihh1 / ihh0)

    return score
def windowed_divergence(
    pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None,
    is_accessible=None, fill=np.nan
):
    """Estimate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop)
        positions, using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([ 0.15 , 0.225, 0. ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant mean pairwise differences between the two populations
    per_variant = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # window specification, with empty windows summing to zero
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows, fill=0)

    # sum the per-variant values within each window
    window_sums, windows, counts = windowed_statistic(
        pos, values=per_variant, statistic=np.sum, **window_spec
    )

    # convert window sums to per-base values
    dxy, n_bases = per_base(window_sums, windows,
                            is_accessible=is_accessible, fill=fill)

    return dxy, windows, n_bases, counts
def nsl(h, use_threads=True):
    """Compute the unstandardized number of segregating sites by length (nSl)
    for each variant, comparing the reference and alternate alleles,
    after Ferrer-Admetlla et al. (2014).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)

    Notes
    -----
    This function will calculate nSl for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes nSl by comparing the reference and alternate
    alleles. These can be polarised by switching the sign for any variant where
    the reference allele is derived.

    This function does nothing about nSl calculations where haplotype
    homozygosity extends up to the first or last variant. There may be edge
    effects.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # check inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)

    # NOTE: non-segregating sites are not checked for or removed here.

    if use_threads and multiprocessing.cpu_count() > 1:
        # run with threads, one per scan direction
        pool = ThreadPool(2)
        try:
            # scan forward
            result_fwd = pool.apply_async(nsl01_scan, args=(h,))

            # scan backward
            result_rev = pool.apply_async(nsl01_scan, args=(h[::-1],))

            # wait for both to finish
            pool.close()
            pool.join()

            # obtain results; get() re-raises any exception from a worker
            nsl0_fwd, nsl1_fwd = result_fwd.get()
            nsl0_rev, nsl1_rev = result_rev.get()
        finally:
            # release the pool, matching the cleanup done by ihs and xpehh
            pool.terminate()

    else:
        # run without threads

        # scan forward
        nsl0_fwd, nsl1_fwd = nsl01_scan(h)

        # scan backward
        nsl0_rev, nsl1_rev = nsl01_scan(h[::-1])

    # the reverse scan ran on reversed input, so flip it back into variant
    # order
    nsl0_rev = nsl0_rev[::-1]
    nsl1_rev = nsl1_rev[::-1]

    # compute unstandardized score
    nsl0 = nsl0_fwd + nsl0_rev
    nsl1 = nsl1_fwd + nsl1_rev
    score = np.log(nsl1 / nsl0)

    return score
def windowed_df(
    pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None,
    is_accessible=None, fill=np.nan
):
    """Calculate the density of fixed differences between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop)
        positions, using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------
    allel.model.locate_fixed_differences

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant flags marking fixed differences between the populations
    is_fixed_diff = locate_fixed_differences(ac1, ac2)

    # window specification, with empty windows counting as zero
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows, fill=0)

    # count the flagged variants within each window
    n_fixed, windows, counts = windowed_statistic(
        pos, values=is_fixed_diff, statistic=np.count_nonzero, **window_spec
    )

    # convert window counts to per-base densities
    df, n_bases = per_base(n_fixed, windows, is_accessible=is_accessible,
                           fill=fill)

    return df, windows, n_bases, counts
def recarray_from_hdf5_group(*args, **kwargs):
    """Load a recarray from columns stored as separate datasets with an
    HDF5 group.

    Either provide an h5py group as a single positional argument,
    or provide two positional arguments giving the HDF5 file path and the
    group node path within the file.

    The following optional parameters may be given.

    Parameters
    ----------
    names : sequence of str, optional
        Names of the datasets to load. Defaults to every dataset found
        directly within the group.
    start : int, optional
        Index to start loading from.
    stop : int, optional
        Index to finish loading at.
    condition : array_like, bool, optional
        A 1-dimensional boolean array of the same length as the columns of the
        table to load, indicating a selection of rows to load.

    Returns
    -------
    ra : ndarray
        Structured array with one field per loaded dataset. Its length is
        the number of rows selected by `start`, `stop` and `condition`.

    Raises
    ------
    ValueError
        If the positional arguments are malformed, the node is not a group,
        a requested name is missing or is not a dataset, there are no
        datasets to load, the datasets differ in length, or `condition`
        does not match the dataset length.

    """

    import h5py

    # only set when this function opened the file and so must close it
    h5f = None

    if len(args) == 1:
        group = args[0]

    elif len(args) == 2:
        file_path, node_path = args
        h5f = h5py.File(file_path, mode='r')
        try:
            group = h5f[node_path]
        except Exception:
            # do not leak the file handle if the node lookup fails
            h5f.close()
            raise

    else:
        raise ValueError('bad arguments; expected group or (file_path, '
                         'node_path), found %s' % repr(args))

    try:

        if not isinstance(group, h5py.Group):
            raise ValueError('expected group, found %r' % group)

        # determine dataset names to load
        member_names = set(group.keys())
        available_dataset_names = [
            n for n in group.keys() if isinstance(group[n], h5py.Dataset)
        ]
        names = kwargs.pop('names', available_dataset_names)
        names = [str(n) for n in names]  # needed for PY2
        for n in names:
            if n not in member_names:
                raise ValueError('name not found: %s' % n)
            if not isinstance(group[n], h5py.Dataset):
                raise ValueError('name does not refer to a dataset: %s, %r'
                                 % (n, group[n]))

        # check there is something to load and that datasets are aligned
        datasets = [group[n] for n in names]
        if not datasets:
            raise ValueError('no datasets to load')
        length = datasets[0].shape[0]
        for d in datasets[1:]:
            if d.shape[0] != length:
                raise ValueError('datasets must be of equal length')

        # determine start and stop parameters for load
        start = kwargs.pop('start', 0)
        stop = kwargs.pop('stop', length)

        # check condition
        condition = kwargs.pop('condition', None)  # type: np.ndarray
        condition = asarray_ndim(condition, 1, allow_none=True)
        if condition is not None and condition.size != length:
            raise ValueError('length of condition does not match length '
                             'of datasets')

        # setup output data
        dtype = [(n, d.dtype, d.shape[1:]) for n, d in zip(names, datasets)]
        ra = None

        for n, d in zip(names, datasets):
            a = d[start:stop]
            if condition is not None:
                a = np.compress(condition[start:stop], a, axis=0)
            if ra is None:
                # Size the output by the rows actually selected. Allocating
                # the full dataset length makes the assignment below fail
                # whenever start/stop/condition select a subset.
                ra = np.empty(a.shape[0], dtype=dtype)
            ra[n] = a

        return ra

    finally:
        if h5f is not None:
            h5f.close()
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Calculate the value of Watterson's estimator over a given region.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). If omitted, the first
        position in the (sub)region is used.
    stop : int, optional
        The position at which to stop (1-based). If omitted, the last
        position in the (sub)region is used.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig.

    Returns
    -------
    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.stats.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        # wrap plain arrays so that count_segregating() is available
        ac = AlleleCountsArray(ac, copy=False)

    # narrow positions and counts to the requested subregion
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]

    # any bound still missing defaults to the outermost position
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # number of segregating variants in the region
    n_segregating = ac.count_segregating()

    # assume the number of chromosomes sampled is constant for all variants,
    # taking the largest per-variant allele total as that number
    n_chromosomes = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n_chromosomes))

    # estimator for the whole region
    theta_hat_w_abs = n_segregating / harmonic

    # number of bases to normalise by (positions are 1-based)
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return theta_hat_w_abs / n_bases