def thetah(pos, ac, start=None, stop=None, is_accessible=None): # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) ac = asarray_ndim(ac, 2) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # calculate values of the stat h = 0 for i in range(len(ac)): p1 = ac[i, 1] n = p1+ac[i, 0] if n > 1: h += (p1*p1)/(n*(n-1.0)) h *= 2 # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start-1:stop]) h = h / n_bases return h
def test_constructor(self): # missing data arg with self.assertRaises(TypeError): # noinspection PyArgumentList SortedIndex() # data has wrong dtype data = 'foo bar' with self.assertRaises(TypeError): SortedIndex(data) # data has wrong dimensions data = [[1, 2], [3, 4]] with self.assertRaises(TypeError): SortedIndex(data) # values are not sorted data = [2, 1, 3, 5] with self.assertRaises(ValueError): SortedIndex(data) # values are not sorted data = [4., 5., 3.7] with self.assertRaises(ValueError): SortedIndex(data) # valid data (unique) data = [1, 4, 5, 7, 12] idx = SortedIndex(data) aeq(data, idx) eq(np.int, idx.dtype) eq(1, idx.ndim) eq(5, len(idx)) assert idx.is_unique # valid data (non-unique) data = [1, 4, 5, 5, 7, 12] idx = SortedIndex(data) aeq(data, idx) eq(np.int, idx.dtype) eq(1, idx.ndim) eq(6, len(idx)) assert not idx.is_unique # valid data (typed) data = [1, 4, 5, 5, 7, 12] idx = SortedIndex(data, dtype='u4') aeq(data, idx) eq(np.uint32, idx.dtype) # valid data (non-numeric) data = ['1', '12', '4', '5', '5', '7'] idx = SortedIndex(data) aeq(data, idx)
def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None): """Convenience function to calculate a pairwise distance matrix using nucleotide divergence (a.k.a. Dxy) as the distance metric. Parameters ---------- pos : array_like, int, shape (n_variants,) Variant positions. gac : array_like, int, shape (n_variants, n_samples, n_alleles) Per-genotype allele counts. start : int, optional Start position of region to use. stop : int, optional Stop position of region to use. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- dist : ndarray Distance matrix in condensed form. See Also -------- allel.model.ndarray.GenotypeArray.to_allele_counts """ if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) gac = asarray_ndim(gac, 3) # compute this once here, to avoid repeated evaluation within the loop gan = np.sum(gac, axis=2) m = gac.shape[1] dist = list() for i, j in itertools.combinations(range(m), 2): ac1 = gac[:, i, ...] an1 = gan[:, i] ac2 = gac[:, j, ...] an2 = gan[:, j] d = sequence_divergence(pos, ac1, ac2, an1=an1, an2=an2, start=start, stop=stop, is_accessible=is_accessible) dist.append(d) return np.array(dist)
def test_slice(self): data = [1, 4, 5, 5, 7, 12] idx = SortedIndex(data, dtype='u4') # row slice s = idx[1:] assert_is_instance(s, SortedIndex) # index s = idx[0] assert_is_instance(s, np.uint32) assert_not_is_instance(s, SortedIndex) eq(data[0], s)
def maxFDA(pos, ac, start=None, stop=None, is_accessible=None): # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) ac = asarray_ndim(ac, 2) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # calculate values of the stat dafs = [] for i in range(len(ac)): p1 = ac[i, 1] n = p1+ac[i, 0] dafs.append(p1/float(n)) return max(dafs)
def setup_instance(self, data): return SortedIndex(data)
def windowed_diversity(pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Estimate nucleotide diversity in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- pi : ndarray, float, shape (n_windows,) Nucleotide diversity in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> pi, windows, n_bases, counts = allel.windowed_diversity( ... pos, ac, size=10, start=1, stop=31 ... ) >>> pi array([0.11666667, 0.21666667, 0.09090909]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # masking inaccessible sites from pos and ac pos, ac = mask_inaccessible(is_accessible, pos, ac) # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # sum differences in windows mpd_sum, windows, counts = windowed_statistic( pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop, step=step, windows=windows, fill=0 ) # calculate value per base pi, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible, fill=fill) return pi, windows, n_bases, counts
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None): """Calculate the value of Watterson's estimator over a given region. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- theta_hat_w : float Watterson's estimator (theta hat per base). Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w = allel.stats.watterson_theta(pos, ac, start=1, stop=31) >>> theta_hat_w 0.10557184750733138 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # count segregating variants S = ac.count_segregating() # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate absolute value theta_hat_w_abs = S / a1 # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1 : stop]) theta_hat_w = theta_hat_w_abs / n_bases return theta_hat_w
def windowed_divergence(pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Estimate nucleotide divergence between two populations in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- Dxy : ndarray, float, shape (n_windows,) Nucleotide divergence in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- Simplest case, two haplotypes in each population:: >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1], ... [-1, -1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> dxy, windows, n_bases, counts = windowed_divergence( ... pos, ac1, ac2, size=10, start=1, stop=31 ... ) >>> dxy array([0.15 , 0.225, 0. ]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # check inputs pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # calculate mean pairwise divergence mpd = mean_pairwise_difference_between(ac1, ac2, fill=0) # sum in windows mpd_sum, windows, counts = windowed_statistic(pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # calculate value per base dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible, fill=fill) return dxy, windows, n_bases, counts
def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None, stop=None, is_accessible=None): """Estimate nucleotide divergence between two populations within a given region, which is the average proportion of sites (including monomorphic sites not present in the data) that differ between randomly chosen pairs of chromosomes, one from each population. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. an1 : array_like, int, shape (n_variants,), optional Allele numbers for the first population. If not provided, will be calculated from `ac1`. an2 : array_like, int, shape (n_variants,), optional Allele numbers for the second population. If not provided, will be calculated from `ac2`. start : int, optional The position at which to start (1-based). Defaults to the first position. stop : int, optional The position at which to stop (1-based). Defaults to the last position. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- Dxy : ndarray, float, shape (n_windows,) Nucleotide divergence. Examples -------- Simplest case, two haplotypes in each population:: >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1], ... [-1, -1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31) >>> dxy 0.12096774193548387 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) if an1 is not None: an1 = asarray_ndim(an1, 1) if an2 is not None: an2 = asarray_ndim(an2, 1) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # handle start/stop if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac1 = ac1[loc] ac2 = ac2[loc] if an1 is not None: an1 = an1[loc] if an2 is not None: an2 = an2[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # calculate mean pairwise difference between the two populations mpd = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2, fill=0) # sum differences over variants mpd_sum = np.sum(mpd) # calculate value per base, N.B., expect pos is 1-based if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1:stop]) dxy = mpd_sum / n_bases return dxy
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None): """Estimate nucleotide diversity within a given region, which is the average proportion of sites (including monomorphic sites not present in the data) that differ between randomly chosen pairs of chromosomes. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. start : int, optional The position at which to start (1-based). Defaults to the first position. stop : int, optional The position at which to stop (1-based). Defaults to the last position. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- pi : ndarray, float, shape (n_windows,) Nucleotide diversity. Notes ----- If start and/or stop are not provided, uses the difference between the last and the first position as a proxy for the total number of sites, which can overestimate the sequence diversity. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> pi = allel.sequence_diversity(pos, ac, start=1, stop=31) >>> pi 0.13978494623655915 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) ac = asarray_ndim(ac, 2) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # sum differences over variants mpd_sum = np.sum(mpd) # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1:stop]) pi = mpd_sum / n_bases return pi
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, min_sites=3): """Calculate the value of Tajima's D in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. min_sites : int, optional Minimum number of segregating sites for which to calculate a value. If there are fewer, np.nan is returned. Defaults to 3. Returns ------- D : ndarray, float, shape (n_windows,) Tajima's D. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27] >>> D, windows, counts = allel.windowed_tajima_d(pos, ac, size=20, step=10, start=1, stop=31) >>> D array([1.36521524, 4.22566622]) >>> windows array([[ 1, 20], [11, 31]]) >>> counts array([6, 6]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # calculate constants a1 = np.sum(1 / np.arange(1, n)) a2 = np.sum(1 / (np.arange(1, n)**2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2)) e1 = c1 / a1 e2 = c2 / (a1**2 + a2) # locate segregating variants is_seg = ac.is_segregating() # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # define statistic to compute for each window # noinspection PyPep8Naming def statistic(w_is_seg, w_mpd): S = np.count_nonzero(w_is_seg) if S < min_sites: return np.nan pi = np.sum(w_mpd) d = pi - (S / a1) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) wD = d / d_stdev return wD D, windows, counts = windowed_statistic(pos, values=(is_seg, mpd), statistic=statistic, size=size, start=start, stop=stop, step=step, windows=windows, fill=np.nan) return D, windows, counts
def windowed_statistic(pos, values, statistic, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan): """Calculate a statistic from items in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) The item positions in ascending order, using 1-based coordinates.. values : array_like, int, shape (n_items,) The values to summarise. May also be a tuple of values arrays, in which case each array will be sliced and passed through to the statistic function as separate arguments. statistic : function The statistic to compute. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where a window is empty, i.e., contains no items. Returns ------- out : ndarray, shape (n_windows,) The value of the statistic for each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) The number of items in each window. Notes ----- The window stop positions are included within a window. The final window will be truncated to the specified stop position, and so may be smaller than the other windows. Examples -------- Count non-zero (i.e., True) items in non-overlapping windows:: >>> import allel >>> pos = [1, 7, 12, 15, 28] >>> values = [True, False, True, False, False] >>> nnz, windows, counts = allel.windowed_statistic( ... pos, values, statistic=np.count_nonzero, size=10 ... ) >>> nnz array([1, 1, 0]) >>> windows array([[ 1, 10], [11, 20], [21, 28]]) >>> counts array([2, 2, 1]) Compute a sum over items in half-overlapping windows:: >>> values = [3, 4, 2, 6, 9] >>> x, windows, counts = allel.windowed_statistic( ... pos, values, statistic=np.sum, size=10, step=5, fill=0 ... ) >>> x array([ 7, 12, 8, 0, 9]) >>> windows array([[ 1, 10], [ 6, 15], [11, 20], [16, 25], [21, 28]]) >>> counts array([2, 3, 2, 0, 1]) """ # assume sorted positions if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) # check lengths are equal if isinstance(values, tuple): # assume multiple values arrays check_equal_length(pos, *values) else: # assume a single values array check_equal_length(pos, values) # setup windows if windows is None: windows = position_windows(pos, size, start, stop, step) else: windows = asarray_ndim(windows, 2) # find window locations locs = window_locations(pos, windows) # setup outputs out = [] counts = [] # iterate over windows for start_idx, stop_idx in locs: # calculate number of values in window n = stop_idx - start_idx if n == 0: # window is empty s = fill else: if isinstance(values, tuple): # assume multiple values arrays wv = [v[start_idx:stop_idx] for v in values] s = statistic(*wv) else: # assume a single values array wv = values[start_idx:stop_idx] s = statistic(wv) # store outputs out.append(s) counts.append(n) # convert to arrays for output return np.asarray(out), windows, np.asarray(counts)
def windowed_count(pos, size=None, start=None, stop=None, step=None, windows=None): """Count the number of items in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) The item positions in ascending order, using 1-based coordinates.. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. Returns ------- counts : ndarray, int, shape (n_windows,) The number of items in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. Notes ----- The window stop positions are included within a window. The final window will be truncated to the specified stop position, and so may be smaller than the other windows. Examples -------- Non-overlapping windows:: >>> import allel >>> pos = [1, 7, 12, 15, 28] >>> counts, windows = allel.windowed_count(pos, size=10) >>> counts array([2, 2, 1]) >>> windows array([[ 1, 10], [11, 20], [21, 28]]) Half-overlapping windows:: >>> counts, windows = allel.windowed_count(pos, size=10, step=5) >>> counts array([2, 3, 2, 0, 1]) >>> windows array([[ 1, 10], [ 6, 15], [11, 20], [16, 25], [21, 28]]) """ # assume sorted positions if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) # setup windows if windows is None: windows = position_windows(pos, size, start, stop, step) else: windows = asarray_ndim(windows, 2) # find window locations locs = window_locations(pos, windows) # count number of items in each window counts = np.diff(locs, axis=1).reshape(-1) return counts, windows
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None): """Estimate nucleotide diversity within a given region. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- pi : ndarray, float, shape (n_windows,) Nucleotide diversity. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> pi = allel.stats.sequence_diversity(pos, ac, start=1, stop=31) >>> pi 0.13978494623655915 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) ac = asarray_ndim(ac, 2) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # sum differences over variants mpd_sum = np.sum(mpd) # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1 : stop]) pi = mpd_sum / n_bases return pi
def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None, stop=None, is_accessible=None): """Estimate nucleotide divergence between two populations within a given region. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- Dxy : ndarray, float, shape (n_windows,) Nucleotide divergence. Examples -------- Simplest case, two haplotypes in each population:: >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1], ... [-1, -1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31) >>> dxy 0.12096774193548387 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) ac1 = asarray_ndim(ac1, 2) ac2 = asarray_ndim(ac2, 2) if an1 is not None: an1 = asarray_ndim(an1, 1) if an2 is not None: an2 = asarray_ndim(an2, 1) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # handle start/stop if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac1 = ac1[loc] ac2 = ac2[loc] if an1 is not None: an1 = an1[loc] if an2 is not None: an2 = an2[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # calculate mean pairwise difference between the two populations mpd = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2, fill=0) # sum differences over variants mpd_sum = np.sum(mpd) # calculate value per base, N.B., expect pos is 1-based if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1 : stop]) dxy = mpd_sum / n_bases return dxy
def windowed_df(pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Calculate the density of fixed differences between two populations in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- df : ndarray, float, shape (n_windows,) Per-base density of fixed differences in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. See Also -------- allel.model.locate_fixed_differences """ # check inputs pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # locate fixed differences loc_df = locate_fixed_differences(ac1, ac2) # count number of fixed differences in windows n_df, windows, counts = windowed_statistic(pos, values=loc_df, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # calculate value per base df, n_bases = per_base(n_df, windows, is_accessible=is_accessible, fill=fill) return df, windows, n_bases, counts
def tajima_d(ac, pos=None, start=None, stop=None): """Calculate the value of Tajima's D over a given region. Parameters ---------- ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. pos : array_like, int, shape (n_items,), optional Variant positions, using 1-based coordinates, in ascending order. start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). Returns ------- D : float Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> allel.stats.tajima_d(ac) 3.1445848780213814 >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> allel.stats.tajima_d(ac, pos=pos, start=7, stop=25) 3.8779735196179366 """ # check inputs if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if pos is not None and (start is not None or stop is not None): if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) loc = pos.locate_range(start, stop) ac = ac[loc] # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # count segregating variants S = ac.count_segregating() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate Watterson's theta (absolute value) theta_hat_w_abs = S / a1 # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # calculate theta_hat pi (sum differences over variants) theta_hat_pi_abs = np.sum(mpd) # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference d = theta_hat_pi_abs - theta_hat_w_abs # calculate the denominator (standard deviation) a2 = np.sum(1 / (np.arange(1, n) ** 2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) e1 = c1 / a1 e2 = c2 / (a1 ** 2 + a2) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) # finally calculate Tajima's D D = d / d_stdev return D
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None): """Calculate the value of Watterson's estimator over a given region. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. start : int, optional The position at which to start (1-based). Defaults to the first position. stop : int, optional The position at which to stop (1-based). Defaults to the last position. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. Returns ------- theta_hat_w : float Watterson's estimator (theta hat per base). Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w = allel.watterson_theta(pos, ac, start=1, stop=31) >>> theta_hat_w 0.10557184750733138 """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if start is not None or stop is not None: loc = pos.locate_range(start, stop) pos = pos[loc] ac = ac[loc] if start is None: start = pos[0] if stop is None: stop = pos[-1] # count segregating variants S = ac.count_segregating() # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate absolute value theta_hat_w_abs = S / a1 # calculate value per base if is_accessible is None: n_bases = stop - start + 1 else: n_bases = np.count_nonzero(is_accessible[start - 1:stop]) theta_hat_w = theta_hat_w_abs / n_bases return theta_hat_w
def plot_variant_locator(pos, step=None, ax=None, start=None, stop=None, flip=False, line_kwargs=None): """ Plot lines indicating the physical genome location of variants from a single chromosome/contig. By default the top x axis is in variant index space, and the bottom x axis is in genome position space. Parameters ---------- pos : array_like A sorted 1-dimensional array of genomic positions from a single chromosome/contig. step : int, optional Plot a line for every `step` variants. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. start : int, optional The start position for the region to draw. stop : int, optional The stop position for the region to draw. flip : bool, optional Flip the plot upside down. line_kwargs : dict-like Additional keyword arguments passed through to `plt.Line2D`. Returns ------- ax : axes The axes on which the plot was drawn """ import matplotlib.pyplot as plt # check inputs pos = SortedIndex(pos, copy=False) # set up axes if ax is None: x = plt.rcParams['figure.figsize'][0] y = x / 7 fig, ax = plt.subplots(figsize=(x, y)) fig.tight_layout() # determine x axis limits if start is None: start = np.min(pos) if stop is None: stop = np.max(pos) loc = pos.locate_range(start, stop) pos = pos[loc] if step is None: step = len(pos) // 100 ax.set_xlim(start, stop) # plot the lines if line_kwargs is None: line_kwargs = dict() # line_kwargs.setdefault('linewidth', .5) n_variants = len(pos) for i, p in enumerate(pos[::step]): xfrom = p xto = ( start + ((i * step / n_variants) * (stop-start)) ) line = plt.Line2D([xfrom, xto], [0, 1], **line_kwargs) ax.add_line(line) # invert? if flip: ax.invert_yaxis() ax.xaxis.tick_top() else: ax.xaxis.tick_bottom() # tidy up ax.set_yticks([]) ax.xaxis.set_tick_params(direction='out') for spine in 'left', 'right': ax.spines[spine].set_visible(False) return ax
def windowed_watterson_theta(pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Calculate the value of Watterson's estimator in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- theta_hat_w : ndarray, float, shape (n_windows,) Watterson's estimator (theta hat per base). windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta( ... pos, ac, size=10, start=1, stop=31 ... ) >>> theta_hat_w array([0.10909091, 0.16363636, 0.04958678]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # flake8: noqa # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # locate segregating variants is_seg = ac.is_segregating() # count segregating variants in windows S, windows, counts = windowed_statistic(pos, is_seg, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # absolute value of Watterson's theta theta_hat_w_abs = S / a1 # theta per base theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows, is_accessible=is_accessible, fill=fill) return theta_hat_w, windows, n_bases, counts
def tajima_d(ac, pos=None, start=None, stop=None, min_sites=3): """Calculate the value of Tajima's D over a given region. Parameters ---------- ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. pos : array_like, int, shape (n_items,), optional Variant positions, using 1-based coordinates, in ascending order. start : int, optional The position at which to start (1-based). Defaults to the first position. stop : int, optional The position at which to stop (1-based). Defaults to the last position. min_sites : int, optional Minimum number of segregating sites for which to calculate a value. If there are fewer, np.nan is returned. Defaults to 3. Returns ------- D : float Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> allel.tajima_d(ac) 3.1445848780213814 >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> allel.tajima_d(ac, pos=pos, start=7, stop=25) 3.8779735196179366 """ # check inputs if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # deal with subregion if pos is not None and (start is not None or stop is not None): if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) loc = pos.locate_range(start, stop) ac = ac[loc] # count segregating variants S = ac.count_segregating() if S < min_sites: return np.nan # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # calculate Watterson's theta (absolute value) theta_hat_w_abs = S / a1 # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # calculate theta_hat pi (sum differences over variants) theta_hat_pi_abs = np.sum(mpd) # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference d = theta_hat_pi_abs - theta_hat_w_abs # calculate the denominator (standard deviation) a2 = np.sum(1 / (np.arange(1, n)**2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2)) e1 = c1 / a1 e2 = c2 / (a1**2 + a2) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) # finally calculate Tajima's D D = d / d_stdev return D
def plot_variant_locator(pos, step=None, ax=None, start=None, stop=None, flip=False, line_kwargs=None): """ Plot lines indicating the physical genome location of variants from a single chromosome/contig. By default the top x axis is in variant index space, and the bottom x axis is in genome position space. Parameters ---------- pos : array_like A sorted 1-dimensional array of genomic positions from a single chromosome/contig. step : int, optional Plot a line for every `step` variants. ax : axes, optional The axes on which to draw. If not provided, a new figure will be created. start : int, optional The start position for the region to draw. stop : int, optional The stop position for the region to draw. flip : bool, optional Flip the plot upside down. line_kwargs : dict-like Additional keyword arguments passed through to `plt.Line2D`. Returns ------- ax : axes The axes on which the plot was drawn """ import matplotlib.pyplot as plt # check inputs pos = SortedIndex(pos, copy=False) # set up axes if ax is None: x = plt.rcParams['figure.figsize'][0] y = x / 7 fig, ax = plt.subplots(figsize=(x, y)) fig.tight_layout() # determine x axis limits if start is None: start = np.min(pos) if stop is None: stop = np.max(pos) loc = pos.locate_range(start, stop) pos = pos[loc] if step is None: step = len(pos) // 100 ax.set_xlim(start, stop) # plot the lines if line_kwargs is None: line_kwargs = dict() # line_kwargs.setdefault('linewidth', .5) n_variants = len(pos) for i, p in enumerate(pos[::step]): xfrom = p xto = ( start + ((i * step / n_variants) * (stop-start)) ) l = plt.Line2D([xfrom, xto], [0, 1], **line_kwargs) ax.add_line(l) # invert? if flip: ax.invert_yaxis() ax.xaxis.tick_top() else: ax.xaxis.tick_bottom() # tidy up ax.set_yticks([]) ax.xaxis.set_tick_params(direction='out') for l in 'left', 'right': ax.spines[l].set_visible(False) return ax