def windowed_weir_cockerham_fst(pos, g, subpops, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan, max_allele=None): """Estimate average Fst in windows over a single chromosome/contig, following the method of Weir and Cockerham (1984). Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. g : array_like, int, shape (n_variants, n_samples, ploidy) Genotype array. subpops : sequence of sequences of ints Sample indices for each subpopulation. size : int The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where there are no variants within a window. max_allele : int, optional The highest allele index to consider. Returns ------- fst : ndarray, float, shape (n_windows,) Average Fst in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. """ # compute values per-variant a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele) # define the statistic to compute within each window def average_fst(wa, wb, wc): return np.nansum(wa) / (np.nansum(wa) + np.nansum(wb) + np.nansum(wc)) # calculate average Fst in windows fst, windows, counts = windowed_statistic(pos, values=(a, b, c), statistic=average_fst, size=size, start=start, stop=stop, step=step, windows=windows, fill=fill) return fst, windows, counts
def windowed_patterson_fst(pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan): """Estimate average Fst in windows over a single chromosome/contig, following the method of Patterson (2012). Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array from the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where there are no variants within a window. Returns ------- fst : ndarray, float, shape (n_windows,) Average Fst in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. """ # compute values per-variants num, den = patterson_fst(ac1, ac2) # define the statistic to compute within each window def average_fst(wn, wd): return np.nansum(wn) / np.nansum(wd) # calculate average Fst in windows fst, windows, counts = windowed_statistic(pos, values=(num, den), statistic=average_fst, size=size, start=start, stop=stop, step=step, windows=windows, fill=fill) return fst, windows, counts
def windowed_r_squared(pos, gn, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan, percentile=50): """Summarise linkage disequilibrium in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) The item positions in ascending order, using 1-based coordinates.. gn : array_like, int8, shape (n_variants, n_samples) Diploid genotypes at biallelic variants, coded as the number of alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where a window is empty, i.e., contains no items. percentile : int or sequence of ints, optional The percentile or percentiles to calculate within each window. Returns ------- out : ndarray, shape (n_windows,) The value of the statistic for each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) The number of items in each window. Notes ----- Linkage disequilibrium (r**2) is calculated using the method of Rogers and Huff (2008). See Also -------- allel.stats.window.windowed_statistic """ # define the statistic function if isinstance(percentile, (list, tuple)): fill = [fill for _ in percentile] def statistic(gnw): r_squared = rogers_huff_r(gnw)**2 return [np.percentile(r_squared, p) for p in percentile] else: def statistic(gnw): r_squared = rogers_huff_r(gnw)**2 return np.percentile(r_squared, percentile) return windowed_statistic(pos, gn, statistic, size, start=start, stop=stop, step=step, windows=windows, fill=fill)
def roh_poissonhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transition=1e-3, window_size=1000, min_roh=0, is_accessible=None, contig_size=None): """Call ROH (runs of homozygosity) in a single individual given a genotype vector. This function computes the likely ROH using a Poisson HMM model. The chromosome is divided into equally accessible windows of specified size, then the number of hets observed in each is used to fit a Poisson HMM. Note this is much faster than `roh_mhmm`, but at the cost of some resolution. The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one or more probabilities of observing a het in a non-ROH, as this probability may not be constant across the genome (`phet_nonroh`). Parameters ---------- gv : array_like, int, shape (n_variants, ploidy) Genotype vector. pos: array_like, int, shape (n_variants,) Positions of variants, same 0th dimension as `gv`. phet_roh: float, optional Probability of observing a heterozygote in a ROH. Appropriate values will depend on de novo mutation rate and genotype error rate. phet_nonroh: tuple of floats, optional One or more probabilites of observing a heterozygote outside of ROH. Appropriate values will depend primarily on nucleotide diversity within the population, but also on mutation rate and genotype error rate. transition: float, optional Probability of moving between states. This is based on windows, so a larger window size may call for a larger transitional probability window_size: integer, optional Window size (equally accessible bases) to consider as a potential ROH. Setting this window too small may result in spurious ROH calls, while too large will result in a lack of resolution. min_roh: integer, optional Minimum size (bp) to condsider as a ROH. Will depend on contig size and recombination rate. is_accessible: array_like, bool, shape (`contig_size`,), optional Boolean array for each position in contig describing whether accessible or not. Although optional, highly recommended so invariant sites are distinguishable from sites where variation is inaccessible contig_size: integer, optional If is_accessible is not available, use this to specify the size of the contig, and assume all sites are accessible. Returns ------- df_roh: DataFrame Data frame where each row describes a run of homozygosity. Columns are 'start', 'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive. froh: float Proportion of genome in a ROH. Notes ----- This function requires `pomegranate` (>= 0.9.0) to be installed. """ from pomegranate import HiddenMarkovModel, PoissonDistribution # equally accessbile windows if is_accessible is None: if contig_size is None: raise ValueError( "If is_accessibile argument is not provided, you must provide contig_size" ) is_accessible = np.ones((contig_size, ), dtype="bool") else: contig_size = is_accessible.size eqw = equally_accessible_windows(is_accessible, window_size) ishet = GenotypeVector(gv).is_het() counts, wins, records = windowed_statistic(pos, ishet, np.sum, windows=eqw) # heterozygote probabilities het_px = np.concatenate([(phet_roh, ), phet_nonroh]) # start probabilities (all equal) start_prob = np.repeat(1 / het_px.size, het_px.size) # transition between underlying states transition_mx = _hmm_derive_transition_matrix(transition, het_px.size) dists = [PoissonDistribution(x * window_size) for x in het_px] model = HiddenMarkovModel.from_matrix( transition_probabilities=transition_mx, distributions=dists, starts=start_prob) prediction = np.array(model.predict(counts[:, None])) df_blocks = tabulate_state_blocks(prediction, states=list(range(len(het_px)))) df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True) # adapt the dataframe for ROH df_roh["start"] = df_roh.start_ridx.apply(lambda y: eqw[y, 0]) df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: eqw[y, 1]) df_roh["length"] = df_roh.stop - df_roh.start # filter by ROH size if min_roh > 0: df_roh = df_roh[df_roh.length >= min_roh] # compute FROH froh = df_roh.length.sum() / contig_size return df_roh[["start", "stop", "length", "is_marginal"]], froh
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan): """Calculate the value of Tajima's D in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- D : ndarray, float, shape (n_windows,) Tajima's D. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> D, windows, counts = allel.stats.windowed_tajima_d( ... pos, ac, size=10, start=1, stop=31 ... ) >>> D array([ 0.59158014, 2.93397641, 6.12372436]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> counts array([3, 4, 2]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # calculate constants a1 = np.sum(1 / np.arange(1, n)) a2 = np.sum(1 / (np.arange(1, n) ** 2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2)) e1 = c1 / a1 e2 = c2 / (a1 ** 2 + a2) # locate segregating variants is_seg = ac.is_segregating() # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # define statistic to compute for each window # noinspection PyPep8Naming def statistic(w_is_seg, w_mpd): S = np.count_nonzero(w_is_seg) pi = np.sum(w_mpd) d = pi - (S / a1) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) wD = d / d_stdev return wD D, windows, counts = windowed_statistic( pos, values=(is_seg, mpd), statistic=statistic, size=size, start=start, stop=stop, step=step, windows=windows, fill=fill, ) return D, windows, counts
def windowed_watterson_theta( pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan ): """Calculate the value of Watterson's estimator in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- theta_hat_w : ndarray, float, shape (n_windows,) Watterson's estimator (theta hat per base). windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w, windows, n_bases, counts = allel.stats.windowed_watterson_theta( ... pos, ac, size=10, start=1, stop=31 ... ) >>> theta_hat_w array([ 0.10909091, 0.16363636, 0.04958678]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # flake8: noqa # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, "count_segregating"): ac = AlleleCountsArray(ac, copy=False) # locate segregating variants is_seg = ac.is_segregating() # count segregating variants in windows S, windows, counts = windowed_statistic( pos, is_seg, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0 ) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # absolute value of Watterson's theta theta_hat_w_abs = S / a1 # theta per base theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows, is_accessible=is_accessible, fill=fill) return theta_hat_w, windows, n_bases, counts
def windowed_df( pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan ): """Calculate the density of fixed differences between two populations in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- df : ndarray, float, shape (n_windows,) Per-base density of fixed differences in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. See Also -------- allel.model.locate_fixed_differences """ # check inputs pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # locate fixed differences loc_df = locate_fixed_differences(ac1, ac2) # count number of fixed differences in windows n_df, windows, counts = windowed_statistic( pos, values=loc_df, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0, ) # calculate value per base df, n_bases = per_base(n_df, windows, is_accessible=is_accessible, fill=fill) return df, windows, n_bases, counts
def windowed_divergence( pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan ): """Estimate nucleotide divergence between two populations in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- Dxy : ndarray, float, shape (n_windows,) Nucleotide divergence in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- Simplest case, two haplotypes in each population:: >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1], ... [-1, -1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> dxy, windows, n_bases, counts = windowed_divergence( ... pos, ac1, ac2, size=10, start=1, stop=31 ... ) >>> dxy array([ 0.15 , 0.225, 0. ]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # check inputs pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # calculate mean pairwise divergence mpd = mean_pairwise_difference_between(ac1, ac2, fill=0) # sum in windows mpd_sum, windows, counts = windowed_statistic( pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop, step=step, windows=windows, fill=0 ) # calculate value per base dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible, fill=fill) return dxy, windows, n_bases, counts
def windowed_r_squared(pos, gn, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan, percentile=50): """Summarise linkage disequilibrium in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) The item positions in ascending order, using 1-based coordinates.. gn : array_like, int8, shape (n_variants, n_samples) Diploid genotypes at biallelic variants, coded as the number of alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt). size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. fill : object, optional The value to use where a window is empty, i.e., contains no items. percentile : int or sequence of ints, optional The percentile or percentiles to calculate within each window. Returns ------- out : ndarray, shape (n_windows,) The value of the statistic for each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) The number of items in each window. Notes ----- Linkage disequilibrium (r**2) is calculated using the method of Rogers and Huff (2008). See Also -------- allel.stats.window.windowed_statistic """ # define the statistic function if isinstance(percentile, (list, tuple)): fill = [fill for _ in percentile] def statistic(gnw): r_squared = rogers_huff_r(gnw) ** 2 return [np.percentile(r_squared, p) for p in percentile] else: def statistic(gnw): r_squared = rogers_huff_r(gnw) ** 2 return np.percentile(r_squared, percentile) return windowed_statistic(pos, gn, statistic, size, start=start, stop=stop, step=step, windows=windows, fill=fill)
def windowed_watterson_theta(pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Calculate the value of Watterson's estimator in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- theta_hat_w : ndarray, float, shape (n_windows,) Watterson's estimator (theta hat per base). windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta( ... pos, ac, size=10, start=1, stop=31 ... ) >>> theta_hat_w array([0.10909091, 0.16363636, 0.04958678]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # flake8: noqa # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # locate segregating variants is_seg = ac.is_segregating() # count segregating variants in windows S, windows, counts = windowed_statistic(pos, is_seg, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # (n-1)th harmonic number a1 = np.sum(1 / np.arange(1, n)) # absolute value of Watterson's theta theta_hat_w_abs = S / a1 # theta per base theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows, is_accessible=is_accessible, fill=fill) return theta_hat_w, windows, n_bases, counts
def windowed_df(pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Calculate the density of fixed differences between two populations in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- df : ndarray, float, shape (n_windows,) Per-base density of fixed differences in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. See Also -------- allel.model.locate_fixed_differences """ # check inputs pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # locate fixed differences loc_df = locate_fixed_differences(ac1, ac2) # count number of fixed differences in windows n_df, windows, counts = windowed_statistic(pos, values=loc_df, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # calculate value per base df, n_bases = per_base(n_df, windows, is_accessible=is_accessible, fill=fill) return df, windows, n_bases, counts
def windowed_divergence(pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Estimate nucleotide divergence between two populations in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac1 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the first population. ac2 : array_like, int, shape (n_variants, n_alleles) Allele counts array for the second population. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- Dxy : ndarray, float, shape (n_windows,) Nucleotide divergence in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- Simplest case, two haplotypes in each population:: >>> import allel >>> h = allel.HaplotypeArray([[0, 0, 0, 0], ... [0, 0, 0, 1], ... [0, 0, 1, 1], ... [0, 1, 1, 1], ... [1, 1, 1, 1], ... [0, 0, 1, 2], ... [0, 1, 1, 2], ... [0, 1, -1, -1], ... [-1, -1, -1, -1]]) >>> ac1 = h.count_alleles(subpop=[0, 1]) >>> ac2 = h.count_alleles(subpop=[2, 3]) >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> dxy, windows, n_bases, counts = windowed_divergence( ... pos, ac1, ac2, size=10, start=1, stop=31 ... ) >>> dxy array([0.15 , 0.225, 0. ]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # check inputs pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # calculate mean pairwise divergence mpd = mean_pairwise_difference_between(ac1, ac2, fill=0) # sum in windows mpd_sum, windows, counts = windowed_statistic(pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop, step=step, windows=windows, fill=0) # calculate value per base dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible, fill=fill) return dxy, windows, n_bases, counts
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, min_sites=3): """Calculate the value of Tajima's D in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. min_sites : int, optional Minimum number of segregating sites for which to calculate a value. If there are fewer, np.nan is returned. Defaults to 3. Returns ------- D : ndarray, float, shape (n_windows,) Tajima's D. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27] >>> D, windows, counts = allel.windowed_tajima_d(pos, ac, size=20, step=10, start=1, stop=31) >>> D array([1.36521524, 4.22566622]) >>> windows array([[ 1, 20], [11, 31]]) >>> counts array([6, 6]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) if not hasattr(ac, 'count_segregating'): ac = AlleleCountsArray(ac, copy=False) # assume number of chromosomes sampled is constant for all variants n = ac.sum(axis=1).max() # calculate constants a1 = np.sum(1 / np.arange(1, n)) a2 = np.sum(1 / (np.arange(1, n)**2)) b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2)) e1 = c1 / a1 e2 = c2 / (a1**2 + a2) # locate segregating variants is_seg = ac.is_segregating() # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # define statistic to compute for each window # noinspection PyPep8Naming def statistic(w_is_seg, w_mpd): S = np.count_nonzero(w_is_seg) if S < min_sites: return np.nan pi = np.sum(w_mpd) d = pi - (S / a1) d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1))) wD = d / d_stdev return wD D, windows, counts = windowed_statistic(pos, values=(is_seg, mpd), statistic=statistic, size=size, start=start, stop=stop, step=step, windows=windows, fill=np.nan) return D, windows, counts
def windowed_diversity(pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan): """Estimate nucleotide diversity in windows over a single chromosome/contig. Parameters ---------- pos : array_like, int, shape (n_items,) Variant positions, using 1-based coordinates, in ascending order. ac : array_like, int, shape (n_variants, n_alleles) Allele counts array. size : int, optional The window size (number of bases). start : int, optional The position at which to start (1-based). stop : int, optional The position at which to stop (1-based). step : int, optional The distance between start positions of windows. If not given, defaults to the window size, i.e., non-overlapping windows. windows : array_like, int, shape (n_windows, 2), optional Manually specify the windows to use as a sequence of (window_start, window_stop) positions, using 1-based coordinates. Overrides the size/start/stop/step parameters. is_accessible : array_like, bool, shape (len(contig),), optional Boolean array indicating accessibility status for all positions in the chromosome/contig. fill : object, optional The value to use where a window is completely inaccessible. Returns ------- pi : ndarray, float, shape (n_windows,) Nucleotide diversity in each window. windows : ndarray, int, shape (n_windows, 2) The windows used, as an array of (window_start, window_stop) positions, using 1-based coordinates. n_bases : ndarray, int, shape (n_windows,) Number of (accessible) bases in each window. counts : ndarray, int, shape (n_windows,) Number of variants in each window. Examples -------- >>> import allel >>> g = allel.GenotypeArray([[[0, 0], [0, 0]], ... [[0, 0], [0, 1]], ... [[0, 0], [1, 1]], ... [[0, 1], [1, 1]], ... [[1, 1], [1, 1]], ... [[0, 0], [1, 2]], ... [[0, 1], [1, 2]], ... [[0, 1], [-1, -1]], ... [[-1, -1], [-1, -1]]]) >>> ac = g.count_alleles() >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27] >>> pi, windows, n_bases, counts = allel.windowed_diversity( ... pos, ac, size=10, start=1, stop=31 ... ) >>> pi array([0.11666667, 0.21666667, 0.09090909]) >>> windows array([[ 1, 10], [11, 20], [21, 31]]) >>> n_bases array([10, 10, 11]) >>> counts array([3, 4, 2]) """ # check inputs if not isinstance(pos, SortedIndex): pos = SortedIndex(pos, copy=False) is_accessible = asarray_ndim(is_accessible, 1, allow_none=True) # masking inaccessible sites from pos and ac pos, ac = mask_inaccessible(is_accessible, pos, ac) # calculate mean pairwise difference mpd = mean_pairwise_difference(ac, fill=0) # sum differences in windows mpd_sum, windows, counts = windowed_statistic( pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop, step=step, windows=windows, fill=0 ) # calculate value per base pi, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible, fill=fill) return pi, windows, n_bases, counts