Example #1
0
def windowed_weir_cockerham_fst(pos, g, subpops, size=None, start=None,
                                stop=None, step=None, windows=None,
                                fill=np.nan, max_allele=None):
    """Estimate average Fst in windows over a single chromosome/contig,
    following the method of Weir and Cockerham (1984).

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    size : int
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where there are no variants within a window.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    """

    # compute values per-variant
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # define the statistic to compute within each window
    def average_fst(wa, wb, wc):
        return np.nansum(wa) / (np.nansum(wa) + np.nansum(wb) + np.nansum(wc))

    # calculate average Fst in windows
    fst, windows, counts = windowed_statistic(pos, values=(a, b, c),
                                              statistic=average_fst,
                                              size=size, start=start,
                                              stop=stop, step=step,
                                              windows=windows, fill=fill)

    return fst, windows, counts
Example #2
0
def windowed_patterson_fst(pos, ac1, ac2, size=None, start=None, stop=None,
                           step=None, windows=None, fill=np.nan):
    """Estimate average Fst in windows over a single chromosome/contig,
    following the method of Patterson (2012).

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where there are no variants within a window.

    Returns
    -------

    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    """

    # compute values per-variants
    num, den = patterson_fst(ac1, ac2)

    # define the statistic to compute within each window
    def average_fst(wn, wd):
        return np.nansum(wn) / np.nansum(wd)

    # calculate average Fst in windows
    fst, windows, counts = windowed_statistic(pos, values=(num, den),
                                              statistic=average_fst,
                                              size=size, start=start,
                                              stop=stop, step=step,
                                              windows=windows, fill=fill)

    return fst, windows, counts
Example #3
0
def windowed_r_squared(pos,
                       gn,
                       size=None,
                       start=None,
                       stop=None,
                       step=None,
                       windows=None,
                       fill=np.nan,
                       percentile=50):
    """Summarise linkage disequilibrium in windows over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates..
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where a window is empty, i.e., contains no items.
    percentile : int or sequence of ints, optional
        The percentile or percentiles to calculate within each window.

    Returns
    -------
    out : ndarray, shape (n_windows,)
        The value of the statistic for each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.

    Notes
    -----
    Linkage disequilibrium (r**2) is calculated using the method of Rogers
    and Huff (2008).

    See Also
    --------

    allel.stats.window.windowed_statistic

    """

    # define the statistic function
    if isinstance(percentile, (list, tuple)):
        fill = [fill for _ in percentile]

        def statistic(gnw):
            r_squared = rogers_huff_r(gnw)**2
            return [np.percentile(r_squared, p) for p in percentile]

    else:

        def statistic(gnw):
            r_squared = rogers_huff_r(gnw)**2
            return np.percentile(r_squared, percentile)

    return windowed_statistic(pos,
                              gn,
                              statistic,
                              size,
                              start=start,
                              stop=stop,
                              step=step,
                              windows=windows,
                              fill=fill)
Example #4
0
def roh_poissonhmm(gv,
                   pos,
                   phet_roh=0.001,
                   phet_nonroh=(0.0025, 0.01),
                   transition=1e-3,
                   window_size=1000,
                   min_roh=0,
                   is_accessible=None,
                   contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Poisson HMM model. The chromosome is divided into
    equally accessible windows of specified size, then the number of hets observed in each is used
    to fit a Poisson HMM. Note this is much faster than `roh_mhmm`, but at the cost of some
    resolution.

    The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one
    or more probabilities of observing a het in a non-ROH, as this probability may not be
    constant across the genome (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos: array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh: float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh: tuple of floats, optional
        One or more probabilites of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity within
        the population, but also on mutation rate and genotype error rate.
    transition: float, optional
        Probability of moving between states. This is based on windows, so a larger window size may
        call for a larger transitional probability
    window_size: integer, optional
        Window size (equally accessible bases) to consider as a potential ROH. Setting this window
        too small may result in spurious ROH calls, while too large will result in a lack of
        resolution.
    min_roh: integer, optional
        Minimum size (bp) to condsider as a ROH. Will depend on contig size and recombination rate.
    is_accessible: array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether accessible
        or not. Although optional, highly recommended so invariant sites are distinguishable from
        sites where variation is inaccessible
    contig_size: integer, optional
        If is_accessible is not available, use this to specify the size of the contig, and assume
        all sites are accessible.


    Returns
    -------
    df_roh: DataFrame
        Data frame where each row describes a run of homozygosity. Columns are 'start',
        'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive.
    froh: float
        Proportion of genome in a ROH.

    Notes
    -----
    This function requires `pomegranate` (>= 0.9.0) to be installed.

    """

    from pomegranate import HiddenMarkovModel, PoissonDistribution

    # equally accessbile windows
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessibile argument is not provided, you must provide contig_size"
            )
        is_accessible = np.ones((contig_size, ), dtype="bool")
    else:
        contig_size = is_accessible.size

    eqw = equally_accessible_windows(is_accessible, window_size)

    ishet = GenotypeVector(gv).is_het()
    counts, wins, records = windowed_statistic(pos, ishet, np.sum, windows=eqw)

    # heterozygote probabilities
    het_px = np.concatenate([(phet_roh, ), phet_nonroh])

    # start probabilities (all equal)
    start_prob = np.repeat(1 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    dists = [PoissonDistribution(x * window_size) for x in het_px]

    model = HiddenMarkovModel.from_matrix(
        transition_probabilities=transition_mx,
        distributions=dists,
        starts=start_prob)

    prediction = np.array(model.predict(counts[:, None]))

    df_blocks = tabulate_state_blocks(prediction,
                                      states=list(range(len(het_px))))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

    # adapt the dataframe for ROH
    df_roh["start"] = df_roh.start_ridx.apply(lambda y: eqw[y, 0])
    df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: eqw[y, 1])
    df_roh["length"] = df_roh.stop - df_roh.start

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH
    froh = df_roh.length.sum() / contig_size

    return df_roh[["start", "stop", "length", "is_marginal"]], froh
Example #5
0
def windowed_tajima_d(pos, ac, size=None, start=None, stop=None, step=None, windows=None, fill=np.nan):
    """Calculate the value of Tajima's D in windows over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> D, windows, counts = allel.stats.windowed_tajima_d(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> D
    array([ 0.59158014,  2.93397641,  6.12372436])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> counts
    array([3, 4, 2])

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # calculate constants
    a1 = np.sum(1 / np.arange(1, n))
    a2 = np.sum(1 / (np.arange(1, n) ** 2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)

    # locate segregating variants
    is_seg = ac.is_segregating()

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # define statistic to compute for each window
    # noinspection PyPep8Naming
    def statistic(w_is_seg, w_mpd):
        S = np.count_nonzero(w_is_seg)
        pi = np.sum(w_mpd)
        d = pi - (S / a1)
        d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))
        wD = d / d_stdev
        return wD

    D, windows, counts = windowed_statistic(
        pos,
        values=(is_seg, mpd),
        statistic=statistic,
        size=size,
        start=start,
        stop=stop,
        step=step,
        windows=windows,
        fill=fill,
    )

    return D, windows, counts
Example #6
0
def windowed_watterson_theta(
    pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Calculate the value of Watterson's estimator in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.stats.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([ 0.10909091,  0.16363636,  0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # locate segregating variants
    is_seg = ac.is_segregating()

    # count segregating variants in windows
    S, windows, counts = windowed_statistic(
        pos, is_seg, statistic=np.count_nonzero, size=size, start=start, stop=stop, step=step, windows=windows, fill=0
    )

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = np.sum(1 / np.arange(1, n))

    # absolute value of Watterson's theta
    theta_hat_w_abs = S / a1

    # theta per base
    theta_hat_w, n_bases = per_base(theta_hat_w_abs, windows=windows, is_accessible=is_accessible, fill=fill)

    return theta_hat_w, windows, n_bases, counts
Example #7
0
def windowed_df(
    pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Calculate the density of fixed differences between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------

    allel.model.locate_fixed_differences

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # locate fixed differences
    loc_df = locate_fixed_differences(ac1, ac2)

    # count number of fixed differences in windows
    n_df, windows, counts = windowed_statistic(
        pos,
        values=loc_df,
        statistic=np.count_nonzero,
        size=size,
        start=start,
        stop=stop,
        step=step,
        windows=windows,
        fill=0,
    )

    # calculate value per base
    df, n_bases = per_base(n_df, windows, is_accessible=is_accessible, fill=fill)

    return df, windows, n_bases, counts
Example #8
0
def windowed_divergence(
    pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Estimate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([ 0.15 ,  0.225,  0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise divergence
    mpd = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # sum in windows
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop, step=step, windows=windows, fill=0
    )

    # calculate value per base
    dxy, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible, fill=fill)

    return dxy, windows, n_bases, counts
Example #9
0
def windowed_r_squared(pos, gn, size=None, start=None, stop=None, step=None,
                       windows=None, fill=np.nan, percentile=50):
    """Summarise linkage disequilibrium in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates..
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where a window is empty, i.e., contains no items.
    percentile : int or sequence of ints, optional
        The percentile or percentiles to calculate within each window.

    Returns
    -------

    out : ndarray, shape (n_windows,)
        The value of the statistic for each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.

    Notes
    -----

    Linkage disequilibrium (r**2) is calculated using the method of Rogers
    and Huff (2008).

    See Also
    --------

    allel.stats.window.windowed_statistic

    """

    # define the statistic function
    if isinstance(percentile, (list, tuple)):
        fill = [fill for _ in percentile]

        def statistic(gnw):
            r_squared = rogers_huff_r(gnw) ** 2
            return [np.percentile(r_squared, p) for p in percentile]

    else:
        def statistic(gnw):
            r_squared = rogers_huff_r(gnw) ** 2
            return np.percentile(r_squared, percentile)

    return windowed_statistic(pos, gn, statistic, size, start=start,
                              stop=stop, step=step, windows=windows, fill=fill)
Example #10
0
def windowed_watterson_theta(pos,
                             ac,
                             size=None,
                             start=None,
                             stop=None,
                             step=None,
                             windows=None,
                             is_accessible=None,
                             fill=np.nan):
    """Calculate the value of Watterson's estimator in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([0.10909091, 0.16363636, 0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # locate segregating variants
    is_seg = ac.is_segregating()

    # count segregating variants in windows
    S, windows, counts = windowed_statistic(pos,
                                            is_seg,
                                            statistic=np.count_nonzero,
                                            size=size,
                                            start=start,
                                            stop=stop,
                                            step=step,
                                            windows=windows,
                                            fill=0)

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = np.sum(1 / np.arange(1, n))

    # absolute value of Watterson's theta
    theta_hat_w_abs = S / a1

    # theta per base
    theta_hat_w, n_bases = per_base(theta_hat_w_abs,
                                    windows=windows,
                                    is_accessible=is_accessible,
                                    fill=fill)

    return theta_hat_w, windows, n_bases, counts
Example #11
0
def windowed_df(pos,
                ac1,
                ac2,
                size=None,
                start=None,
                stop=None,
                step=None,
                windows=None,
                is_accessible=None,
                fill=np.nan):
    """Calculate the density of fixed differences between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------

    allel.model.locate_fixed_differences

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # locate fixed differences
    loc_df = locate_fixed_differences(ac1, ac2)

    # count number of fixed differences in windows
    n_df, windows, counts = windowed_statistic(pos,
                                               values=loc_df,
                                               statistic=np.count_nonzero,
                                               size=size,
                                               start=start,
                                               stop=stop,
                                               step=step,
                                               windows=windows,
                                               fill=0)

    # calculate value per base
    df, n_bases = per_base(n_df,
                           windows,
                           is_accessible=is_accessible,
                           fill=fill)

    return df, windows, n_bases, counts
Example #12
0
def windowed_divergence(pos,
                        ac1,
                        ac2,
                        size=None,
                        start=None,
                        stop=None,
                        step=None,
                        windows=None,
                        is_accessible=None,
                        fill=np.nan):
    """Estimate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([0.15 , 0.225, 0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise divergence
    mpd = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # sum in windows
    mpd_sum, windows, counts = windowed_statistic(pos,
                                                  values=mpd,
                                                  statistic=np.sum,
                                                  size=size,
                                                  start=start,
                                                  stop=stop,
                                                  step=step,
                                                  windows=windows,
                                                  fill=0)

    # calculate value per base
    dxy, n_bases = per_base(mpd_sum,
                            windows,
                            is_accessible=is_accessible,
                            fill=fill)

    return dxy, windows, n_bases, counts
Example #13
0
def windowed_tajima_d(pos,
                      ac,
                      size=None,
                      start=None,
                      stop=None,
                      step=None,
                      windows=None,
                      min_sites=3):
    """Calculate the value of Tajima's D in windows over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    min_sites : int, optional
        Minimum number of segregating sites for which to calculate a value. If
        there are fewer, np.nan is returned. Defaults to 3.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27]
    >>> D, windows, counts = allel.windowed_tajima_d(pos, ac, size=20, step=10, start=1, stop=31)
    >>> D
    array([1.36521524, 4.22566622])
    >>> windows
    array([[ 1, 20],
           [11, 31]])
    >>> counts
    array([6, 6])

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # calculate constants
    a1 = np.sum(1 / np.arange(1, n))
    a2 = np.sum(1 / (np.arange(1, n)**2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)

    # locate segregating variants
    is_seg = ac.is_segregating()

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # define statistic to compute for each window
    # noinspection PyPep8Naming
    def statistic(w_is_seg, w_mpd):
        S = np.count_nonzero(w_is_seg)
        if S < min_sites:
            return np.nan
        pi = np.sum(w_mpd)
        d = pi - (S / a1)
        d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))
        wD = d / d_stdev
        return wD

    D, windows, counts = windowed_statistic(pos,
                                            values=(is_seg, mpd),
                                            statistic=statistic,
                                            size=size,
                                            start=start,
                                            stop=stop,
                                            step=step,
                                            windows=windows,
                                            fill=np.nan)

    return D, windows, counts
Example #14
0
def windowed_diversity(pos, ac, size=None, start=None, stop=None, step=None,
                       windows=None, is_accessible=None, fill=np.nan):
    """Estimate nucleotide diversity in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    pi : ndarray, float, shape (n_windows,)
        Nucleotide diversity in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi, windows, n_bases, counts = allel.windowed_diversity(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> pi
    array([0.11666667, 0.21666667, 0.09090909])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """

    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    # masking inaccessible sites from pos and ac
    pos, ac = mask_inaccessible(is_accessible, pos, ac)

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # sum differences in windows
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=mpd, statistic=np.sum, size=size, start=start, stop=stop,
        step=step, windows=windows, fill=0
    )

    # calculate value per base
    pi, n_bases = per_base(mpd_sum, windows, is_accessible=is_accessible,
                           fill=fill)

    return pi, windows, n_bases, counts