Exemple #1
0
def moving_weir_cockerham_fst(g,
                              subpops,
                              size,
                              start=0,
                              stop=None,
                              step=None,
                              max_allele=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # compute the numerator and denominator in moving windows
    num = moving_statistic(a,
                           statistic=np.nansum,
                           size=size,
                           start=start,
                           stop=stop,
                           step=step)
    den = moving_statistic(a + b + c,
                           statistic=np.nansum,
                           size=size,
                           start=start,
                           stop=stop,
                           step=step)

    #  calculate fst in each window
    fst = num / den

    return fst
Exemple #2
0
def blockwise_weir_cockerham_fst(g, subpops, blen, max_allele=None):
    """Estimate average Fst and standard error using the block-jackknife.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    blen : int
        Block size (number of variants).
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # calculate overall estimate
    a_sum = np.nansum(a)
    b_sum = np.nansum(b)
    c_sum = np.nansum(c)
    fst = a_sum / (a_sum + b_sum + c_sum)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(a, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(a + b + c, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Exemple #3
0
def average_weir_cockerham_fst(g, subpops, blen, max_allele=None):
    """Estimate average Fst and standard error using the block-jackknife.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    blen : int
        Block size (number of variants).
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # calculate overall estimate
    a_sum = np.nansum(a)
    b_sum = np.nansum(b)
    c_sum = np.nansum(c)
    fst = a_sum / (a_sum + b_sum + c_sum)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(a, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(a + b + c, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Exemple #4
0
def moving_patterson_fst(ac1, ac2, size, start=0, stop=None, step=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Patterson (2012).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # compute the numerator and denominator in moving windows
    num_sum = moving_statistic(num,
                               statistic=np.nansum,
                               size=size,
                               start=start,
                               stop=stop,
                               step=step)
    den_sum = moving_statistic(den,
                               statistic=np.nansum,
                               size=size,
                               start=start,
                               stop=stop,
                               step=step)

    # calculate fst in each window
    fst = num_sum / den_sum

    return fst
Exemple #5
0
def moving_haplotype_diversity(h, size, start=0, stop=None, step=None):
    """Estimate haplotype diversity in moving windows.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    hd : ndarray, float, shape (n_windows,)
        Haplotype diversity.

    """

    hd = moving_statistic(values=h, statistic=haplotype_diversity, size=size, start=start, stop=stop, step=step)
    return hd
def moving_haplotype_diversity(h, size, start=0, stop=None, step=None):
    """Estimate haplotype diversity in moving windows.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    hd : ndarray, float, shape (n_windows,)
        Haplotype diversity.

    """

    hd = moving_statistic(values=h,
                          statistic=haplotype_diversity,
                          size=size,
                          start=start,
                          stop=stop,
                          step=step)
    return hd
Exemple #7
0
def blockwise_patterson_fst(ac1, ac2, blen):
    """Estimate average Fst between two populations and standard error using
    the block-jackknife.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    blen : int
        Block size (number of variants).

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # calculate overall estimate
    fst = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Exemple #8
0
def average_patterson_fst(ac1, ac2, blen):
    """Estimate average Fst between two populations and standard error using
    the block-jackknife.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    blen : int
        Block size (number of variants).

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # calculate overall estimate
    fst = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Exemple #9
0
def moving_tajima_d(ac, size, start=0, stop=None, step=None, min_sites=3):
    """Calculate the value of Tajima's D in moving windows of `size` variants.


    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    min_sites : int, optional
        Minimum number of segregating sites for which to calculate a value. If
        there are fewer, np.nan is returned. Defaults to 3.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Tajima's D.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> D = allel.moving_tajima_d(ac, size=4, step=2)
    >>> D
    array([0.1676558 , 2.01186954, 5.70029703])

    """

    d = moving_statistic(values=ac,
                         statistic=tajima_d,
                         size=size,
                         start=start,
                         stop=stop,
                         step=step,
                         min_sites=min_sites)
    return d
Exemple #10
0
def moving_weir_cockerham_fst(g, subpops, size, start=0, stop=None, step=None, max_allele=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # compute the numerator and denominator in moving windows
    num = moving_statistic(a, statistic=np.nansum, size=size, start=start, stop=stop, step=step)
    den = moving_statistic(a + b + c, statistic=np.nansum, size=size, start=start, stop=stop, step=step)

    #  calculate fst in each window
    fst = num / den

    return fst
Exemple #11
0
def moving_patterson_fst(ac1, ac2, size, start=0, stop=None, step=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Patterson (2012).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # compute the numerator and denominator in moving windows
    num_sum = moving_statistic(num, statistic=np.nansum, size=size, start=start, stop=stop, step=step)
    den_sum = moving_statistic(den, statistic=np.nansum, size=size, start=start, stop=stop, step=step)

    # calculate fst in each window
    fst = num_sum / den_sum

    return fst
Exemple #12
0
def moving_tajima_d(ac, size, start=0, stop=None, step=None):
    """Calculate the value of Tajima's D in moving windows of `size` variants.


    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Tajima's D.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> D = allel.stats.moving_tajima_d(ac, size=3)
    >>> D
    array([ 0.59158014,  1.89305645,  5.79748537])

    """

    d = moving_statistic(values=ac,
                         statistic=tajima_d,
                         size=size,
                         start=start,
                         stop=stop,
                         step=step)
    return d
Exemple #13
0
def moving_garud_h(h, size, start=0, stop=None, step=None):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015), in moving windows,

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    h1 : ndarray, float, shape (n_windows,)
        H1 statistics (sum of squares of haplotype frequencies).
    h12 : ndarray, float, shape (n_windows,)
        H12 statistics (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : ndarray, float, shape (n_windows,)
        H123 statistics (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : ndarray, float, shape (n_windows,)
        H2/H1 statistics, indicating the "softness" of a sweep.

    """

    gh = moving_statistic(values=h,
                          statistic=garud_h,
                          size=size,
                          start=start,
                          stop=stop,
                          step=step)

    h1 = gh[:, 0]
    h12 = gh[:, 1]
    h123 = gh[:, 2]
    h2_h1 = gh[:, 3]

    return h1, h12, h123, h2_h1
Exemple #14
0
def moving_tajima_d(ac, size, start=0, stop=None, step=None):
    """Calculate the value of Tajima's D in moving windows of `size` variants.


    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> D = allel.stats.moving_tajima_d(ac, size=3)
    >>> D
    array([ 0.59158014,  1.89305645,  5.79748537])

    """

    D = moving_statistic(values=ac, statistic=tajima_d, size=size, start=start, stop=stop, step=step)
    return D
Exemple #15
0
def moving_garud_h(h, size, start=0, stop=None, step=None):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015), in moving windows,

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    h1 : ndarray, float, shape (n_windows,)
        H1 statistics (sum of squares of haplotype frequencies).
    h12 : ndarray, float, shape (n_windows,)
        H12 statistics (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : ndarray, float, shape (n_windows,)
        H123 statistics (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : ndarray, float, shape (n_windows,)
        H2/H1 statistics, indicating the "softness" of a sweep.

    """

    gh = moving_statistic(values=h, statistic=garud_h, size=size, start=start,
                          stop=stop, step=step)

    h1 = gh[:, 0]
    h12 = gh[:, 1]
    h123 = gh[:, 2]
    h2_h1 = gh[:, 3]

    return h1, h12, h123, h2_h1
Exemple #16
0
def average_patterson_d(aca, acb, acc, acd, blen):
    """Estimate D(A, B; C, D) and standard error using the block-jackknife.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    blen : int
        Block size (number of variants).

    Returns
    -------
    d : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    allel.stats.admixture.patterson_d

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall estimate
    d_avg = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    # compute Z score
    z = d_avg / se

    return d_avg, se, z, vb, vj
Exemple #17
0
def average_patterson_f3(acc, aca, acb, blen, normed=True):
    """Estimate F3(C; A, B) and standard error using the block-jackknife.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    blen : int
        Block size (number of variants).
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    allel.stats.admixture.patterson_f3

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall value of statistic
    if normed:
        f3 = np.nansum(T) / np.nansum(B)
    else:
        f3 = np.nanmean(T)

    # calculate value of statistic within each block
    if normed:
        T_bsum = moving_statistic(T, statistic=np.nansum, size=blen)
        B_bsum = moving_statistic(B, statistic=np.nansum, size=blen)
        vb = T_bsum / B_bsum
        _, se, vj = jackknife((T_bsum, B_bsum),
                              statistic=lambda t, b: np.sum(t) / np.sum(b))

    else:
        vb = moving_statistic(T, statistic=np.nanmean, size=blen)
        _, se, vj = jackknife(vb, statistic=np.mean)

    # compute Z score
    z = f3 / se

    return f3, se, z, vb, vj
Exemple #18
0
def moving_patterson_d(aca,
                       acb,
                       acc,
                       acd,
                       size,
                       start=0,
                       stop=None,
                       step=None):
    """Estimate D(A, B; C, D) in moving windows.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Estimated value of the statistic in each window.

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # compute the numerator and denominator within each window
    num_sum = moving_statistic(num,
                               statistic=np.nansum,
                               size=size,
                               start=start,
                               stop=stop,
                               step=step)
    den_sum = moving_statistic(den,
                               statistic=np.nansum,
                               size=size,
                               start=start,
                               stop=stop,
                               step=step)

    # calculate the statistic values in each block
    d = num_sum / den_sum

    return d
Exemple #19
0
def moving_patterson_f3(acc,
                        aca,
                        acb,
                        size,
                        start=0,
                        stop=None,
                        step=None,
                        normed=True):
    """Estimate F3(C; A, B) in moving windows.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : ndarray, float, shape (n_windows,)
        Estimated value of the statistic in each window.

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # calculate value of statistic within each block
    if normed:
        T_bsum = moving_statistic(T,
                                  statistic=np.nansum,
                                  size=size,
                                  start=start,
                                  stop=stop,
                                  step=step)
        B_bsum = moving_statistic(B,
                                  statistic=np.nansum,
                                  size=size,
                                  start=start,
                                  stop=stop,
                                  step=step)
        f3 = T_bsum / B_bsum

    else:
        f3 = moving_statistic(T,
                              statistic=np.nanmean,
                              size=size,
                              start=start,
                              stop=stop,
                              step=step)

    return f3
Exemple #20
0
def blockwise_patterson_f3(acc, aca, acb, blen, normed=True):
    """Estimate F3(C; A, B) and standard error using the block-jackknife.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    blen : int
        Block size (number of variants).
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    allel.stats.admixture.patterson_f3

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall value of statistic
    if normed:
        f3 = np.nansum(T) / np.nansum(B)
    else:
        f3 = np.nanmean(T)

    # calculate value of statistic within each block
    if normed:
        T_bsum = moving_statistic(T, statistic=np.nansum, size=blen)
        B_bsum = moving_statistic(B, statistic=np.nansum, size=blen)
        vb = T_bsum / B_bsum
        _, se, vj = jackknife((T_bsum, B_bsum),
                              statistic=lambda t, b: np.sum(t) / np.sum(b))

    else:
        vb = moving_statistic(T, statistic=np.nanmean, size=blen)
        _, se, vj = jackknife(vb, statistic=np.mean)

    # compute Z score
    z = f3 / se

    return f3, se, z, vb, vj
Exemple #21
0
def blockwise_patterson_d(aca, acb, acc, acd, blen):
    """Estimate D(A, B; C, D) and standard error using the block-jackknife.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    blen : int
        Block size (number of variants).

    Returns
    -------
    d : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    allel.stats.admixture.patterson_d

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall estimate
    d = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    # compute Z score
    z = d / se

    return d, se, z, vb, vj
Exemple #22
0
def plot_moving_haplotype_frequencies(pos,
                                      h,
                                      size,
                                      start=0,
                                      stop=None,
                                      n=None,
                                      palette='Paired',
                                      singleton_color='w',
                                      ax=None):
    """Plot haplotype frequencies in moving windows over the genome.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    n : int, optional
        Color only the `n` most frequent haplotypes (by default, all
        non-singleton haplotypes are colored).
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes

    """

    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns

    # setup figure
    if ax is None:
        fig, ax = plt.subplots()

    # compute haplotype frequencies
    # N.B., here we use a haplotype rank data structure to enable the use of
    # pcolormesh() which is a lot faster than any other type of plotting
    # function
    hr = moving_hfs_rank(h, size=size, start=start, stop=stop)

    # truncate to n most common haplotypes
    if n:
        hr[hr > n] = 0

    # compute window start and stop positions
    windows = moving_statistic(pos,
                               statistic=lambda v: (v[0], v[-1]),
                               size=size,
                               start=start,
                               stop=stop)

    # create color map
    colors = [singleton_color] + sns.color_palette(palette, n_colors=hr.max())
    cmap = mpl.colors.ListedColormap(colors)

    # draw colors
    x = np.append(windows[:, 0], windows[-1, -1])
    y = np.arange(h.shape[1] + 1)
    ax.pcolormesh(x, y, hr.T, cmap=cmap)

    # tidy up
    ax.set_xlim(windows[0, 0], windows[-1, -1])
    ax.set_ylim(0, h.shape[1])
    ax.set_ylabel('haplotype count')
    ax.set_xlabel('position (bp)')

    return ax
Exemple #23
0
def plot_moving_haplotype_frequencies(
    pos, h, size, start=0, stop=None, n=None, palette="Paired", singleton_color="w", ax=None
):
    """Plot haplotype frequencies in moving windows over the genome.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    n : int, optional
        Color only the `n` most frequent haplotypes (by default, all
        non-singleton haplotypes are colored).
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes

    """

    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns

    # setup figure
    if ax is None:
        fig, ax = plt.subplots()

    # compute haplotype frequencies
    # N.B., here we use a haplotype rank data structure to enable the use of
    # pcolormesh() which is a lot faster than any other type of plotting
    # function
    hr = moving_hfs_rank(h, size=size, start=start, stop=stop)

    # truncate to n most common haplotypes
    if n:
        hr[hr > n] = 0

    # compute window start and stop positions
    windows = moving_statistic(pos, statistic=lambda x: (x[0], x[-1]), size=size, start=start, stop=stop)

    # create color map
    colors = [singleton_color] + sns.color_palette(palette, n_colors=hr.max())
    cmap = mpl.colors.ListedColormap(colors)

    # draw colors
    x = np.append(windows[:, 0], windows[-1, -1])
    y = np.arange(h.shape[1] + 1)
    ax.pcolormesh(x, y, hr.T, cmap=cmap)

    # tidy up
    ax.set_xlim(windows[0, 0], windows[-1, -1])
    ax.set_ylim(0, h.shape[1])
    ax.set_ylabel("haplotype count")
    ax.set_xlabel("position (bp)")

    return ax