Esempio n. 1
def moving_weir_cockerham_fst(g, subpops, size, start=0, stop=None, step=None,
                              max_allele=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # compute the numerator and denominator in moving windows; np.nansum
    # treats NaN per-variant values as zero so they do not poison a window
    window = dict(size=size, start=start, stop=stop, step=step)
    num = moving_statistic(a, statistic=np.nansum, **window)
    den = moving_statistic(a + b + c, statistic=np.nansum, **window)

    # calculate fst in each window as a ratio of windowed sums
    fst = num / den

    return fst
Esempio n. 2
def blockwise_weir_cockerham_fst(g, subpops, blen, max_allele=None):
    """Estimate average Fst and standard error using the block-jackknife.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    blen : int
        Block size (number of variants).
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # calculate overall estimate as a ratio of sums over all variants,
    # treating NaN per-variant values as zero
    a_sum = np.nansum(a)
    b_sum = np.nansum(b)
    c_sum = np.nansum(c)
    fst = a_sum / (a_sum + b_sum + c_sum)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(a, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(a + b + c, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error; the jackknife statistic is the ratio of
    # summed block numerators to summed block denominators
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Esempio n. 3
def average_weir_cockerham_fst(g, subpops, blen, max_allele=None):
    """Estimate average Fst and standard error using the block-jackknife.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    blen : int
        Block size (number of variants).
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # calculate overall estimate as a ratio of sums over all variants,
    # treating NaN per-variant values as zero
    a_sum = np.nansum(a)
    b_sum = np.nansum(b)
    c_sum = np.nansum(c)
    fst = a_sum / (a_sum + b_sum + c_sum)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(a, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(a + b + c, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error; the jackknife statistic is the ratio of
    # summed block numerators to summed block denominators
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Esempio n. 4
def moving_patterson_fst(ac1, ac2, size, start=0, stop=None, step=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Patterson (2012).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # compute the numerator and denominator in moving windows; np.nansum
    # treats NaN per-variant values as zero
    window = dict(size=size, start=start, stop=stop, step=step)
    num_sum = moving_statistic(num, statistic=np.nansum, **window)
    den_sum = moving_statistic(den, statistic=np.nansum, **window)

    # calculate fst in each window as a ratio of windowed sums
    fst = num_sum / den_sum

    return fst
Esempio n. 5
def moving_haplotype_diversity(h, size, start=0, stop=None, step=None):
    """Estimate haplotype diversity in moving windows.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    hd : ndarray, float, shape (n_windows,)
        Haplotype diversity.

    """

    # apply haplotype_diversity to each window of variants
    hd = moving_statistic(values=h, statistic=haplotype_diversity, size=size,
                          start=start, stop=stop, step=step)
    return hd
Esempio n. 6
def moving_haplotype_diversity(h, size, start=0, stop=None, step=None):
    """Estimate haplotype diversity in moving windows.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    hd : ndarray, float, shape (n_windows,)
        Haplotype diversity.

    """

    # apply haplotype_diversity to each window of variants
    hd = moving_statistic(values=h,
                          statistic=haplotype_diversity,
                          size=size,
                          start=start,
                          stop=stop,
                          step=step)
    return hd
Esempio n. 7
def blockwise_patterson_fst(ac1, ac2, blen):
    """Estimate average Fst between two populations and standard error using
    the block-jackknife.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    blen : int
        Block size (number of variants).

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # calculate overall estimate as a ratio of sums, treating NaNs as zero
    fst = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error; the jackknife statistic is the ratio of
    # summed block numerators to summed block denominators
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Esempio n. 8
def average_patterson_fst(ac1, ac2, blen):
    """Estimate average Fst between two populations and standard error using
    the block-jackknife.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    blen : int
        Block size (number of variants).

    Returns
    -------
    fst : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # calculate overall estimate as a ratio of sums, treating NaNs as zero
    fst = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error; the jackknife statistic is the ratio of
    # summed block numerators to summed block denominators
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    return fst, se, vb, vj
Esempio n. 9
def moving_tajima_d(ac, size, start=0, stop=None, step=None, min_sites=3):
    """Calculate the value of Tajima's D in moving windows of `size` variants.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    min_sites : int, optional
        Minimum number of segregating sites for which to calculate a value. If
        there are fewer, np.nan is returned. Defaults to 3.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Tajima's D.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> D = allel.moving_tajima_d(ac, size=4, step=2)
    >>> D
    array([0.1676558 , 2.01186954, 5.70029703])

    """

    # NOTE(review): min_sites is forwarded as an extra keyword argument on the
    # assumption that moving_statistic passes unknown keywords through to the
    # statistic (tajima_d) -- confirm against moving_statistic's signature.
    d = moving_statistic(values=ac, statistic=tajima_d, size=size,
                         start=start, stop=stop, step=step,
                         min_sites=min_sites)
    return d
Esempio n. 10
def moving_weir_cockerham_fst(g, subpops, size, start=0, stop=None, step=None, max_allele=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    max_allele : int, optional
        The highest allele index to consider.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    a, b, c = weir_cockerham_fst(g, subpops, max_allele=max_allele)

    # compute the numerator and denominator in moving windows; np.nansum
    # treats NaN per-variant values as zero so they do not poison a window
    num = moving_statistic(a, statistic=np.nansum, size=size,
                           start=start, stop=stop, step=step)
    den = moving_statistic(a + b + c, statistic=np.nansum, size=size,
                           start=start, stop=stop, step=step)

    # calculate fst in each window as a ratio of windowed sums
    fst = num / den

    return fst
Esempio n. 11
def moving_patterson_fst(ac1, ac2, size, start=0, stop=None, step=None):
    """Estimate average Fst in moving windows over a single chromosome/contig,
    following the method of Patterson (2012).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    fst : ndarray, float, shape (n_windows,)
        Average Fst in each window.

    """

    # calculate per-variant values
    num, den = patterson_fst(ac1, ac2)

    # compute the numerator and denominator in moving windows; np.nansum
    # treats NaN per-variant values as zero
    num_sum = moving_statistic(num, statistic=np.nansum, size=size,
                               start=start, stop=stop, step=step)
    den_sum = moving_statistic(den, statistic=np.nansum, size=size,
                               start=start, stop=stop, step=step)

    # calculate fst in each window as a ratio of windowed sums
    fst = num_sum / den_sum

    return fst
Esempio n. 12
def moving_tajima_d(ac, size, start=0, stop=None, step=None):
    """Calculate the value of Tajima's D in moving windows of `size` variants.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Tajima's D.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> D = allel.stats.moving_tajima_d(ac, size=3)
    >>> D
    array([ 0.59158014,  1.89305645,  5.79748537])

    """

    # apply tajima_d to each window of allele counts
    d = moving_statistic(values=ac, statistic=tajima_d, size=size,
                         start=start, stop=stop, step=step)
    return d
Esempio n. 13
def moving_garud_h(h, size, start=0, stop=None, step=None):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015), in moving windows.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    h1 : ndarray, float, shape (n_windows,)
        H1 statistics (sum of squares of haplotype frequencies).
    h12 : ndarray, float, shape (n_windows,)
        H12 statistics (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : ndarray, float, shape (n_windows,)
        H123 statistics (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : ndarray, float, shape (n_windows,)
        H2/H1 statistics, indicating the "softness" of a sweep.

    """

    # apply garud_h to each window; the result is indexed below as a 2-D
    # array with one row per window and one column per statistic
    gh = moving_statistic(values=h, statistic=garud_h, size=size, start=start,
                          stop=stop, step=step)

    # split the columns into the four named statistics
    h1 = gh[:, 0]
    h12 = gh[:, 1]
    h123 = gh[:, 2]
    h2_h1 = gh[:, 3]

    return h1, h12, h123, h2_h1
Esempio n. 14
def moving_tajima_d(ac, size, start=0, stop=None, step=None):
    """Calculate the value of Tajima's D in moving windows of `size` variants.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> D = allel.stats.moving_tajima_d(ac, size=3)
    >>> D
    array([ 0.59158014,  1.89305645,  5.79748537])

    """

    # apply tajima_d to each window of allele counts
    D = moving_statistic(values=ac, statistic=tajima_d, size=size,
                         start=start, stop=stop, step=step)
    return D
Esempio n. 15
def moving_garud_h(h, size, start=0, stop=None, step=None):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015), in moving windows.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    h1 : ndarray, float, shape (n_windows,)
        H1 statistics (sum of squares of haplotype frequencies).
    h12 : ndarray, float, shape (n_windows,)
        H12 statistics (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : ndarray, float, shape (n_windows,)
        H123 statistics (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : ndarray, float, shape (n_windows,)
        H2/H1 statistics, indicating the "softness" of a sweep.

    """

    # apply garud_h to each window; the result is indexed below as a 2-D
    # array with one row per window and one column per statistic
    gh = moving_statistic(values=h, statistic=garud_h, size=size, start=start,
                          stop=stop, step=step)

    # split the columns into the four named statistics
    h1 = gh[:, 0]
    h12 = gh[:, 1]
    h123 = gh[:, 2]
    h2_h1 = gh[:, 3]

    return h1, h12, h123, h2_h1
Esempio n. 16
def average_patterson_d(aca, acb, acc, acd, blen):
    """Estimate D(A, B; C, D) and standard error using the block-jackknife.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    blen : int
        Block size (number of variants).

    Returns
    -------
    d : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    patterson_d

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall estimate
    d_avg = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error; the jackknife statistic is the ratio of
    # summed block numerators to summed block denominators
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n, d: np.sum(n) / np.sum(d))

    # compute Z score
    z = d_avg / se

    return d_avg, se, z, vb, vj
Esempio n. 17
def average_patterson_f3(acc, aca, acb, blen, normed=True):
    """Estimate F3(C; A, B) and standard error using the block-jackknife.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    blen : int
        Block size (number of variants).
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    patterson_f3

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    if normed:
        # normalised: ratio of sums, overall and within each block
        f3 = np.nansum(T) / np.nansum(B)
        T_bsum = moving_statistic(T, statistic=np.nansum, size=blen)
        B_bsum = moving_statistic(B, statistic=np.nansum, size=blen)
        vb = T_bsum / B_bsum
        _, se, vj = jackknife((T_bsum, B_bsum),
                              statistic=lambda t, b: np.sum(t) / np.sum(b))
    else:
        # un-normalised: plain mean of T, overall and within each block
        f3 = np.nanmean(T)
        vb = moving_statistic(T, statistic=np.nanmean, size=blen)
        _, se, vj = jackknife(vb, statistic=np.mean)

    # compute Z score
    z = f3 / se

    return f3, se, z, vb, vj
Esempio n. 18
def moving_patterson_d(aca, acb, acc, acd, size, start=0, stop=None,
                       step=None):
    """Estimate D(A, B; C, D) in moving windows.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.

    Returns
    -------
    d : ndarray, float, shape (n_windows,)
        Estimated value of the statistic in each window.

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # compute the numerator and denominator within each window
    window = dict(size=size, start=start, stop=stop, step=step)
    num_sum = moving_statistic(num, statistic=np.nansum, **window)
    den_sum = moving_statistic(den, statistic=np.nansum, **window)

    # calculate the statistic values in each window
    d = num_sum / den_sum

    return d
Esempio n. 19
def moving_patterson_f3(acc, aca, acb, size, start=0, stop=None, step=None,
                        normed=True):
    """Estimate F3(C; A, B) in moving windows.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : ndarray, float, shape (n_windows,)
        Estimated value of the statistic in each window.

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # calculate value of statistic within each window
    window = dict(size=size, start=start, stop=stop, step=step)
    if normed:
        # normalised: ratio of windowed sums
        T_bsum = moving_statistic(T, statistic=np.nansum, **window)
        B_bsum = moving_statistic(B, statistic=np.nansum, **window)
        f3 = T_bsum / B_bsum
    else:
        # un-normalised: windowed mean of T
        f3 = moving_statistic(T, statistic=np.nanmean, **window)

    return f3
Esempio n. 20
def blockwise_patterson_f3(acc, aca, acb, blen, normed=True):
    """Estimate F3(C; A, B) and standard error using the block-jackknife.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).
    blen : int
        Block size (number of variants).
    normed : bool, optional
        If False, use un-normalised f3 values.

    Returns
    -------
    f3 : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    patterson_f3

    """

    # calculate per-variant values
    T, B = patterson_f3(acc, aca, acb)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    if normed:
        # normalised: ratio of sums, overall and within each block
        f3 = np.nansum(T) / np.nansum(B)
        T_bsum = moving_statistic(T, statistic=np.nansum, size=blen)
        B_bsum = moving_statistic(B, statistic=np.nansum, size=blen)
        vb = T_bsum / B_bsum
        _, se, vj = jackknife((T_bsum, B_bsum),
                              statistic=lambda t, b: np.sum(t) / np.sum(b))
    else:
        # un-normalised: plain mean of T, overall and within each block
        f3 = np.nanmean(T)
        vb = moving_statistic(T, statistic=np.nanmean, size=blen)
        _, se, vj = jackknife(vb, statistic=np.mean)

    # compute Z score
    z = f3 / se

    return f3, se, z, vb, vj
Esempio n. 21
def blockwise_patterson_d(aca, acb, acc, acd, blen):
    """Estimate D(A, B; C, D) and standard error using the block-jackknife.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.
    blen : int
        Block size (number of variants).

    Returns
    -------
    d : float
        Estimated value of the statistic using all data.
    se : float
        Estimated standard error.
    z : float
        Z-score (number of standard errors from zero).
    vb : ndarray, float, shape (n_blocks,)
        Value of the statistic in each block.
    vj : ndarray, float, shape (n_blocks,)
        Values of the statistic from block-jackknife resampling.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    See Also
    --------
    patterson_d

    """

    # calculate per-variant values
    num, den = patterson_d(aca, acb, acc, acd)

    # N.B., nans can occur if any of the populations have completely missing
    # genotype calls at a variant (i.e., allele number is zero). Here we
    # assume that is rare enough to be negligible.

    # calculate overall estimate
    d = np.nansum(num) / np.nansum(den)

    # compute the numerator and denominator within each block
    num_bsum = moving_statistic(num, statistic=np.nansum, size=blen)
    den_bsum = moving_statistic(den, statistic=np.nansum, size=blen)

    # calculate the statistic values in each block
    vb = num_bsum / den_bsum

    # estimate standard error; the lambda's own parameters are named so they
    # do not shadow the overall estimate `d` above
    _, se, vj = jackknife((num_bsum, den_bsum),
                          statistic=lambda n_blk, d_blk: np.sum(n_blk) / np.sum(d_blk))

    # compute Z score
    z = d / se

    return d, se, z, vb, vj
Esempio n. 22
def plot_moving_haplotype_frequencies(pos, h, size, start=0, stop=None, n=None,
                                      palette='Paired', singleton_color='w',
                                      ax=None):
    """Plot haplotype frequencies in moving windows over the genome.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    n : int, optional
        Color only the `n` most frequent haplotypes (by default, all
        non-singleton haplotypes are colored).
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns

    # setup figure
    if ax is None:
        _, ax = plt.subplots()

    # compute haplotype frequencies
    # N.B., here we use a haplotype rank data structure to enable the use of
    # pcolormesh() which is a lot faster than any other type of plotting
    # function
    hr = moving_hfs_rank(h, size=size, start=start, stop=stop)

    # truncate to n most common haplotypes; ranks beyond n are set to 0,
    # which maps to singleton_color (the first entry of the color map)
    if n:
        hr[hr > n] = 0

    # compute window start and stop positions
    windows = moving_statistic(pos,
                               statistic=lambda v: (v[0], v[-1]),
                               size=size, start=start, stop=stop)

    # create color map: index 0 is the singleton color, then one palette
    # color per remaining rank value
    colors = [singleton_color] + sns.color_palette(palette, n_colors=hr.max())
    cmap = mpl.colors.ListedColormap(colors)

    # draw colors; x holds the window edges (every window start plus the
    # final window stop), y the haplotype row edges
    x = np.append(windows[:, 0], windows[-1, -1])
    y = np.arange(h.shape[1] + 1)
    ax.pcolormesh(x, y, hr.T, cmap=cmap)

    # tidy up
    ax.set_xlim(windows[0, 0], windows[-1, -1])
    ax.set_ylim(0, h.shape[1])
    ax.set_ylabel('haplotype count')
    ax.set_xlabel('position (bp)')

    return ax
Esempio n. 23
def plot_moving_haplotype_frequencies(
    pos, h, size, start=0, stop=None, n=None, palette="Paired", singleton_color="w", ax=None
):
    """Plot haplotype frequencies in moving windows over the genome.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    size : int
        The window size (number of variants).
    start : int, optional
        The index at which to start.
    stop : int, optional
        The index at which to stop.
    n : int, optional
        Color only the `n` most frequent haplotypes (by default, all
        non-singleton haplotypes are colored).
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib as mpl
    import matplotlib.pyplot as plt
    import seaborn as sns

    # setup figure
    if ax is None:
        _, ax = plt.subplots()

    # compute haplotype frequencies
    # N.B., here we use a haplotype rank data structure to enable the use of
    # pcolormesh() which is a lot faster than any other type of plotting
    # function
    hr = moving_hfs_rank(h, size=size, start=start, stop=stop)

    # truncate to n most common haplotypes; ranks beyond n are set to 0,
    # which maps to singleton_color (the first entry of the color map)
    if n:
        hr[hr > n] = 0

    # compute window start and stop positions
    windows = moving_statistic(
        pos, statistic=lambda w: (w[0], w[-1]), size=size, start=start, stop=stop
    )

    # create color map: index 0 is the singleton color, then one palette
    # color per remaining rank value
    colors = [singleton_color] + sns.color_palette(palette, n_colors=hr.max())
    cmap = mpl.colors.ListedColormap(colors)

    # draw colors; x holds the window edges (every window start plus the
    # final window stop), y the haplotype row edges
    x = np.append(windows[:, 0], windows[-1, -1])
    y = np.arange(h.shape[1] + 1)
    ax.pcolormesh(x, y, hr.T, cmap=cmap)

    # tidy up
    ax.set_xlim(windows[0, 0], windows[-1, -1])
    ax.set_ylim(0, h.shape[1])
    ax.set_ylabel("haplotype count")
    ax.set_xlabel("position (bp)")

    return ax