Example #1
0
def thetah(pos, ac, start=None, stop=None, is_accessible=None):
    """Compute a per-base statistic from the squared counts of allele 1.

    For every variant with more than one observed allele (``n > 1``, where
    ``n = ac[i, 0] + ac[i, 1]``), the term ``p1**2 / (n * (n - 1))`` is
    accumulated with ``p1 = ac[i, 1]``. The total is doubled and divided by
    the number of bases in the region.

    NOTE(review): this looks like Fay and Wu's theta_H with column 1 holding
    derived allele counts -- confirm against callers.

    Parameters
    ----------
    pos : array_like
        Variant positions. Wrapped in a `SortedIndex` if not one already.
    ac : array_like, 2-dimensional
        Allele counts; only columns 0 and 1 are read.
    start : int, optional
        Region start, passed to ``pos.locate_range``. Defaults to the first
        position.
    stop : int, optional
        Region stop, passed to ``pos.locate_range``. Defaults to the last
        position.
    is_accessible : array_like, 1-dimensional, optional
        If given, the number of bases is the count of non-zero entries in
        ``is_accessible[start-1:stop]``; otherwise ``stop - start + 1``.

    Returns
    -------
    h : float
        The doubled sum divided by the number of bases.

    """
    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict to the requested region, then fill in missing bounds
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # accumulate the per-variant terms, skipping sites with n <= 1
    total = 0
    for counts in ac:
        p1 = counts[1]
        n = p1 + counts[0]
        if n > 1:
            total += (p1 * p1) / (n * (n - 1.0))
    total *= 2

    # number of bases used for normalisation
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start-1:stop])
    else:
        n_bases = stop - start + 1

    return total / n_bases
Example #2
0
    def test_constructor(self):
        """Check SortedIndex construction with invalid and valid inputs."""

        # missing data arg
        with self.assertRaises(TypeError):
            # noinspection PyArgumentList
            SortedIndex()

        # data has wrong dtype
        data = 'foo bar'
        with self.assertRaises(TypeError):
            SortedIndex(data)

        # data has wrong dimensions
        data = [[1, 2], [3, 4]]
        with self.assertRaises(TypeError):
            SortedIndex(data)

        # values are not sorted (integers)
        data = [2, 1, 3, 5]
        with self.assertRaises(ValueError):
            SortedIndex(data)

        # values are not sorted (floats)
        data = [4., 5., 3.7]
        with self.assertRaises(ValueError):
            SortedIndex(data)

        # valid data (unique)
        data = [1, 4, 5, 7, 12]
        idx = SortedIndex(data)
        aeq(data, idx)
        # np.int was removed in NumPy 1.24; np.dtype(int) is the dtype the
        # removed alias (builtin int) resolved to in this comparison
        eq(np.dtype(int), idx.dtype)
        eq(1, idx.ndim)
        eq(5, len(idx))
        assert idx.is_unique

        # valid data (non-unique)
        data = [1, 4, 5, 5, 7, 12]
        idx = SortedIndex(data)
        aeq(data, idx)
        eq(np.dtype(int), idx.dtype)
        eq(1, idx.ndim)
        eq(6, len(idx))
        assert not idx.is_unique

        # valid data (typed)
        data = [1, 4, 5, 5, 7, 12]
        idx = SortedIndex(data, dtype='u4')
        aeq(data, idx)
        eq(np.uint32, idx.dtype)

        # valid data (non-numeric); the strings are in lexicographic order
        data = ['1', '12', '4', '5', '5', '7']
        idx = SortedIndex(data)
        aeq(data, idx)
Example #3
0
def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None):
    """Build a condensed pairwise distance matrix between samples, using
    nucleotide divergence (a.k.a. Dxy) as the distance.

    Every unordered pair of samples is compared with `sequence_divergence`,
    in the order produced by ``itertools.combinations``.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions.
    gac : array_like, int, shape (n_variants, n_samples, n_alleles)
        Per-genotype allele counts.
    start : int, optional
        Start position of region to use.
    stop : int, optional
        Stop position of region to use.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------
    dist : ndarray
        Distance matrix in condensed form, one value per sample pair.

    See Also
    --------
    allel.model.ndarray.GenotypeArray.to_allele_counts

    """

    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    gac = asarray_ndim(gac, 3)

    # allele numbers per variant and sample, summed once rather than per pair
    gan = np.sum(gac, axis=2)
    n_samples = gac.shape[1]

    def divergence(i, j):
        # divergence between sample i and sample j over the requested region
        return sequence_divergence(pos,
                                   gac[:, i, ...],
                                   gac[:, j, ...],
                                   an1=gan[:, i],
                                   an2=gan[:, j],
                                   start=start,
                                   stop=stop,
                                   is_accessible=is_accessible)

    sample_pairs = itertools.combinations(range(n_samples), 2)
    return np.array([divergence(i, j) for i, j in sample_pairs])
Example #4
0
    def test_slice(self):
        """Check the types returned by slicing and by scalar indexing."""

        values = [1, 4, 5, 5, 7, 12]
        index = SortedIndex(values, dtype='u4')

        # a slice is still a SortedIndex
        tail = index[1:]
        assert_is_instance(tail, SortedIndex)

        # a single item is a numpy scalar, not a SortedIndex
        first = index[0]
        assert_is_instance(first, np.uint32)
        assert_not_is_instance(first, SortedIndex)
        eq(values[0], first)
Example #5
0
def maxFDA(pos, ac, start=None, stop=None, is_accessible=None):
    """Return the largest frequency of allele 1 over the variants in a region.

    For each variant the frequency is ``ac[i, 1] / (ac[i, 0] + ac[i, 1])``.
    Variants where that denominator is zero have no defined frequency and are
    skipped. Previously they contributed ``nan``, which made the result of
    ``max`` depend on the order of the variants.

    NOTE(review): column 1 is presumably the derived allele count -- confirm
    against callers.

    Parameters
    ----------
    pos : array_like
        Variant positions. Wrapped in a `SortedIndex` if not one already.
    ac : array_like, 2-dimensional
        Allele counts; only columns 0 and 1 are read.
    start : int, optional
        Region start, passed to ``pos.locate_range``.
    stop : int, optional
        Region stop, passed to ``pos.locate_range``.
    is_accessible : array_like, 1-dimensional, optional
        Validated but not otherwise used by this function.

    Returns
    -------
    max_daf : float
        The maximum frequency, or ``nan`` if there are variants but none of
        them has a non-zero denominator.

    Raises
    ------
    ValueError
        If the region contains no variants (raised by ``max`` on an empty
        list, as before).

    """
    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # deal with subregion
    if start is not None or stop is not None:
        loc = pos.locate_range(start, stop)
        pos = pos[loc]
        ac = ac[loc]
    # NOTE: start/stop are not used past this point; the lookups are kept so
    # that the behaviour on inputs without positions is unchanged.
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # calculate values of the stat, ignoring variants with a zero denominator
    dafs = []
    for counts in ac:
        p1 = counts[1]
        n = p1 + counts[0]
        if n > 0:
            dafs.append(p1 / float(n))

    if not dafs and len(ac) > 0:
        # every variant had a zero denominator: undefined, as before
        return float('nan')
    return max(dafs)
Example #6
0
 def setup_instance(self, data):
     """Build the object under test: a SortedIndex wrapping `data`."""
     instance = SortedIndex(data)
     return instance
Example #7
0
def windowed_diversity(pos,
                       ac,
                       size=None,
                       start=None,
                       stop=None,
                       step=None,
                       windows=None,
                       is_accessible=None,
                       fill=np.nan):
    """Estimate nucleotide diversity in windows over a single
    chromosome/contig.

    Inaccessible sites are first removed from `pos` and `ac`; the mean
    pairwise difference per variant is then summed within each window and
    converted to a per-base value.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    pi : ndarray, float, shape (n_windows,)
        Nucleotide diversity in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi, windows, n_bases, counts = allel.windowed_diversity(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> pi
    array([0.11666667, 0.21666667, 0.09090909])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # drop inaccessible sites from both the positions and the counts
    pos, ac = mask_inaccessible(is_accessible, pos, ac)

    # per-variant mean pairwise difference
    site_mpd = mean_pairwise_difference(ac, fill=0)

    # window definition shared with the windowing helper
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)

    # total difference per window; empty windows contribute zero
    mpd_sum, windows, counts = windowed_statistic(
        pos, values=site_mpd, statistic=np.sum, fill=0, **window_spec
    )

    # convert window totals to per-base values
    pi, n_bases = per_base(
        mpd_sum, windows, is_accessible=is_accessible, fill=fill
    )

    return pi, windows, n_bases, counts
Example #8
0
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Calculate the value of Watterson's estimator over a given region.

    The number of segregating variants is divided by the (n-1)th harmonic
    number, where n is the largest per-variant allele number, and the result
    is divided by the number of bases in the region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.stats.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict to the requested region, then fill in missing bounds
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # number of segregating variants
    n_segregating = ac.count_segregating()

    # number of chromosomes sampled, assumed constant across variants
    n_chromosomes = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n_chromosomes))

    # number of bases used for normalisation
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    # absolute estimate, then per base
    theta_hat_w_abs = n_segregating / harmonic
    return theta_hat_w_abs / n_bases
Example #9
0
def windowed_divergence(pos,
                        ac1,
                        ac2,
                        size=None,
                        start=None,
                        stop=None,
                        step=None,
                        windows=None,
                        is_accessible=None,
                        fill=np.nan):
    """Estimate nucleotide divergence between two populations in windows
    over a single chromosome/contig.

    The mean pairwise difference between the two populations is computed per
    variant, summed within each window, and converted to a per-base value.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([0.15 , 0.225, 0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # check inputs; an existing SortedIndex is used as-is rather than being
    # wrapped again
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # calculate mean pairwise divergence
    mpd = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # sum in windows; empty windows contribute zero
    mpd_sum, windows, counts = windowed_statistic(pos,
                                                  values=mpd,
                                                  statistic=np.sum,
                                                  size=size,
                                                  start=start,
                                                  stop=stop,
                                                  step=step,
                                                  windows=windows,
                                                  fill=0)

    # calculate value per base
    dxy, n_bases = per_base(mpd_sum,
                            windows,
                            is_accessible=is_accessible,
                            fill=fill)

    return dxy, windows, n_bases, counts
Example #10
0
def sequence_divergence(pos,
                        ac1,
                        ac2,
                        an1=None,
                        an2=None,
                        start=None,
                        stop=None,
                        is_accessible=None):
    """Estimate nucleotide divergence between two populations within a
    given region, which is the average proportion of sites (including
    monomorphic sites not present in the data) that differ between randomly
    chosen pairs of chromosomes, one from each population.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    start : int, optional
        The position at which to start (1-based). Defaults to the first position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    Dxy : float
        Nucleotide divergence: the summed mean pairwise difference between
        the populations divided by the number of (accessible) bases.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    have_an1 = an1 is not None
    have_an2 = an2 is not None
    if have_an1:
        an1 = asarray_ndim(an1, 1)
    if have_an2:
        an2 = asarray_ndim(an2, 1)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict every per-variant array to the requested region
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac1 = ac1[region]
        ac2 = ac2[region]
        if have_an1:
            an1 = an1[region]
        if have_an2:
            an2 = an2[region]

    # fill in missing bounds from the remaining positions
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # mean pairwise difference between the populations, summed over variants
    between = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2,
                                               fill=0)
    total = np.sum(between)

    # number of bases used for normalisation, N.B., expect pos is 1-based
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total / n_bases
Example #11
0
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate nucleotide diversity within a given region, which is the
    average proportion of sites (including monomorphic sites not present in the
    data) that differ between randomly chosen pairs of chromosomes.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). Defaults to the first position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    pi : float
        Nucleotide diversity: the summed mean pairwise difference divided by
        the number of (accessible) bases.

    Notes
    -----

    If start and/or stop are not provided, the span between the first and
    the last position stands in for the total number of sites, which can
    overestimate the sequence diversity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict to the requested region, then fill in missing bounds
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # mean pairwise difference per variant, summed over variants
    total = np.sum(mean_pairwise_difference(ac, fill=0))

    # number of bases used for normalisation
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total / n_bases
Example #12
0
def windowed_tajima_d(pos,
                      ac,
                      size=None,
                      start=None,
                      stop=None,
                      step=None,
                      windows=None,
                      min_sites=3):
    """Calculate the value of Tajima's D in windows over a single
    chromosome/contig.

    Parameters
    ----------
    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    min_sites : int, optional
        Minimum number of segregating sites for which to calculate a value. If
        there are fewer, np.nan is returned. Defaults to 3.

    Returns
    -------
    D : ndarray, float, shape (n_windows,)
        Tajima's D.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 20, 22, 25, 27]
    >>> D, windows, counts = allel.windowed_tajima_d(pos, ac, size=20, step=10, start=1, stop=31)
    >>> D
    array([1.36521524, 4.22566622])
    >>> windows
    array([[ 1, 20],
           [11, 31]])
    >>> counts
    array([6, 6])

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # number of chromosomes sampled, assumed constant across variants
    n = ac.sum(axis=1).max()

    # constants of the statistic, all functions of n only
    k = np.arange(1, n)
    a1 = np.sum(1 / k)
    a2 = np.sum(1 / (k**2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)

    # per-variant inputs to the window statistic
    is_seg = ac.is_segregating()
    mpd = mean_pairwise_difference(ac, fill=0)

    def tajima_d_in_window(w_is_seg, w_mpd):
        # D for one window, or nan when too few sites segregate
        n_seg = np.count_nonzero(w_is_seg)
        if n_seg < min_sites:
            return np.nan
        difference = np.sum(w_mpd) - (n_seg / a1)
        variance = (e1 * n_seg) + (e2 * n_seg * (n_seg - 1))
        return difference / np.sqrt(variance)

    D, windows, counts = windowed_statistic(pos,
                                            values=(is_seg, mpd),
                                            statistic=tajima_d_in_window,
                                            size=size,
                                            start=start,
                                            stop=stop,
                                            step=step,
                                            windows=windows,
                                            fill=np.nan)

    return D, windows, counts
Example #13
0
def windowed_statistic(pos,
                       values,
                       statistic,
                       size=None,
                       start=None,
                       stop=None,
                       step=None,
                       windows=None,
                       fill=np.nan):
    """Calculate a statistic from items in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        The item positions in ascending order, using 1-based coordinates.
    values : array_like, int, shape (n_items,)
        The values to summarise. May also be a tuple of values arrays,
        in which case each array will be sliced and passed through to the
        statistic function as separate arguments.
    statistic : function
        The statistic to compute.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    fill : object, optional
        The value to use where a window is empty, i.e., contains no items.

    Returns
    -------

    out : ndarray, shape (n_windows,)
        The value of the statistic for each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.

    Notes
    -----

    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------

    Count non-zero (i.e., True) items in non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> values = [True, False, True, False, False]
        >>> nnz, windows, counts = allel.windowed_statistic(
        ...     pos, values, statistic=np.count_nonzero, size=10
        ... )
        >>> nnz
        array([1, 1, 0])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])
        >>> counts
        array([2, 2, 1])

    Compute a sum over items in half-overlapping windows::

        >>> values = [3, 4, 2, 6, 9]
        >>> x, windows, counts = allel.windowed_statistic(
        ...     pos, values, statistic=np.sum, size=10, step=5, fill=0
        ... )
        >>> x
        array([ 7, 12,  8,  0,  9])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])
        >>> counts
        array([2, 3, 2, 0, 1])

    """

    # positions are assumed to be sorted
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # a tuple means several values arrays, anything else a single array
    has_multiple = isinstance(values, tuple)

    # every values array must be as long as pos
    if has_multiple:
        check_equal_length(pos, *values)
    else:
        check_equal_length(pos, values)

    # use the given windows, or derive them from size/start/stop/step
    if windows is not None:
        windows = asarray_ndim(windows, 2)
    else:
        windows = position_windows(pos, size, start, stop, step)

    # index range of the items falling in each window
    locs = window_locations(pos, windows)

    out = []
    counts = []

    for start_idx, stop_idx in locs:

        # number of items in this window
        n_items = stop_idx - start_idx

        if n_items == 0:
            # nothing to summarise
            result = fill
        elif has_multiple:
            # each array is sliced and passed as a separate argument
            result = statistic(*[v[start_idx:stop_idx] for v in values])
        else:
            result = statistic(values[start_idx:stop_idx])

        out.append(result)
        counts.append(n_items)

    # convert to arrays for output
    return np.asarray(out), windows, np.asarray(counts)
Example #14
0
def windowed_count(pos, size=None, start=None, stop=None, step=None,
                   windows=None):
    """Tally how many items fall inside each window along a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Item positions in ascending order, using 1-based coordinates.
    size : int, optional
        Window size (number of bases).
    start : int, optional
        Position at which to start (1-based).
    stop : int, optional
        Position at which to stop (1-based).
    step : int, optional
        Distance between the start positions of consecutive windows. When
        omitted, the window size is used, giving non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Explicit windows given as (window_start, window_stop) pairs in
        1-based coordinates. When supplied, size/start/stop/step are
        ignored.

    Returns
    -------

    counts : ndarray, int, shape (n_windows,)
        Number of items found in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows that were used, as (window_start, window_stop) pairs in
        1-based coordinates.

    Notes
    -----

    A window includes its stop position.

    The last window is truncated at the requested stop position, so it may
    be shorter than the others.

    Examples
    --------

    Non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> counts, windows = allel.windowed_count(pos, size=10)
        >>> counts
        array([2, 2, 1])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])

    Half-overlapping windows::

        >>> counts, windows = allel.windowed_count(pos, size=10, step=5)
        >>> counts
        array([2, 3, 2, 0, 1])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])

    """

    # positions are assumed to be sorted already
    pos = pos if isinstance(pos, SortedIndex) else SortedIndex(pos, copy=False)

    # explicit windows take precedence over size/start/stop/step
    if windows is not None:
        windows = asarray_ndim(windows, 2)
    else:
        windows = position_windows(pos, size, start, stop, step)

    # (start_index, stop_index) pair into `pos` for every window
    bounds = window_locations(pos, windows)

    # the item count of a window is the width of its index range
    counts = np.diff(bounds, axis=1).ravel()

    return counts, windows
Example #15
0
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate per-base nucleotide diversity over a region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). When omitted, the first
        remaining variant position is used.
    stop : int, optional
        The position at which to stop (1-based). When omitted, the last
        remaining variant position is used.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig. When given, only accessible bases between start
        and stop are counted in the denominator.

    Returns
    -------

    pi : float
        Nucleotide diversity: the sum of mean pairwise differences over
        variants divided by the number of bases in the region.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.stats.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # normalise inputs
    pos = pos if isinstance(pos, SortedIndex) else SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict to the requested region if either bound was supplied
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos, ac = pos[region], ac[region]

    # fall back to the span of the (remaining) variants for missing bounds
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # total of the per-variant mean pairwise differences
    total_mpd = np.sum(mean_pairwise_difference(ac, fill=0))

    # denominator: accessible bases if a mask was given, else region length
    # (positions are 1-based, hence the start - 1 when slicing the mask)
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_mpd / n_bases
Example #16
0
def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None,
                        stop=None, is_accessible=None):
    """Estimate per-base nucleotide divergence between two populations over
    a region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    an1 : array_like, 1-dimensional, optional
        Forwarded to `mean_pairwise_difference_between` as `an1` after being
        subset to the region. Presumably the per-variant allele number for
        the first population -- confirm against that function.
    an2 : array_like, 1-dimensional, optional
        As `an1`, for the second population.
    start : int, optional
        The position at which to start (1-based). When omitted, the first
        remaining variant position is used.
    stop : int, optional
        The position at which to stop (1-based). When omitted, the last
        remaining variant position is used.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------

    Dxy : float
        Nucleotide divergence: the sum over variants of the mean pairwise
        difference between the populations, divided by the number of bases.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # normalise inputs
    pos = pos if isinstance(pos, SortedIndex) else SortedIndex(pos, copy=False)
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    an1 = None if an1 is None else asarray_ndim(an1, 1)
    an2 = None if an2 is None else asarray_ndim(an2, 1)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict every per-variant array to the requested region
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac1 = ac1[region]
        ac2 = ac2[region]
        an1 = None if an1 is None else an1[region]
        an2 = None if an2 is None else an2[region]

    # fall back to the span of the (remaining) variants for missing bounds
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # total of the per-variant mean pairwise differences between populations
    between = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2,
                                               fill=0)
    total_between = np.sum(between)

    # denominator: accessible bases if a mask was given, else region length
    # (positions are 1-based, hence the start - 1 when slicing the mask)
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_between / n_bases
Example #17
0
def windowed_df(pos, ac1, ac2, size=None, start=None, stop=None, step=None,
                windows=None, is_accessible=None, fill=np.nan):
    """Compute the per-base density of fixed differences between two
    populations in windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------

    allel.model.locate_fixed_differences

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant flags marking fixed differences between the populations
    is_fixed_diff = locate_fixed_differences(ac1, ac2)

    # count flagged variants per window; an empty window counts as 0
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    n_df, windows, counts = windowed_statistic(
        pos, is_fixed_diff, statistic=np.count_nonzero, fill=0, **window_spec
    )

    # convert the raw counts into a per-base density
    df, n_bases = per_base(n_df, windows=windows,
                           is_accessible=is_accessible, fill=fill)

    return df, windows, n_bases, counts
Example #18
0
def tajima_d(ac, pos=None, start=None, stop=None):
    """Calculate the value of Tajima's D over a given region.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    pos : array_like, int, shape (n_items,), optional
        Variant positions, using 1-based coordinates, in ascending order.
        Only consulted when `start` and/or `stop` is given.
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).

    Returns
    -------
    D : float
        Tajima's D. ``np.nan`` is returned when the region contains no
        segregating variants (including when it contains no variants at
        all), since the statistic is undefined in that case.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> allel.stats.tajima_d(ac)
    3.1445848780213814
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> allel.stats.tajima_d(ac, pos=pos, start=7, stop=25)
    3.8779735196179366

    """

    # check inputs
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # deal with subregion
    if pos is not None and (start is not None or stop is not None):
        if not isinstance(pos, SortedIndex):
            pos = SortedIndex(pos, copy=False)
        loc = pos.locate_range(start, stop)
        ac = ac[loc]

    # count segregating variants
    S = ac.count_segregating()

    # With no segregating sites both the numerator and the standard
    # deviation below are zero, so D is undefined. Bail out here: this
    # avoids 0/0 runtime warnings and, for an empty region, the ValueError
    # that taking max() of an empty array would raise.
    if S == 0:
        return np.nan

    # assume number of chromosomes sampled is constant for all variants
    n = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    a1 = np.sum(1 / np.arange(1, n))

    # calculate Watterson's theta (absolute value)
    theta_hat_w_abs = S / a1

    # calculate mean pairwise difference
    mpd = mean_pairwise_difference(ac, fill=0)

    # calculate theta_hat pi (sum differences over variants)
    theta_hat_pi_abs = np.sum(mpd)

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference
    d = theta_hat_pi_abs - theta_hat_w_abs

    # calculate the denominator (standard deviation)
    a2 = np.sum(1 / (np.arange(1, n) ** 2))
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1 ** 2))
    e1 = c1 / a1
    e2 = c2 / (a1 ** 2 + a2)
    d_stdev = np.sqrt((e1 * S) + (e2 * S * (S - 1)))

    # finally calculate Tajima's D
    D = d / d_stdev

    return D
Example #19
0
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Compute Watterson's estimator (per base) over a region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig. When given, only accessible bases between start
        and stop are counted in the denominator.

    Returns
    -------

    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # normalise inputs
    pos = pos if isinstance(pos, SortedIndex) else SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict to the requested region if either bound was supplied
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos, ac = pos[region], ac[region]

    # fall back to the span of the (remaining) variants for missing bounds
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # number of segregating variants in the region
    n_segregating = ac.count_segregating()

    # the sample size is taken as the largest allele total over variants,
    # i.e., it is assumed constant across variants
    n_chrom = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n_chrom))

    # absolute (not per-base) estimate
    theta_abs = n_segregating / harmonic

    # denominator: accessible bases if a mask was given, else region length
    # (positions are 1-based, hence the start - 1 when slicing the mask)
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return theta_abs / n_bases
Example #20
0
def plot_variant_locator(pos, step=None, ax=None, start=None,
                         stop=None, flip=False,
                         line_kwargs=None):
    """
    Plot lines indicating the physical genome location of variants from a
    single chromosome/contig. By default the top x axis is in variant index
    space, and the bottom x axis is in genome position space.

    Parameters
    ----------

    pos : array_like
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    step : int, optional
        Plot a line for every `step` variants. Defaults to one hundredth of
        the number of variants in the plotted region, but never less than 1.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    start : int, optional
        The start position for the region to draw. Defaults to the smallest
        position.
    stop : int, optional
        The stop position for the region to draw. Defaults to the largest
        position.
    flip : bool, optional
        Flip the plot upside down.
    line_kwargs : dict-like
        Additional keyword arguments passed through to `plt.Line2D`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn

    """

    import matplotlib.pyplot as plt

    # check inputs
    pos = SortedIndex(pos, copy=False)

    # set up axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        y = x / 7
        fig, ax = plt.subplots(figsize=(x, y))
        fig.tight_layout()

    # determine x axis limits
    if start is None:
        start = np.min(pos)
    if stop is None:
        stop = np.max(pos)
    loc = pos.locate_range(start, stop)
    pos = pos[loc]
    if step is None:
        # With fewer than 100 variants the integer division yields 0, and a
        # slice step of 0 is an error; fall back to drawing every variant.
        step = max(len(pos) // 100, 1)
    ax.set_xlim(start, stop)

    # plot the lines
    if line_kwargs is None:
        line_kwargs = dict()
    n_variants = len(pos)
    for i, p in enumerate(pos[::step]):
        # bottom end of the line sits at the genomic position ...
        xfrom = p
        # ... top end is spread evenly across the axis by variant index
        xto = (
            start +
            ((i * step / n_variants) * (stop-start))
        )
        line = plt.Line2D([xfrom, xto], [0, 1], **line_kwargs)
        ax.add_line(line)

    # invert?
    if flip:
        ax.invert_yaxis()
        ax.xaxis.tick_top()
    else:
        ax.xaxis.tick_bottom()

    # tidy up
    ax.set_yticks([])
    ax.xaxis.set_tick_params(direction='out')
    for spine in 'left', 'right':
        ax.spines[spine].set_visible(False)

    return ax
Example #21
0
def windowed_watterson_theta(pos, ac, size=None, start=None, stop=None,
                             step=None, windows=None, is_accessible=None,
                             fill=np.nan):
    """Compute Watterson's estimator (per base) in windows over a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([0.10909091, 0.16363636, 0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # normalise inputs
    pos = pos if isinstance(pos, SortedIndex) else SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # per-variant flags marking segregating variants
    segregating = ac.is_segregating()

    # number of segregating variants per window; an empty window counts as 0
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    S, windows, counts = windowed_statistic(
        pos, segregating, statistic=np.count_nonzero, fill=0, **window_spec
    )

    # the sample size is taken as the largest allele total over variants,
    # i.e., it is assumed constant across variants
    n_chrom = ac.sum(axis=1).max()

    # (n-1)th harmonic number
    harmonic = np.sum(1 / np.arange(1, n_chrom))

    # absolute (not per-base) estimate for every window
    theta_abs = S / harmonic

    # scale by the number of (accessible) bases in each window
    theta_hat_w, n_bases = per_base(theta_abs, windows=windows,
                                    is_accessible=is_accessible, fill=fill)

    return theta_hat_w, windows, n_bases, counts
Example #22
0
def tajima_d(ac, pos=None, start=None, stop=None, min_sites=3):
    """Compute Tajima's D over a region.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    pos : array_like, int, shape (n_items,), optional
        Variant positions, using 1-based coordinates, in ascending order.
        Only consulted when `start` and/or `stop` is given.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    min_sites : int, optional
        Minimum number of segregating sites for which to calculate a value. If
        there are fewer, np.nan is returned. Defaults to 3.

    Returns
    -------
    D : float

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> allel.tajima_d(ac)
    3.1445848780213814
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> allel.tajima_d(ac, pos=pos, start=7, stop=25)
    3.8779735196179366

    """

    # normalise inputs
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict to the requested region; needs positions and at least one bound
    region_requested = not (start is None and stop is None)
    if pos is not None and region_requested:
        if not isinstance(pos, SortedIndex):
            pos = SortedIndex(pos, copy=False)
        ac = ac[pos.locate_range(start, stop)]

    # too few segregating variants: no value is calculated
    n_seg = ac.count_segregating()
    if n_seg < min_sites:
        return np.nan

    # the sample size is taken as the largest allele total over variants,
    # i.e., it is assumed constant across variants
    n = ac.sum(axis=1).max()

    # harmonic-type sums over 1..n-1
    k = np.arange(1, n)
    a1 = np.sum(1 / k)
    a2 = np.sum(1 / (k**2))

    # absolute Watterson's theta and absolute theta pi; these are normally
    # divided by the number of (accessible) bases, but the numerator of D
    # is their absolute difference
    theta_w_abs = n_seg / a1
    theta_pi_abs = np.sum(mean_pairwise_difference(ac, fill=0))
    numerator = theta_pi_abs - theta_w_abs

    # denominator (standard deviation of the difference)
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    stdev = np.sqrt((e1 * n_seg) + (e2 * n_seg * (n_seg - 1)))

    return numerator / stdev
Example #23
0
def plot_variant_locator(pos, step=None, ax=None, start=None,
                         stop=None, flip=False,
                         line_kwargs=None):
    """
    Plot lines indicating the physical genome location of variants from a
    single chromosome/contig. By default the top x axis is in variant index
    space, and the bottom x axis is in genome position space.

    Parameters
    ----------

    pos : array_like
        A sorted 1-dimensional array of genomic positions from a single
        chromosome/contig.
    step : int, optional
        Plot a line for every `step` variants. Defaults to one hundredth of
        the number of variants in the plotted region, but never less than 1.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.
    start : int, optional
        The start position for the region to draw. Defaults to the smallest
        position.
    stop : int, optional
        The stop position for the region to draw. Defaults to the largest
        position.
    flip : bool, optional
        Flip the plot upside down.
    line_kwargs : dict-like
        Additional keyword arguments passed through to `plt.Line2D`.

    Returns
    -------

    ax : axes
        The axes on which the plot was drawn

    """

    import matplotlib.pyplot as plt

    # check inputs
    pos = SortedIndex(pos, copy=False)

    # set up axes
    if ax is None:
        x = plt.rcParams['figure.figsize'][0]
        y = x / 7
        fig, ax = plt.subplots(figsize=(x, y))
        fig.tight_layout()

    # determine x axis limits
    if start is None:
        start = np.min(pos)
    if stop is None:
        stop = np.max(pos)
    loc = pos.locate_range(start, stop)
    pos = pos[loc]
    if step is None:
        # len(pos) // 100 is 0 for fewer than 100 variants, and slicing with
        # a step of 0 raises ValueError; draw every variant in that case.
        step = max(len(pos) // 100, 1)
    ax.set_xlim(start, stop)

    # plot the lines
    if line_kwargs is None:
        line_kwargs = dict()
    n_variants = len(pos)
    for i, p in enumerate(pos[::step]):
        # lower end of the line is anchored at the genomic position ...
        xfrom = p
        # ... upper end is placed proportionally to the variant's index
        xto = (
            start +
            ((i * step / n_variants) * (stop-start))
        )
        locator_line = plt.Line2D([xfrom, xto], [0, 1], **line_kwargs)
        ax.add_line(locator_line)

    # invert?
    if flip:
        ax.invert_yaxis()
        ax.xaxis.tick_top()
    else:
        ax.xaxis.tick_bottom()

    # tidy up
    ax.set_yticks([])
    ax.xaxis.set_tick_params(direction='out')
    for side in 'left', 'right':
        ax.spines[side].set_visible(False)

    return ax