Ejemplo n.º 1
0
def rogers_huff_r_between(gna, gnb, fill=np.nan):
    """Estimate the linkage disequilibrium parameter *r* for every pair of
    variants drawn from two input arrays, following the method of Rogers
    and Huff (2008).

    Parameters
    ----------
    gna, gnb : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    fill : float, optional
        Forwarded unchanged to the compiled correlation routine; presumably
        the value reported where *r* cannot be computed (TODO confirm
        against ``gn_pairwise2_corrcoef_int8``).

    Returns
    -------
    r : ndarray, float, shape (m_variants, n_variants)
        Matrix in rectangular form. If the matrix holds exactly one element,
        that element is returned instead of the matrix.

    """

    # coerce both inputs to 2-dimensional int8 arrays
    first = asarray_ndim(gna, 2, dtype='i1')
    second = asarray_ndim(gnb, 2, dtype='i1')

    # the compiled routine is imported lazily, at call time
    from allel.opt.stats import gn_pairwise2_corrcoef_int8
    corr = gn_pairwise2_corrcoef_int8(first, second, fill)

    # convenience for singletons: unwrap a one-element result
    return corr[0, 0] if corr.size == 1 else corr
Ejemplo n.º 2
0
def joint_sfs(dac1, dac2):
    """Compute the joint site frequency spectrum between two populations.

    Parameters
    ----------
    dac1 : array_like, int, shape (n_variants,)
        Derived allele counts for the first population.
    dac2 : array_like, int, shape (n_variants,)
        Derived allele counts for the second population.

    Returns
    -------
    joint_sfs : ndarray, int, shape (max(dac1) + 1, max(dac2) + 1)
        Array where the (i, j)th element is the number of variant sites with i
        derived alleles in the first population and j derived alleles in the
        second population.

    """

    # check inputs
    dac1 = asarray_ndim(dac1, 1)
    dac2 = asarray_ndim(dac2, 1)

    # spectrum dimensions, taken as Python ints so that the "+ 1" cannot wrap
    # around when the counts are stored in a narrow dtype such as int8
    n = int(np.max(dac1)) + 1
    m = int(np.max(dac2)) + 1

    # flatten each (i, j) pair to i * m + j, forcing 64-bit arithmetic so the
    # product cannot overflow the input dtype; non-integer input is still
    # rejected by the default 'same_kind' casting rule of the ufuncs
    flat = np.add(np.multiply(dac1, m, dtype=np.int64), dac2, dtype=np.int64)

    # count occurrences; minlength guarantees exactly n * m bins, so the
    # result can simply be reshaped rather than resized in place
    s = np.bincount(flat, minlength=n * m)
    return s.reshape(n, m)
Ejemplo n.º 3
0
def joint_sfs_folded(ac1, ac2):
    """Compute the joint folded site frequency spectrum between two
    populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, 2)
        Allele counts for the first population.
    ac2 : array_like, int, shape (n_variants, 2)
        Allele counts for the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (max(mac1) + 1, max(mac2) + 1)
        Array where the (i, j)th element is the number of variant sites with a
        minor allele count of i in the first population and j in the second
        population. ``mac1`` and ``mac2`` are the per-variant minor allele
        counts of the two populations.

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    assert ac1.shape[1] == ac2.shape[1] == 2, "only biallelic variants are supported"

    # compute minor allele counts
    mac1 = np.amin(ac1, axis=1)
    mac2 = np.amin(ac2, axis=1)

    # spectrum dimensions, taken as Python ints so that the "+ 1" cannot wrap
    # around when the counts are stored in a narrow dtype such as int8
    m = int(np.max(mac1)) + 1
    n = int(np.max(mac2)) + 1

    # flatten each (i, j) pair to i * n + j, forcing 64-bit arithmetic so the
    # product cannot overflow the input dtype
    flat = np.add(np.multiply(mac1, n, dtype=np.int64), mac2, dtype=np.int64)

    # minlength guarantees exactly m * n bins, so a plain reshape suffices
    s = np.bincount(flat, minlength=m * n)
    return s.reshape(m, n)
Ejemplo n.º 4
0
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum as an image.

    Parameters
    ----------
    s : array_like, 2-dimensional
        Joint site frequency spectrum. Rows are drawn along the y axis
        (labelled population 1), columns along the x axis (population 2).
    ax : axes, optional
        Axes on which to draw. If not provided, a new square figure is
        created.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ``ax.imshow()``.
        The mapping supplied by the caller is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """
    import matplotlib.pyplot as plt
    import matplotlib as mpl

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams['figure.figsize'][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that a dict passed in by
    # the caller is never mutated
    imshow_kwargs = dict() if imshow_kwargs is None else dict(imshow_kwargs)
    imshow_kwargs.setdefault('cmap', 'jet')
    imshow_kwargs.setdefault('interpolation', 'none')
    imshow_kwargs.setdefault('aspect', 'auto')
    imshow_kwargs.setdefault('norm', mpl.colors.LogNorm())

    # plot data
    ax.imshow(s, **imshow_kwargs)

    # tidy: x ticks and label go on top
    ax.xaxis.tick_top()
    ax.set_ylabel('derived allele count (population 1)')
    ax.set_xlabel('derived allele count (population 2)')
    ax.xaxis.set_label_position('top')

    return ax
Ejemplo n.º 5
0
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum as an image.

    Parameters
    ----------
    s : array_like, 2-dimensional
        Joint site frequency spectrum. Rows are drawn along the y axis
        (labelled population 1), columns along the x axis (population 2).
    ax : axes, optional
        Axes on which to draw. If not provided, a new square figure is
        created.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ``ax.imshow()``.
        The mapping supplied by the caller is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """
    import matplotlib.pyplot as plt
    import matplotlib as mpl

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams["figure.figsize"][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that a dict passed in by
    # the caller is never mutated
    imshow_kwargs = dict() if imshow_kwargs is None else dict(imshow_kwargs)
    imshow_kwargs.setdefault("cmap", "jet")
    imshow_kwargs.setdefault("interpolation", "none")
    imshow_kwargs.setdefault("aspect", "auto")
    imshow_kwargs.setdefault("norm", mpl.colors.LogNorm())

    # plot data
    ax.imshow(s, **imshow_kwargs)

    # tidy: x ticks and label go on top
    ax.xaxis.tick_top()
    ax.set_ylabel("derived allele count (population 1)")
    ax.set_xlabel("derived allele count (population 2)")
    ax.xaxis.set_label_position("top")

    return ax
Ejemplo n.º 6
0
def sfs_folded(ac):
    """Compute the folded site frequency spectrum from reference and
    alternate allele counts at a set of biallelic variants.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, 2)
        Allele counts array.

    Returns
    -------
    sfs_folded : ndarray, int
        Array where the kth element is the number of variant sites with a
        minor allele count of k. Its length is the largest observed minor
        allele count plus one.

    """

    # validate: two-dimensional with exactly two allele columns
    counts = asarray_ndim(ac, 2)
    assert counts.shape[1] == 2, 'only biallelic variants are supported'

    # the minor allele count is the smaller of the two counts in each row;
    # tallying those values yields the folded spectrum
    return np.bincount(np.amin(counts, axis=1))
Ejemplo n.º 7
0
def h_hat(ac):
    """Unbiased estimator for h, where 2*h is the heterozygosity
    of the population.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, 2)
        Allele counts array for a single population.

    Returns
    -------
    h_hat : ndarray, float, shape (n_variants,)
        Per-variant value of ``a0 * a1 / (n * (n - 1))``, where ``a0`` and
        ``a1`` are the two allele counts and ``n`` is their sum.

    Notes
    -----
    Used in Patterson (2012) for calculation of various statistics.

    """

    # validate input
    ac = asarray_ndim(ac, 2)
    assert ac.shape[1] == 2, 'only biallelic variants supported'

    # split out the two allele count columns
    first = ac[:, 0]
    second = ac[:, 1]

    # allele number: total count per variant
    total = ac.sum(axis=1)

    # evaluate the estimator
    return (first * second) / (total * (total - 1))
Ejemplo n.º 8
0
def fold_sfs(s, n):
    """Fold a site frequency spectrum.

    Derived allele count ``k`` is merged with its complement ``n - k``, so
    the kth element of the result counts sites whose minor allele count is
    ``k``. When ``n`` is even, the middle class ``n / 2`` is its own
    complement and is carried over unchanged.

    Parameters
    ----------
    s : array_like, int, shape (<= n + 1,)
        Site frequency spectrum. If it has fewer than ``n + 1`` entries it is
        padded with zeros.
    n : int
        Total number of chromosomes called.

    Returns
    -------
    sfs_folded : ndarray, int, shape (n // 2 + 1,)
        Folded site frequency spectrum

    """

    # check inputs
    s = asarray_ndim(s, 1)
    assert s.shape[0] <= n + 1, 'invalid number of chromosomes'

    # need to check s has all entries up to n
    if s.shape[0] < n + 1:
        sn = np.zeros(n + 1, dtype=s.dtype)
        sn[:s.shape[0]] = s
        s = sn

    # fold: classes 0 .. n // 2 are kept, classes above are reflected onto
    # their complements (n -> 0, n - 1 -> 1, ...)
    nf = n // 2 + 1
    o = s[:nf].copy()
    upper = s[nf:][::-1]
    # for even n the upper part is one entry shorter than the kept part, so
    # the middle class receives no contribution
    o[:upper.shape[0]] += upper

    return o
Ejemplo n.º 9
0
    def fit(self, gn):
        """Learn the per-row mean of `gn` and store it on ``self.mean_``.

        The input is coerced to a 2-dimensional array; the mean is taken
        along axis 1 with the reduced axis kept, so ``mean_`` broadcasts
        against arrays with the same number of rows. Returns ``self``.
        """

        # validate dimensionality
        data = asarray_ndim(gn, 2)

        # row-wise mean, kept as a column for broadcasting
        self.mean_ = np.mean(data, axis=1, keepdims=True)

        return self
Ejemplo n.º 10
0
def rogers_huff_r(gn, fill=np.nan):
    """Estimate the linkage disequilibrium parameter *r* for each pair of
    variants, following the method of Rogers and Huff (2008).

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    fill : float, optional
        Forwarded unchanged to the compiled correlation routine; presumably
        the value reported where *r* cannot be computed (TODO confirm
        against ``gn_pairwise_corrcoef_int8``).

    Returns
    -------
    r : ndarray, float, shape (n_variants * (n_variants - 1) // 2,)
        Matrix in condensed form. If it holds exactly one element, that
        element is returned instead of the array.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [1, 1], [0, 0]],
    ...                          [[0, 0], [1, 1], [0, 0]],
    ...                          [[1, 1], [0, 0], [1, 1]],
    ...                          [[0, 0], [0, 1], [-1, -1]]], dtype='i1')
    >>> gn = g.to_n_alt(fill=-1)
    >>> gn
    array([[ 0,  2,  0],
           [ 0,  2,  0],
           [ 2,  0,  2],
           [ 0,  1, -1]], dtype=int8)
    >>> r = allel.stats.rogers_huff_r(gn)
    >>> r
    array([ 1.        , -1.00000012,  1.        , -1.00000012,  1.        , -1.        ], dtype=float32)
    >>> r ** 2
    array([ 1.        ,  1.00000024,  1.        ,  1.00000024,  1.        ,  1.        ], dtype=float32)
    >>> from scipy.spatial.distance import squareform
    >>> squareform(r ** 2)
    array([[ 0.        ,  1.        ,  1.00000024,  1.        ],
           [ 1.        ,  0.        ,  1.00000024,  1.        ],
           [ 1.00000024,  1.00000024,  0.        ,  1.        ],
           [ 1.        ,  1.        ,  1.        ,  0.        ]])

    """  # flake8: noqa

    # coerce the input to a 2-dimensional int8 array
    genotypes = asarray_ndim(gn, 2, dtype='i1')

    # the compiled routine is imported lazily, at call time
    from allel.opt.stats import gn_pairwise_corrcoef_int8
    corr = gn_pairwise_corrcoef_int8(genotypes, fill)

    # convenience for singletons: unwrap a one-element result
    return corr[0] if corr.size == 1 else corr
Ejemplo n.º 11
0
def plot_sfs(s, yscale='log', bins=None, n=None,
             clip_endpoints=True, label=None, plot_kwargs=None,
             ax=None):
    """Plot a site frequency spectrum.

    Parameters
    ----------
    s : array_like, 1-dimensional
        Site frequency spectrum; the kth element is plotted at allele
        count k.
    yscale : string, optional
        Scale passed to ``ax.set_yscale()``.
    bins : int or array_like, optional
        If given, spectrum values are summed within bins (forwarded to
        ``scipy.stats.binned_statistic``) and plotted at the bin midpoints.
    n : int, optional
        If given (and non-zero), x values are divided by `n` and the x axis
        is labelled as a frequency rather than a count.
    clip_endpoints : bool, optional
        If True, the first and last elements of `s` are left out.
    label : string, optional
        Label for the plotted line.
    plot_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ``ax.plot()``.
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure is created.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """
    import matplotlib.pyplot as plt

    # check inputs
    s = asarray_ndim(s, 1)

    # setup axes
    if ax is None:
        _, ax = plt.subplots()

    # allele counts and matching spectrum values, optionally without the
    # two endpoint classes
    if clip_endpoints:
        x = np.arange(1, s.shape[0]-1)
        y = s[1:-1]
    else:
        x = np.arange(s.shape[0])
        y = s

    if bins is not None:
        # import the submodule explicitly: a bare "import scipy" does not
        # guarantee that scipy.stats has been loaded
        import scipy.stats
        y, b, _ = scipy.stats.binned_statistic(
            x,
            values=y,
            bins=bins,
            statistic='sum')
        # use bin midpoints for plotting
        x = (b[:-1] + b[1:]) / 2

    if n:
        # convert allele counts to allele frequencies
        x = x / n
        ax.set_xlabel('derived allele frequency')
    else:
        ax.set_xlabel('derived allele count')

    # do plotting
    if plot_kwargs is None:
        plot_kwargs = dict()
    ax.plot(x, y, label=label, **plot_kwargs)

    # tidy
    ax.set_yscale(yscale)
    ax.set_ylabel('site frequency')
    ax.autoscale(axis='x', tight=True)

    return ax
Ejemplo n.º 12
0
    def fit(self, gn):
        """Learn the per-row mean and standard deviation of `gn`.

        The input is coerced to a 2-dimensional array. Both statistics are
        taken along axis 1 with the reduced axis kept, and stored on
        ``self.mean_`` and ``self.std_``. Returns ``self``.
        """

        # validate dimensionality
        data = asarray_ndim(gn, 2)

        # row-wise centre and scale, kept as columns for broadcasting
        self.mean_ = np.mean(data, axis=1, keepdims=True)
        self.std_ = np.std(data, axis=1, keepdims=True)

        return self
Ejemplo n.º 13
0
    def transform(self, gn, copy=None):
        """Subtract the fitted row means from `gn`.

        `copy` falls back to ``self.copy`` when not given and is forwarded
        to the input coercion. Non-floating input is converted to half
        precision ('f2') before centring. The subtraction is done in place
        on the coerced array, which is then returned.
        """

        # resolve the copy flag and coerce the input
        if copy is None:
            copy = self.copy
        gn = asarray_ndim(gn, 2, copy=copy)

        # in-place arithmetic below needs a floating point array
        if gn.dtype.kind != 'f':
            gn = gn.astype('f2')

        # center
        gn -= self.mean_

        return gn
Ejemplo n.º 14
0
    def fit(self, gn):
        """Learn the per-row mean of `gn` and a scaling factor derived
        from it.

        The mean is taken along axis 1 with the reduced axis kept and stored
        on ``self.mean_``. The scaling factor ``self.std_`` is
        ``sqrt(p * (1 - p))`` with ``p = mean_ / self.ploidy`` (presumably
        the alternate allele frequency -- confirm against the class
        documentation). Returns ``self``.
        """

        # validate dimensionality
        data = asarray_ndim(gn, 2)

        # row-wise mean, kept as a column for broadcasting
        self.mean_ = np.mean(data, axis=1, keepdims=True)

        # scaling factor from the mean rescaled by ploidy
        freq = self.mean_ / self.ploidy
        self.std_ = np.sqrt(freq * (1 - freq))

        return self
Ejemplo n.º 15
0
def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None):
    """Convenience function to calculate a pairwise distance matrix using
    nucleotide divergence (a.k.a. Dxy) as the distance metric.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions.
    gac : array_like, int, shape (n_variants, n_samples, n_alleles)
        Per-genotype allele counts.
    start : int, optional
        Start position of region to use.
    stop : int, optional
        Stop position of region to use.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------
    dist : ndarray
        Distance matrix in condensed form: one ``sequence_divergence`` value
        per unordered pair of samples, in ``itertools.combinations`` order.

    See Also
    --------
    allel.model.ndarray.GenotypeArray.to_allele_counts

    """

    # wrap positions unless they already are a SortedIndex
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    gac = asarray_ndim(gac, 3)

    # allele numbers per genotype, computed once up front rather than once
    # per pair
    gan = np.sum(gac, axis=2)

    n_samples = gac.shape[1]
    divergences = [
        sequence_divergence(pos,
                            gac[:, i, ...],
                            gac[:, j, ...],
                            an1=gan[:, i],
                            an2=gan[:, j],
                            start=start,
                            stop=stop,
                            is_accessible=is_accessible)
        for i, j in itertools.combinations(range(n_samples), 2)
    ]
    return np.array(divergences)
Ejemplo n.º 16
0
def fold_joint_sfs(s, m, n):
    """Fold a joint site frequency spectrum.

    Along each axis, derived allele count ``k`` is merged with its
    complement (``m - k`` for rows, ``n - k`` for columns). When the number
    of chromosomes on an axis is even, the middle class is its own
    complement and is carried over unchanged.

    Parameters
    ----------
    s : array_like, int, shape (<= m + 1, <= n + 1)
        Joint site frequency spectrum. Missing trailing rows/columns are
        padded with zeros.
    m : int
        Number of chromosomes called in the first population.
    n : int
        Number of chromosomes called in the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (m // 2 + 1, n // 2 + 1)
        Folded joint site frequency spectrum.

    """

    # check inputs
    s = asarray_ndim(s, 2)
    assert s.shape[0] <= m + 1, "invalid number of chromosomes"
    assert s.shape[1] <= n + 1, "invalid number of chromosomes"

    # need to check s has all entries up to m
    if s.shape[0] < m + 1:
        sm = np.zeros((m + 1, s.shape[1]), dtype=s.dtype)
        sm[: s.shape[0]] = s
        s = sm

    # need to check s has all entries up to n
    if s.shape[1] < n + 1:
        sn = np.zeros((s.shape[0], n + 1), dtype=s.dtype)
        sn[:, : s.shape[1]] = s
        s = sn

    # fold: the top-left quadrant is kept; the other three are reflected
    # onto it. For an even count the reflected part is one row/column
    # shorter, so the middle class receives no contribution on that axis.
    mf = m // 2 + 1
    nf = n // 2 + 1
    o = s[:mf, :nf].copy()  # top left

    bottom_left = s[mf:, :nf][::-1]
    o[: bottom_left.shape[0], :] += bottom_left

    top_right = s[:mf, nf:][:, ::-1]
    o[:, : top_right.shape[1]] += top_right

    bottom_right = s[mf:, nf:][::-1, ::-1]
    o[: bottom_right.shape[0], : bottom_right.shape[1]] += bottom_right

    return o
Ejemplo n.º 17
0
def joint_sfs_folded(ac1, ac2):
    """Compute the joint folded site frequency spectrum between two
    populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, 2)
        Allele counts for the first population.
    ac2 : array_like, int, shape (n_variants, 2)
        Allele counts for the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (max(mac1) + 1, max(mac2) + 1)
        Array where the (i, j)th element is the number of variant sites with a
        minor allele count of i in the first population and j in the second
        population. ``mac1`` and ``mac2`` are the per-variant minor allele
        counts of the two populations.

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    assert ac1.shape[1] == ac2.shape[1] == 2, \
        'only biallelic variants are supported'

    # compute minor allele counts
    mac1 = np.amin(ac1, axis=1)
    mac2 = np.amin(ac2, axis=1)

    # spectrum dimensions, taken as Python ints so that the "+ 1" cannot wrap
    # around when the counts are stored in a narrow dtype such as int8
    m = int(np.max(mac1)) + 1
    n = int(np.max(mac2)) + 1

    # flatten each (i, j) pair to i * n + j, forcing 64-bit arithmetic so the
    # product cannot overflow the input dtype
    flat = np.add(np.multiply(mac1, n, dtype=np.int64), mac2, dtype=np.int64)

    # minlength guarantees exactly m * n bins, so a plain reshape suffices
    s = np.bincount(flat, minlength=m * n)
    return s.reshape(m, n)
Ejemplo n.º 18
0
    def transform(self, gn, copy=None):
        """Centre `gn` on the fitted row means, then divide by the fitted
        scaling factors.

        `copy` falls back to ``self.copy`` when not given and is forwarded
        to the input coercion. Non-floating input is converted to half
        precision ('f2') first. Both operations are done in place on the
        coerced array, which is then returned.
        """

        # resolve the copy flag and coerce the input
        if copy is None:
            copy = self.copy
        gn = asarray_ndim(gn, 2, copy=copy)

        # in-place arithmetic below needs a floating point array
        if gn.dtype.kind != 'f':
            gn = gn.astype('f2')

        # center
        gn -= self.mean_

        # scale
        gn /= self.std_

        return gn
Ejemplo n.º 19
0
def maxFDA(pos, ac, start=None, stop=None, is_accessible=None):
    """Return the largest per-variant value of
    ``ac[i, 1] / (ac[i, 0] + ac[i, 1])`` over the selected region.

    Parameters
    ----------
    pos : array_like or SortedIndex
        Variant positions; wrapped in a ``SortedIndex`` if not one already.
    ac : array_like, 2-dimensional
        Allele counts; column 0 and column 1 are used (presumably ancestral
        and derived counts -- confirm against callers).
    start, stop : optional
        If either is given, variants are restricted to
        ``pos.locate_range(start, stop)``.
    is_accessible : array_like, 1-dimensional, optional
        Validated but not otherwise used by this function.

    Returns
    -------
    The maximum ratio found across the selected variants.

    """
    # check inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # deal with subregion
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]
    # default the region bounds to the first/last position; the values are
    # not used further below
    if start is None:
        start = pos[0]
    if stop is None:
        stop = pos[-1]

    # ratio of column 1 to the row total, for every variant
    ratios = [row[1] / float(row[1] + row[0]) for row in ac]
    return max(ratios)
Ejemplo n.º 20
0
def fold_joint_sfs(s, m, n):
    """Fold a joint site frequency spectrum.

    Along each axis, derived allele count ``k`` is merged with its
    complement (``m - k`` for rows, ``n - k`` for columns). When the number
    of chromosomes on an axis is even, the middle class is its own
    complement and is carried over unchanged.

    Parameters
    ----------
    s : array_like, int, shape (<= m + 1, <= n + 1)
        Joint site frequency spectrum. Missing trailing rows/columns are
        padded with zeros.
    m : int
        Number of chromosomes called in the first population.
    n : int
        Number of chromosomes called in the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (m // 2 + 1, n // 2 + 1)
        Folded joint site frequency spectrum.

    """

    # check inputs
    s = asarray_ndim(s, 2)
    assert s.shape[0] <= m + 1, 'invalid number of chromosomes'
    assert s.shape[1] <= n + 1, 'invalid number of chromosomes'

    # need to check s has all entries up to m
    if s.shape[0] < m + 1:
        sm = np.zeros((m + 1, s.shape[1]), dtype=s.dtype)
        sm[:s.shape[0]] = s
        s = sm

    # need to check s has all entries up to n
    if s.shape[1] < n + 1:
        sn = np.zeros((s.shape[0], n + 1), dtype=s.dtype)
        sn[:, :s.shape[1]] = s
        s = sn

    # fold: the top-left quadrant is kept; the other three are reflected
    # onto it. For an even count the reflected part is one row/column
    # shorter, so the middle class receives no contribution on that axis.
    mf = m // 2 + 1
    nf = n // 2 + 1
    o = s[:mf, :nf].copy()  # top left

    bottom_left = s[mf:, :nf][::-1]
    o[:bottom_left.shape[0], :] += bottom_left

    top_right = s[:mf, nf:][:, ::-1]
    o[:, :top_right.shape[1]] += top_right

    bottom_right = s[mf:, nf:][::-1, ::-1]
    o[:bottom_right.shape[0], :bottom_right.shape[1]] += bottom_right

    return o
Ejemplo n.º 21
0
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes_pop1, n_chromosomes_pop2)
        Joint site frequency spectrum.
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure will be created.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ax.imshow(). The
        mapping supplied by the caller is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams['figure.figsize'][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that a dict passed in by
    # the caller is never mutated
    imshow_kwargs = dict() if imshow_kwargs is None else dict(imshow_kwargs)
    imshow_kwargs.setdefault('cmap', 'jet')
    imshow_kwargs.setdefault('interpolation', 'none')
    imshow_kwargs.setdefault('aspect', 'auto')
    imshow_kwargs.setdefault('norm', LogNorm())

    # plot data, transposed so that population 1 runs along the x axis
    ax.imshow(s.T, **imshow_kwargs)

    # tidy: flip the y axis so that counts increase upwards
    ax.invert_yaxis()
    ax.set_xlabel('derived allele count (population 1)')
    ax.set_ylabel('derived allele count (population 2)')

    return ax
Ejemplo n.º 22
0
def plot_joint_sfs(s, ax=None, imshow_kwargs=None):
    """Plot a joint site frequency spectrum.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes_pop1, n_chromosomes_pop2)
        Joint site frequency spectrum.
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure will be created.
    imshow_kwargs : dict-like, optional
        Additional keyword arguments, passed through to ax.imshow(). The
        mapping supplied by the caller is not modified.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm

    # check inputs
    s = asarray_ndim(s, 2)

    # setup axes
    if ax is None:
        w = plt.rcParams['figure.figsize'][0]
        _, ax = plt.subplots(figsize=(w, w))

    # set plotting defaults on a private copy, so that a dict passed in by
    # the caller is never mutated
    imshow_kwargs = dict() if imshow_kwargs is None else dict(imshow_kwargs)
    imshow_kwargs.setdefault('cmap', 'jet')
    imshow_kwargs.setdefault('interpolation', 'none')
    imshow_kwargs.setdefault('aspect', 'auto')
    imshow_kwargs.setdefault('norm', LogNorm())

    # plot data, transposed so that population 1 runs along the x axis
    ax.imshow(s.T, **imshow_kwargs)

    # tidy: flip the y axis so that counts increase upwards
    ax.invert_yaxis()
    ax.set_xlabel('derived allele count (population 1)')
    ax.set_ylabel('derived allele count (population 2)')

    return ax
Ejemplo n.º 23
0
def pairwise_dxy(pos, gac, start=None, stop=None, is_accessible=None):
    """Convenience function to calculate a pairwise distance matrix using
    nucleotide divergence (a.k.a. Dxy) as the distance metric.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions.
    gac : array_like, int, shape (n_variants, n_samples, n_alleles)
        Per-genotype allele counts.
    start : int, optional
        Start position of region to use.
    stop : int, optional
        Stop position of region to use.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.

    Returns
    -------
    dist : ndarray
        Distance matrix in condensed form: one ``sequence_divergence`` value
        per unordered pair of samples, in ``itertools.combinations`` order.

    See Also
    --------
    allel.model.ndarray.GenotypeArray.to_allele_counts

    """

    # wrap positions unless they already are a SortedIndex
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    gac = asarray_ndim(gac, 3)

    # allele numbers per genotype, computed once up front rather than once
    # per pair
    gan = np.sum(gac, axis=2)

    sample_pairs = itertools.combinations(range(gac.shape[1]), 2)
    divergences = [
        sequence_divergence(pos, gac[:, i, ...], gac[:, j, ...],
                            an1=gan[:, i], an2=gan[:, j],
                            start=start, stop=stop,
                            is_accessible=is_accessible)
        for i, j in sample_pairs
    ]
    return np.array(divergences)
Ejemplo n.º 24
0
def heterozygosity_expected(af, ploidy, fill=np.nan):
    """Calculate the expected rate of heterozygosity for each variant
    under Hardy-Weinberg equilibrium.

    Parameters
    ----------

    af : array_like, float, shape (n_variants, n_alleles)
        Allele frequencies array.
    ploidy : int
        Sample ploidy.
    fill : float, optional
        Use this value for variants where allele frequencies sum to less
        than 1 (beyond floating point rounding) or are NaN.

    Returns
    -------

    he : ndarray, float, shape (n_variants,)
        Expected heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> af = g.count_alleles().to_frequencies()
    >>> allel.stats.heterozygosity_expected(af, ploidy=2)
    array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])

    """

    # check inputs
    af = asarray_ndim(af, 2)

    # calculate expected heterozygosity
    out = 1 - np.sum(np.power(af, ploidy), axis=1)

    # fill values where allele frequencies could not be calculated
    af_sum = np.sum(af, axis=1)
    with ignore_invalid():
        # valid frequencies can sum to just under 1 through rounding (e.g.
        # 0.7 + 0.2 + 0.1 == 0.9999999999999999), so sums that are merely
        # close to 1 are not treated as incomplete
        incomplete = (af_sum < 1) & ~np.isclose(af_sum, 1)
        out[incomplete | np.isnan(af_sum)] = fill

    return out
Ejemplo n.º 25
0
def fold_joint_sfs(s, n1, n2):
    """Fold a joint site frequency spectrum.

    Along each axis, derived allele count ``k`` is merged with its
    complement (``n1 - k`` for rows, ``n2 - k`` for columns). When the
    number of chromosomes on an axis is even, the middle class is its own
    complement and is carried over unchanged.

    Parameters
    ----------
    s : array_like, int, shape (<= n1 + 1, <= n2 + 1)
        Joint site frequency spectrum. Missing trailing rows/columns are
        padded with zeros.
    n1, n2 : int
        The total number of chromosomes called in each population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (n1 // 2 + 1, n2 // 2 + 1)
        Folded joint site frequency spectrum.

    """

    # check inputs
    s = asarray_ndim(s, 2)
    assert s.shape[0] <= n1 + 1, 'invalid number of chromosomes'
    assert s.shape[1] <= n2 + 1, 'invalid number of chromosomes'

    # need to check s has all entries up to n1
    if s.shape[0] < n1 + 1:
        sm = np.zeros((n1 + 1, s.shape[1]), dtype=s.dtype)
        sm[:s.shape[0]] = s
        s = sm

    # need to check s has all entries up to n2
    if s.shape[1] < n2 + 1:
        sn = np.zeros((s.shape[0], n2 + 1), dtype=s.dtype)
        sn[:, :s.shape[1]] = s
        s = sn

    # fold: the top-left quadrant is kept; the other three are reflected
    # onto it. For an even count the reflected part is one row/column
    # shorter, so the middle class receives no contribution on that axis.
    mf = n1 // 2 + 1
    nf = n2 // 2 + 1
    o = s[:mf, :nf].copy()  # top left

    bottom_left = s[mf:, :nf][::-1]
    o[:bottom_left.shape[0], :] += bottom_left

    top_right = s[:mf, nf:][:, ::-1]
    o[:, :top_right.shape[1]] += top_right

    bottom_right = s[mf:, nf:][::-1, ::-1]
    o[:bottom_right.shape[0], :bottom_right.shape[1]] += bottom_right

    return o
Ejemplo n.º 26
0
def heterozygosity_expected(af, ploidy, fill=np.nan):
    """Calculate the expected rate of heterozygosity for each variant
    under Hardy-Weinberg equilibrium.

    Parameters
    ----------

    af : array_like, float, shape (n_variants, n_alleles)
        Allele frequencies array.
    ploidy : int
        Sample ploidy; the exponent applied to each allele frequency.
    fill : float, optional
        Use this value for variants where allele frequencies sum to less
        than 1 (beyond floating point rounding) or are NaN.

    Returns
    -------

    he : ndarray, float, shape (n_variants,)
        Expected heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> af = g.count_alleles().to_frequencies()
    >>> allel.stats.heterozygosity_expected(af, ploidy=2)
    array([ 0.        ,  0.5       ,  0.66666667,  0.375     ])

    """

    # check inputs
    af = asarray_ndim(af, 2)

    # calculate expected heterozygosity
    out = 1 - np.sum(np.power(af, ploidy), axis=1)

    # fill values where allele frequencies could not be calculated
    af_sum = np.sum(af, axis=1)
    with ignore_invalid():
        # valid frequencies can sum to just under 1 through rounding (e.g.
        # 0.7 + 0.2 + 0.1 == 0.9999999999999999), so sums that are merely
        # close to 1 are not treated as incomplete
        incomplete = (af_sum < 1) & ~np.isclose(af_sum, 1)
        out[incomplete | np.isnan(af_sum)] = fill

    return out
Ejemplo n.º 27
0
    def count_alleles(self, max_allele=None, subpop=None):
        """Count alleles block by block and wrap the summed result in an
        ``AlleleCountsDaskArray``.

        Parameters
        ----------
        max_allele : int, optional
            Highest allele index to count. When omitted it is obtained by
            computing ``self.max()``.
        subpop : array_like, int, optional
            Indices passed to ``self.take(..., axis=1)`` to restrict the
            count (presumably sample indices -- confirm against callers).

        Returns
        -------
        AlleleCountsDaskArray
            Wraps the per-block counts summed over axis 1.

        """

        # if max_allele not specified, count all alleles
        if max_allele is None:
            # note this triggers a computation of the maximum up front
            max_allele = self.max().compute()[()]

        # deal with subpop
        subpop = asarray_ndim(subpop, 1, allow_none=True, dtype=np.int64)
        if subpop is not None:
            gd = self.take(subpop, axis=1).values
        else:
            gd = self.values

        # determine output chunks - preserve axis0; change axis1, axis2
        # (each block along axis 1 yields a single slot; the last axis holds
        # one count per allele index 0..max_allele)
        chunks = (gd.chunks[0], (1, ) * len(gd.chunks[1]), (max_allele + 1, ))

        if self.mask is None:

            # simple case, no mask
            def f(block):
                gb = GenotypeArray(block)
                # insert a length-1 axis so the block matches `chunks`
                return gb.count_alleles(max_allele=max_allele)[:, None, :]

            # map blocks and reduce
            out = da.map_blocks(f, gd, chunks=chunks).sum(axis=1, dtype='i4')

        else:

            # map with mask
            def f(block, bmask):
                g = GenotypeArray(block)
                # drop the trailing axis that was added to the mask below
                g.mask = bmask[:, :, 0]
                return g.count_alleles(max_allele=max_allele)[:, None, :]

            # give the mask a third axis so it can be mapped alongside gd
            md = self.mask[:, :, None]
            out = da.map_blocks(f, gd, md, chunks=chunks).sum(axis=1,
                                                              dtype='i4')

        return AlleleCountsDaskArray(out)
Ejemplo n.º 28
0
def sfs(dac):
    """Compute the site frequency spectrum from derived allele counts at a
    set of biallelic variants.

    Parameters
    ----------
    dac : array_like, int, shape (n_variants,)
        Array of derived allele counts.

    Returns
    -------
    sfs : ndarray, int
        Array where the kth element is the number of variant sites with k
        derived alleles. Its length is the largest observed count plus one.

    """

    # validate: must be one-dimensional
    counts = asarray_ndim(dac, 1)

    # tally how many sites carry each derived allele count
    return np.bincount(counts)
Ejemplo n.º 29
0
def sfs(dac):
    """Compute the site frequency spectrum from derived allele counts at a
    set of biallelic variants.

    Parameters
    ----------
    dac : array_like, int, shape (n_variants,)
        Array of derived allele counts.

    Returns
    -------
    sfs : ndarray, int
        Array where the kth element is the number of variant sites with k
        derived alleles. Its length is the largest observed count plus one.

    """

    # validate: must be one-dimensional
    counts = asarray_ndim(dac, 1)

    # tally how many sites carry each derived allele count
    spectrum = np.bincount(counts)
    return spectrum
Ejemplo n.º 30
0
def windowed_df(pos,
                ac1,
                ac2,
                size=None,
                start=None,
                stop=None,
                step=None,
                windows=None,
                is_accessible=None,
                fill=np.nan):
    """Calculate the density of fixed differences between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------

    allel.model.locate_fixed_differences

    """

    # check inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # flag the variants that are fixed differences between the populations
    is_fixed_diff = locate_fixed_differences(ac1, ac2)

    # per-window tally of flagged variants; windows without variants get 0
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)
    n_fixed, windows, counts = windowed_statistic(
        pos, values=is_fixed_diff, statistic=np.count_nonzero, fill=0,
        **window_spec)

    # normalise the tallies to a per-base density
    df, n_bases = per_base(n_fixed, windows, is_accessible=is_accessible,
                           fill=fill)

    return df, windows, n_bases, counts
Ejemplo n.º 31
0
def hudson_fst(ac1, ac2, fill=np.nan):
    """Compute the per-variant numerator and denominator of the Fst
    estimator of Hudson (1992), as elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts for the second population.
    fill : float
        Value substituted wherever there are no pairs to compare (e.g.,
        every allele call is missing).

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Between-population divergence minus the mean of the two
        within-population diversities.
    den : ndarray, float, shape (n_variants,)
        Between-population divergence.

    Examples
    --------
    Calculate numerator and denominator for Fst estimation::

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...                          [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...                          [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...                          [[0, 0], [1, 1], [0, 1], [-1, -1]]])
        >>> subpops = [[0, 1], [2, 3]]
        >>> ac1 = g.count_alleles(subpop=subpops[0])
        >>> ac2 = g.count_alleles(subpop=subpops[1])
        >>> num, den = allel.hudson_fst(ac1, ac2)
        >>> num
        array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333])
        >>> den
        array([1.   , 0.5  , 0.   , 0.625, 0.5  ])

    Estimate Fst for each variant individually::

        >>> fst = num / den
        >>> fst
        array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

    Estimate Fst averaging over variants::

        >>> fst = np.sum(num) / np.sum(den)
        >>> fst
        0.1428571428571429

    """  # flake8: noqa

    # normalise both inputs and make sure they describe the same variants
    # and the same set of alleles
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # allele numbers are needed by every term below, so compute them once
    an1 = np.sum(ac1, axis=1)
    an2 = np.sum(ac2, axis=1)

    # diversity (heterozygosity) inside each population, then their average
    within_pop1 = mean_pairwise_difference(ac1, an1, fill=fill)
    within_pop2 = mean_pairwise_difference(ac2, an2, fill=fill)
    within = (within_pop1 + within_pop2) / 2

    # divergence between the two populations
    between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill)

    # numerator is the excess of divergence over diversity; the denominator
    # is the divergence itself
    return between - within, between
Ejemplo n.º 32
0
def windowed_watterson_theta(pos,
                             ac,
                             size=None,
                             start=None,
                             stop=None,
                             step=None,
                             windows=None,
                             is_accessible=None,
                             fill=np.nan):
    """Compute Watterson's estimator (per base) in windows along a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, 1-based, sorted in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        Window size in bases.
    start : int, optional
        First position to include (1-based).
    stop : int, optional
        Last position to include (1-based).
    step : int, optional
        Distance between consecutive window starts. When omitted the window
        size is used, giving non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Explicit (window_start, window_stop) pairs, 1-based. Takes
        precedence over size/start/stop/step.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Accessibility flag for every position in the chromosome/contig.
    fill : object, optional
        Value reported for windows that are completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as (window_start, window_stop) pairs, 1-based.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Notes
    -----

    The number of chromosomes sampled is taken to be the same for all
    variants: the largest per-variant allele number in `ac` is used.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([0.10909091, 0.16363636, 0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # coerce inputs to the wrapper types used below
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # number of segregating variants falling in each window
    seg_flags = ac.is_segregating()
    n_seg, windows, counts = windowed_statistic(
        pos,
        seg_flags,
        statistic=np.count_nonzero,
        size=size,
        start=start,
        stop=stop,
        step=step,
        windows=windows,
        fill=0,
    )

    # sample size: largest allele number observed across variants
    n_chrom = ac.sum(axis=1).max()

    # (n-1)th harmonic number, the normalising constant of the estimator
    harmonic = np.sum(1 / np.arange(1, n_chrom))

    # absolute estimate per window, then scaled to a per-base value
    theta_abs = n_seg / harmonic
    theta_hat_w, n_bases = per_base(theta_abs,
                                    windows=windows,
                                    is_accessible=is_accessible,
                                    fill=fill)

    return theta_hat_w, windows, n_bases, counts
Ejemplo n.º 33
0
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Compute Watterson's estimator (per base) over a single region.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, 1-based, sorted in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        First position of the region (1-based). Defaults to the first
        position in `pos`.
    stop : int, optional
        Last position of the region (1-based). Defaults to the last
        position in `pos`.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Accessibility flag for every position in the chromosome/contig.

    Returns
    -------

    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Notes
    -----

    The number of chromosomes sampled is taken to be the same for all
    variants: the largest per-variant allele number in `ac` is used.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # coerce inputs to the wrapper types used below
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, 'count_segregating'):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict to the requested region when either bound was supplied
    if not (start is None and stop is None):
        region = pos.locate_range(start, stop)
        pos = pos[region]
        ac = ac[region]

    # missing bounds fall back to the extent of the (restricted) positions
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # number of segregating variants in the region
    n_seg = ac.count_segregating()

    # sample size: largest allele number observed across variants
    n_chrom = ac.sum(axis=1).max()

    # (n-1)th harmonic number, the normalising constant of the estimator
    harmonic = np.sum(1 / np.arange(1, n_chrom))

    # absolute (whole-region) estimate
    theta_abs = n_seg / harmonic

    # size of the region in bases, counting accessible positions only when an
    # accessibility mask is available (the mask is indexed 0-based)
    if is_accessible is None:
        n_bases = stop - start + 1
    else:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])

    return theta_abs / n_bases
Ejemplo n.º 34
0
def xpnsl(h1, h2, use_threads=True):
    """Compute the cross-population NSL statistic (XPNSL).

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    use_threads : bool, optional
        If True, run the scans in a thread pool when more than one CPU is
        reported; otherwise run them one after another.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPNSL scores.

    """

    # validate inputs: two integer 2-D arrays over the same variants
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    check_dim0_aligned(h1, h2)

    # the four scans needed: forward over each population, then backward
    # (variant order reversed) over each population
    scan_inputs = (h1, h2, h1[::-1], h2[::-1])

    if use_threads and multiprocessing.cpu_count() > 1:
        # run the scans concurrently on at most four worker threads
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))
        pending = [pool.apply_async(nsl_scan, args=(h, ))
                   for h in scan_inputs]

        # no more work will be submitted; block until all scans finish
        pool.close()
        pool.join()

        # collect results in submission order, then release the pool
        nsl1_fwd, nsl2_fwd, nsl1_rev, nsl2_rev = [
            result.get() for result in pending
        ]
        pool.terminate()

    else:
        # run the scans sequentially in the calling thread
        nsl1_fwd, nsl2_fwd, nsl1_rev, nsl2_rev = [
            nsl_scan(h) for h in scan_inputs
        ]

    # backward scans were computed on reversed input, so flip them back to
    # the original variant order before combining with the forward scans
    nsl1 = nsl1_fwd + nsl1_rev[::-1]
    nsl2 = nsl2_fwd + nsl2_rev[::-1]

    # unstandardized score is the log ratio between the two populations
    return np.log(nsl1 / nsl2)
Ejemplo n.º 35
0
def tabulate_state_blocks(x, states, pos=None):
    """Build a dataframe with one row per continuous block of a state.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        States of interest. Values of `x` outside this set are ignored.
    pos : array_like, int, optional
        Positions corresponding to the values in `x`. When given, position
        based columns (``start_lpos``, ``start_rpos``, ``stop_lpos``,
        ``stop_rpos``, ``length_min``, ``length_max``) are added.

    Returns
    -------
    df : DataFrame

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2})
    >>> df
       state  support  start_lidx     ...       size_min  size_max  is_marginal
    0      1        4          -1     ...              5        -1         True
    1      2        3           4     ...              4         4        False
    2      1        2           8     ...              2        -1         True
    [3 rows x 9 columns]
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos)
    >>> df
       state  support  start_lidx     ...      stop_rpos  length_min  length_max
    0      1        4          -1     ...             14           9          -1
    1      2        3           4     ...             30          15          19
    2      1        2           8     ...             -1           2          -1
    [3 rows x 15 columns]

    """

    # validate the state vector
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # locate the points at which the state changes
    switch_points, transitions, observations = state_transitions(x, states)

    # each block is bounded by a pair of consecutive switch points
    block_start = switch_points[:-1]
    block_stop = switch_points[1:]

    # a block is marginal when either bounding switch point carries a
    # negative index on its outer side; its maximum size is then
    # reported as -1
    is_marginal = (block_start[:, 0] < 0) | (block_stop[:, 1] < 0)
    size_min = block_stop[:, 0] - block_start[:, 1] + 1
    size_max = block_stop[:, 1] - block_start[:, 0] - 1
    size_max[is_marginal] = -1

    # index-based columns, in output order
    columns = OrderedDict()
    columns['state'] = transitions[1:, 0]
    columns['support'] = observations[1:]
    columns['start_lidx'] = block_start[:, 0]
    columns['start_ridx'] = block_start[:, 1]
    columns['stop_lidx'] = block_stop[:, 0]
    columns['stop_ridx'] = block_stop[:, 1]
    columns['size_min'] = size_min
    columns['size_max'] = size_max
    columns['is_marginal'] = is_marginal

    # position-based columns, only when positions were supplied
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # translate switch indices into positions, then flag the two outer
        # boundaries with -1
        switch_positions = np.take(pos, switch_points)
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        pos_start = switch_positions[:-1]
        pos_stop = switch_positions[1:]
        length_min = pos_stop[:, 0] - pos_start[:, 1] + 1
        length_max = pos_stop[:, 1] - pos_start[:, 0] - 1
        length_max[is_marginal] = -1

        columns['start_lpos'] = pos_start[:, 0]
        columns['start_rpos'] = pos_start[:, 1]
        columns['stop_lpos'] = pos_stop[:, 0]
        columns['stop_rpos'] = pos_stop[:, 1]
        columns['length_min'] = length_min
        columns['length_max'] = length_max

    # pandas is imported lazily, only once the columns are ready
    import pandas
    return pandas.DataFrame.from_dict(columns)
Ejemplo n.º 36
0
def windowed_statistic(pos, values, statistic, size=None, start=None,
                       stop=None, step=None, windows=None, fill=np.nan):
    """Apply a statistic to the items falling in each window along a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Item positions, 1-based, sorted in ascending order.
    values : array_like, int, shape (n_items,)
        The values to summarise. A tuple of arrays may be given instead, in
        which case every array is sliced per window and the slices are
        passed to `statistic` as separate positional arguments.
    statistic : function
        The statistic to compute.
    size : int, optional
        Window size in bases.
    start : int, optional
        First position to include (1-based).
    stop : int, optional
        Last position to include (1-based).
    step : int, optional
        Distance between consecutive window starts. When omitted the window
        size is used, giving non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Explicit (window_start, window_stop) pairs, 1-based. Takes
        precedence over size/start/stop/step.
    fill : object, optional
        Value reported for empty windows, i.e., windows containing no items.

    Returns
    -------

    out : ndarray, shape (n_windows,)
        The value of the statistic for each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as (window_start, window_stop) pairs, 1-based.
    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.

    Notes
    -----

    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------

    Count non-zero (i.e., True) items in non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> values = [True, False, True, False, False]
        >>> nnz, windows, counts = allel.stats.windowed_statistic(
        ...     pos, values, statistic=np.count_nonzero, size=10
        ... )
        >>> nnz
        array([1, 1, 0])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])
        >>> counts
        array([2, 2, 1])

    Compute a sum over items in half-overlapping windows::

        >>> values = [3, 4, 2, 6, 9]
        >>> x, windows, counts = allel.stats.windowed_statistic(
        ...     pos, values, statistic=np.sum, size=10, step=5, fill=0
        ... )
        >>> x
        array([ 7, 12,  8,  0,  9])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])
        >>> counts
        array([2, 3, 2, 0, 1])

    """

    # positions are assumed to be sorted
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # a tuple means several parallel values arrays; every one of them must
    # line up with the positions
    has_multiple_values = isinstance(values, tuple)
    if has_multiple_values:
        check_equal_length(pos, *values)
    else:
        check_equal_length(pos, values)

    # derive windows from size/start/stop/step unless given explicitly
    if windows is None:
        windows = position_windows(pos, size, start, stop, step)
    else:
        windows = asarray_ndim(windows, 2)

    # index range of the items covered by each window
    locs = window_locations(pos, windows)

    out = []
    counts = []
    for start_idx, stop_idx in locs:
        n_items = stop_idx - start_idx

        if n_items == 0:
            # nothing to summarise; the statistic is not called
            value = fill
        elif has_multiple_values:
            value = statistic(*[v[start_idx:stop_idx] for v in values])
        else:
            value = statistic(values[start_idx:stop_idx])

        out.append(value)
        counts.append(n_items)

    return np.asarray(out), windows, np.asarray(counts)
Ejemplo n.º 37
0
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Compute, per variant, the mean number of pairwise differences between
    chromosomes sampled from a single population.

    Parameters
    ----------

    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. Derived from `ac` (row sums) when not given.
    fill : float
        Value substituted wherever there are no pairs to compare (e.g.,
        every allele call is missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    The calculation works for any number of alleles per variant.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.mean_pairwise_difference(ac)
    array([0.        , 0.5       , 0.66666667, 0.5       , 0.        ,
           0.83333333, 0.83333333, 1.        ])

    See Also
    --------

    sequence_diversity, windowed_diversity

    """

    ac = asarray_ndim(ac, 2)

    # allele number (haplotypes called) per variant: validate a supplied
    # array, otherwise derive it from the counts
    if an is not None:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)
    else:
        an = np.sum(ac, axis=1)

    # all possible pairs of haplotypes at each variant: (an choose 2)
    total_pairs = an * (an - 1) / 2

    # pairs carrying the same allele: (ac choose 2) summed over alleles
    identical_pairs = np.sum(ac * (ac - 1) / 2, axis=1)

    # whatever is left are the pairs that differ
    differing_pairs = total_pairs - identical_pairs

    # variants with no pairs at all get the fill value; the division is
    # still evaluated for them, hence the suppression of invalid warnings
    with ignore_invalid():
        mpd = np.where(total_pairs > 0, differing_pairs / total_pairs, fill)

    return mpd
Ejemplo n.º 38
0
def roh_mhmm(gv,
             pos,
             phet_roh=0.001,
             phet_nonroh=(0.0025, 0.01),
             transition=1e-6,
             min_roh=0,
             is_accessible=None,
             contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Multinomial HMM model. There are 3
    observable states at each position in a chromosome/contig: 0 = Hom, 1 = Het,
    2 = inaccessible (i.e., unobserved).

    The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one
    or more probabilities of observing a het in a non-ROH, as this probability may not be
    constant across the genome (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos: array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh: float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh: tuple of floats, optional
        One or more probabilities of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity within
        the population, but also on mutation rate and genotype error rate.
        A single float is also accepted.
    transition: float, optional
        Probability of moving between states.
    min_roh: integer, optional
        Minimum size (bp) to consider as a ROH. Will depend on contig size
        and recombination rate.
    is_accessible: array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether accessible
        or not. May be omitted, in which case `contig_size` is required and
        every position is treated as accessible.
    contig_size: int, optional
        If is_accessible not known/not provided, allows specification of
        total length of contig. Ignored when `is_accessible` is given (its
        size is used instead).

    Returns
    -------
    df_roh: DataFrame
        Data frame where each row describes a run of homozygosity. Columns are 'start',
        'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive.
    froh: float
        Proportion of genome in a ROH.

    Raises
    ------
    ValueError
        If neither `is_accessible` nor `contig_size` is provided.

    Notes
    -----
    This function requires `hmmlearn <http://hmmlearn.readthedocs.io/en/latest/>`_ to be
    installed.

    This function currently requires around 4GB memory for a contig size of ~50Mbp.

    """

    from hmmlearn import hmm

    # setup inputs
    if isinstance(phet_nonroh, float):
        # allow a bare float in place of a 1-tuple
        phet_nonroh = phet_nonroh,
    gv = GenotypeVector(gv)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(gv, pos)
    # is_accessible is optional (default None): let None pass through so
    # that the contig_size fallback below can actually be reached, as is
    # done for is_accessible by the other functions in this module
    is_accessible = asarray_ndim(is_accessible, 1, dtype=bool,
                                 allow_none=True)

    # heterozygote probabilities: ROH state first, then each non-ROH state
    het_px = np.concatenate([(phet_roh, ), phet_nonroh])

    # start probabilities (all equal)
    start_prob = np.repeat(1 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    # probability of inaccessible
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessibile argument is not provided, you must provide contig_size"
            )
        p_accessible = 1.0
    else:
        p_accessible = is_accessible.mean()
        contig_size = is_accessible.size

    emission_mx = _mhmm_derive_emission_matrix(het_px, p_accessible)

    # initialize HMM with fixed (not fitted) parameters
    roh_hmm = hmm.MultinomialHMM(n_components=het_px.size)
    roh_hmm.n_symbols_ = 3
    roh_hmm.startprob_ = start_prob
    roh_hmm.transmat_ = transition_mx
    roh_hmm.emissionprob_ = emission_mx

    # locate heterozygous calls
    is_het = gv.is_het()

    # predict ROH state
    pred, obs = _mhmm_predict_roh_state(roh_hmm, is_het, pos, is_accessible,
                                        contig_size)

    # find ROH windows: state 0 is the ROH state (phet_roh comes first in
    # het_px)
    df_blocks = tabulate_state_blocks(pred, states=list(range(len(het_px))))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)
    # adapt the dataframe for ROH
    for col in 'state', 'support', 'start_lidx', 'stop_ridx', 'size_max':
        del df_roh[col]
    df_roh.rename(columns={
        'start_ridx': 'start',
        'stop_lidx': 'stop',
        'size_min': 'length'
    },
                  inplace=True)
    # make coordinates 1-based
    df_roh['start'] = df_roh['start'] + 1
    df_roh['stop'] = df_roh['stop'] + 1

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH (after size filtering, so short runs do not contribute)
    froh = df_roh.length.sum() / contig_size

    return df_roh, froh
Ejemplo n.º 39
0
def windowed_count(pos, size=None, start=None, stop=None, step=None,
                   windows=None):
    """Count how many items fall in each window along a single
    chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Item positions, 1-based, sorted in ascending order.
    size : int, optional
        Window size in bases.
    start : int, optional
        First position to include (1-based).
    stop : int, optional
        Last position to include (1-based).
    step : int, optional
        Distance between consecutive window starts. When omitted the window
        size is used, giving non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Explicit (window_start, window_stop) pairs, 1-based. Takes
        precedence over size/start/stop/step.

    Returns
    -------

    counts : ndarray, int, shape (n_windows,)
        The number of items in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as (window_start, window_stop) pairs, 1-based.

    Notes
    -----

    The window stop positions are included within a window.

    The final window will be truncated to the specified stop position,
    and so may be smaller than the other windows.

    Examples
    --------

    Non-overlapping windows::

        >>> import allel
        >>> pos = [1, 7, 12, 15, 28]
        >>> counts, windows = allel.stats.windowed_count(pos, size=10)
        >>> counts
        array([2, 2, 1])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 28]])

    Half-overlapping windows::

        >>> counts, windows = allel.stats.windowed_count(pos, size=10, step=5)
        >>> counts
        array([2, 3, 2, 0, 1])
        >>> windows
        array([[ 1, 10],
               [ 6, 15],
               [11, 20],
               [16, 25],
               [21, 28]])

    """

    # positions are assumed to be sorted
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)

    # explicit windows win; otherwise derive them from size/start/stop/step
    if windows is not None:
        windows = asarray_ndim(windows, 2)
    else:
        windows = position_windows(pos, size, start, stop, step)

    # index range of the items covered by each window; the width of each
    # range is the item count for that window
    index_ranges = window_locations(pos, windows)
    counts = np.diff(index_ranges, axis=1).reshape(-1)

    return counts, windows
Ejemplo n.º 40
0
def standardize_by_allele_count(score,
                                aac,
                                bins=None,
                                n_bins=None,
                                diagnostics=True):
    """Standardize `score` within allele frequency bins.

    Within each bin the mean and standard deviation of the non-NaN scores
    are computed, and every score in the bin is transformed to
    ``(score - mean) / std``.

    Parameters
    ----------
    score : array_like, float
        The score to be standardized, e.g., IHS or NSL.
    aac : array_like, int
        An array of alternate allele counts.
    bins : array_like, int, optional
        Allele count bins, overrides `n_bins`.
    n_bins : int, optional
        Number of allele count bins to use. Defaults to half the maximum
        value in `aac`.
    diagnostics : bool, optional
        If True, plot some diagnostic information about the standardization.

    Returns
    -------
    score_standardized : ndarray, float
        Standardized scores.
    bins : ndarray, int
        Allele count bins used for standardization.

    Notes
    -----
    When applying the standardization, the first bin also takes in variants
    with an allele count below its lower edge and the last bin also takes in
    variants with an allele count at or above its upper edge, so that every
    variant is assigned to a bin. If there is only a single bin, it is used
    for all variants.

    """

    from scipy.stats import binned_statistic

    # check inputs
    score = asarray_ndim(score, 1)
    aac = asarray_ndim(aac, 1)
    check_dim0_aligned(score, aac)

    # remove nans so they do not contaminate the per-bin mean and std
    nonan = ~np.isnan(score)
    score_nonan = score[nonan]
    aac_nonan = aac[nonan]

    if bins is None:
        # make our own similar sized bins

        # how many bins to make?
        if n_bins is None:
            # something vaguely reasonable
            n_bins = np.max(aac) // 2

        # make bins
        bins = make_similar_sized_bins(aac_nonan, n_bins)

    else:
        # user-provided bins
        bins = asarray_ndim(bins, 1)

    # per-bin mean and standard deviation of the unstandardized scores
    mean_score, _, _ = binned_statistic(aac_nonan,
                                        score_nonan,
                                        statistic=np.mean,
                                        bins=bins)
    std_score, _, _ = binned_statistic(aac_nonan,
                                       score_nonan,
                                       statistic=np.std,
                                       bins=bins)

    if diagnostics:
        import matplotlib.pyplot as plt
        # plot against bin midpoints
        x = (bins[:-1] + bins[1:]) / 2
        plt.figure()
        plt.fill_between(x,
                         mean_score - std_score,
                         mean_score + std_score,
                         alpha=.5,
                         label='std')
        plt.plot(x, mean_score, marker='o', label='mean')
        plt.grid(axis='y')
        plt.xlabel('Alternate allele count')
        plt.ylabel('Unstandardized score')
        plt.title('Standardization diagnostics')
        plt.legend()

    # apply standardization
    score_standardized = np.empty_like(score)
    n_intervals = len(bins) - 1
    for i in range(n_intervals):
        x1 = bins[i]
        x2 = bins[i + 1]
        if n_intervals == 1:
            # only one bin: it is both first and last, so it must take in
            # every variant; otherwise variants with aac >= x2 would be left
            # as uninitialised memory from np.empty_like
            loc = np.ones(aac.shape, dtype=bool)
        elif i == 0:
            # first bin, open-ended on the left
            loc = (aac < x2)
        elif i == n_intervals - 1:
            # last bin, open-ended on the right
            loc = (aac >= x1)
        else:
            # middle bins
            loc = (aac >= x1) & (aac < x2)
        m = mean_score[i]
        s = std_score[i]
        score_standardized[loc] = (score[loc] - m) / s

    return score_standardized, bins
Ejemplo n.º 41
0
def sequence_divergence(pos,
                        ac1,
                        ac2,
                        an1=None,
                        an2=None,
                        start=None,
                        stop=None,
                        is_accessible=None):
    """Estimate nucleotide divergence (Dxy) between two populations over a
    single region.

    The per-variant mean number of pairwise differences between the two
    populations is summed over the region and divided by the number of bases
    in the region, so monomorphic sites that are absent from the data still
    count towards the denominator.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in
        the chromosome/contig. When given, only accessible bases between
        `start` and `stop` are counted in the denominator.

    Returns
    -------

    Dxy : float
        Nucleotide divergence over the region (a single value).

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    if an1 is not None:
        an1 = asarray_ndim(an1, 1)
    if an2 is not None:
        an2 = asarray_ndim(an2, 1)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict every per-variant array to the requested region
    region_requested = not (start is None and stop is None)
    if region_requested:
        region = pos.locate_range(start, stop)
        pos, ac1, ac2 = pos[region], ac1[region], ac2[region]
        if an1 is not None:
            an1 = an1[region]
        if an2 is not None:
            an2 = an2[region]

    # a missing bound falls back to the outermost remaining variant
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # per-variant differences between populations; fill=0 so that variants
    # without any comparable pair add nothing to the sum
    per_variant = mean_pairwise_difference_between(ac1, ac2,
                                                   an1=an1, an2=an2,
                                                   fill=0)
    total_diff = np.sum(per_variant)

    # size of the region in bases, N.B., positions are 1-based so the
    # accessibility array is sliced from start - 1
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
Ejemplo n.º 42
0
def compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible):
    """Compute the spacing between consecutive variants used when
    integrating haplotype homozygosity.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance). If None, the physical
        positions are used for integration.
    gap_scale : int
        Rescale distance between variants if the physical gap is larger than
        this value. Ignored if None, not positive, or if `is_accessible` is
        given.
    max_gap : int
        Gaps whose physical size exceeds this number of base pairs are set
        to -1. Ignored if None or not positive.
    is_accessible : array_like, bool
        Genome accessibility array. If not None, each gap is scaled by the
        fraction of accessible bases between the two variants.

    Returns
    -------
    gaps : ndarray, float, shape (n_variants - 1,)

    """

    # choose the coordinate system to integrate over
    if map_pos is not None:
        map_pos = asarray_ndim(map_pos, 1)
        check_dim0_aligned(pos, map_pos)
    else:
        # integrate over physical distance
        map_pos = pos

    # distances between neighbours, physical and (possibly) genetic
    bp_gaps = np.diff(pos)
    gaps = np.diff(map_pos).astype('f8')

    if is_accessible is not None:

        is_accessible = asarray_ndim(is_accessible, 1)
        assert is_accessible.shape[0] > pos[-1], \
            'accessibility array too short'

        # count accessible bases inside each gap
        n_accessible = np.zeros_like(bp_gaps)
        for k in range(len(pos) - 1):
            # N.B., expect pos is 1-based
            lo = pos[k] - 1
            hi = pos[k + 1] - 1
            n_accessible[k] = np.count_nonzero(is_accessible[lo:hi])

        # scale each gap by its accessible fraction
        gaps = gaps * (n_accessible / bp_gaps)

    elif gap_scale is not None and gap_scale > 0:

        # only gaps physically wider than gap_scale are shrunk
        is_wide = bp_gaps > gap_scale
        factor = np.ones(gaps.shape, dtype='f8')
        factor[is_wide] = gap_scale / bp_gaps[is_wide]
        gaps = gaps * factor

    if max_gap is not None and max_gap > 0:

        # flag very large gaps with -1
        too_large = bp_gaps > max_gap
        gaps[too_large] = -1

    return gaps
Ejemplo n.º 43
0
def windowed_divergence(pos,
                        ac1,
                        ac2,
                        size=None,
                        start=None,
                        stop=None,
                        step=None,
                        windows=None,
                        is_accessible=None,
                        fill=np.nan):
    """Estimate nucleotide divergence (Dxy) between two populations in
    windows over a single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([0.15 , 0.225, 0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant mean pairwise difference between the populations; fill=0
    # so variants without comparable pairs do not contribute to window sums
    per_variant = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # window definition, forwarded unchanged to the windowing helper
    window_spec = dict(size=size, start=start, stop=stop, step=step,
                       windows=windows)

    # total differences within each window
    window_sums, windows, counts = windowed_statistic(pos,
                                                      values=per_variant,
                                                      statistic=np.sum,
                                                      fill=0,
                                                      **window_spec)

    # convert window totals into per-base values
    dxy, n_bases = per_base(window_sums,
                            windows,
                            is_accessible=is_accessible,
                            fill=fill)

    return dxy, windows, n_bases, counts
Ejemplo n.º 44
0
def tabulate_state_transitions(x, states, pos=None):
    """Build a dataframe with one row per transition between states.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be
        ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`. If given, the
        dataframe gains `lpos` and `rpos` columns.

    Returns
    -------
    df : DataFrame
        Columns `lstate`, `rstate`, `lidx`, `ridx` and, when `pos` is given,
        `lpos` and `rpos`.

    Notes
    -----
    The resulting dataframe includes one row at the start representing the
    first state observation and one row at the end representing the last
    state observation. Missing values on the outer side of these two rows
    are reported as -1.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2})
    >>> df
       lstate  rstate  lidx  ridx
    0      -1       1    -1     0
    1       1       2     4     5
    2       2       1     8     9
    3       1      -1    10    -1
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos)
    >>> df
       lstate  rstate  lidx  ridx  lpos  rpos
    0      -1       1    -1     0    -1     2
    1       1       2     4     5    10    14
    2       2       1     8     9    28    30
    3       1      -1    10    -1    31    -1

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # locate transitions between the states of interest
    switch_points, transitions, _ = state_transitions(x, states)

    # columns are inserted in the order they should appear in the dataframe
    columns = OrderedDict()
    columns['lstate'] = transitions[:, 0]
    columns['rstate'] = transitions[:, 1]
    columns['lidx'] = switch_points[:, 0]
    columns['ridx'] = switch_points[:, 1]

    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # translate switch indices into positions
        switch_positions = np.take(pos, switch_points)

        # the outer side of the first and last rows has no real position,
        # so overwrite whatever the lookup produced there with -1
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        columns['lpos'] = switch_positions[:, 0]
        columns['rpos'] = switch_positions[:, 1]

    import pandas
    return pandas.DataFrame.from_dict(columns)
Ejemplo n.º 45
0
def windowed_watterson_theta(
    pos, ac, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Compute Watterson's estimator (theta hat, per base) in windows over a
    single chromosome/contig.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    theta_hat_w : ndarray, float, shape (n_windows,)
        Watterson's estimator (theta hat per base).
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Notes
    -----

    The number of chromosomes sampled is assumed to be the same for all
    variants; the maximum total allele count over all variants is used.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w, windows, n_bases, counts = allel.stats.windowed_watterson_theta(
    ...     pos, ac, size=10, start=1, stop=31
    ... )
    >>> theta_hat_w
    array([ 0.10909091,  0.16363636,  0.04958678])
    >>> windows
    array([[ 1, 10],
           [11, 20],
           [21, 31]])
    >>> n_bases
    array([10, 10, 11])
    >>> counts
    array([3, 4, 2])

    """  # flake8: noqa

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # window definition, forwarded unchanged to the windowing helper
    window_spec = dict(size=size, start=start, stop=stop, step=step, windows=windows)

    # number of segregating variants per window
    n_segregating, windows, counts = windowed_statistic(
        pos, ac.is_segregating(), statistic=np.count_nonzero, fill=0, **window_spec
    )

    # assume number of chromosomes sampled is constant for all variants
    n_chrom = ac.sum(axis=1).max()

    # (n-1)th harmonic number, the normalising constant of the estimator
    harmonic = np.sum(1 / np.arange(1, n_chrom))

    # absolute theta per window, then rescaled to a per-base value
    theta_hat_w, n_bases = per_base(
        n_segregating / harmonic, windows=windows, is_accessible=is_accessible, fill=fill
    )

    return theta_hat_w, windows, n_bases, counts
Ejemplo n.º 46
0
def xpehh(h1,
          h2,
          pos,
          map_pos=None,
          min_ehh=0.05,
          include_edges=False,
          gap_scale=20000,
          max_gap=200000,
          is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute. Threads are only used when
        more than one CPU is reported.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # spacing between variants used for the integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # options shared by every scan
    scan_opts = dict(min_ehh=min_ehh, include_edges=include_edges)

    # the four scans to run: forward for each population, then backward
    # (reversed haplotypes and gaps) for each population
    scan_jobs = [
        (h1, gaps),
        (h2, gaps),
        (h1[::-1], gaps[::-1]),
        (h2[::-1], gaps[::-1]),
    ]

    if use_threads and multiprocessing.cpu_count() > 1:
        # run the scans concurrently on at most four threads
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))
        pending = [pool.apply_async(ihh_scan, job, scan_opts)
                   for job in scan_jobs]

        # wait for all scans to finish
        pool.close()
        pool.join()

        # collect results in submission order
        ihh1_fwd, ihh2_fwd, ihh1_rev, ihh2_rev = [p.get() for p in pending]

        # cleanup
        pool.terminate()

    else:
        # run the scans one after another
        ihh1_fwd, ihh2_fwd, ihh1_rev, ihh2_rev = [
            ihh_scan(hap, gap, **scan_opts) for hap, gap in scan_jobs
        ]

    # backward scans come out in reversed variant order; flip them back and
    # combine with the forward scans
    ihh1 = ihh1_fwd + ihh1_rev[::-1]
    ihh2 = ihh2_fwd + ihh2_rev[::-1]

    # unstandardized score
    return np.log(ihh1 / ihh2)
Ejemplo n.º 47
0
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None, fill=np.nan):
    """Compute, for each variant, the mean number of pairwise differences
    between chromosomes drawn one from each of two populations.

    The calculation works on allele counts and generalises to any number of
    alleles.

    Parameters
    ----------

    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac1 = h.count_alleles(subpop=[0, 1])
    >>> ac2 = h.count_alleles(subpop=[2, 3])
    >>> allel.stats.mean_pairwise_difference_between(ac1, ac2)
    array([ 0.  ,  0.5 ,  1.  ,  0.5 ,  0.  ,  1.  ,  0.75,   nan])

    See Also
    --------

    sequence_divergence, windowed_divergence

    """

    # normalise and align the two allele counts arrays
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # haplotypes sampled per variant in the first population
    if an1 is not None:
        an1 = asarray_ndim(an1, 1)
        check_dim0_aligned(ac1, an1)
    else:
        an1 = np.sum(ac1, axis=1)

    # haplotypes sampled per variant in the second population
    if an2 is not None:
        an2 = asarray_ndim(an2, 1)
        check_dim0_aligned(ac2, an2)
    else:
        an2 = np.sum(ac2, axis=1)

    # every cross-population pair of haplotypes
    total_pairs = an1 * an2

    # pairs carrying the same allele: for each allele, the number of ways
    # to pick it once from each population
    identical_pairs = np.sum(ac1 * ac2, axis=1)

    # pairs carrying different alleles
    differing_pairs = total_pairs - identical_pairs

    # divide, substituting `fill` where there is nothing to compare; the
    # division is still evaluated for those variants, hence the guard
    with ignore_invalid():
        mpd = np.where(total_pairs > 0, differing_pairs / total_pairs, fill)

    return mpd
Ejemplo n.º 48
0
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate nucleotide diversity (pi) over a single region.

    The per-variant mean number of pairwise differences is summed over the
    region and divided by the number of bases in the region, so monomorphic
    sites that are absent from the data still count towards the denominator.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        The position at which to start (1-based). Defaults to the first
        position.
    stop : int, optional
        The position at which to stop (1-based). Defaults to the last
        position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig. When given, only accessible bases between `start`
        and `stop` are counted in the denominator.

    Returns
    -------

    pi : float
        Nucleotide diversity over the region (a single value).

    Notes
    -----

    If start and/or stop are not provided, uses the difference between the last
    and the first position as a proxy for the total number of sites, which can
    overestimate the sequence diversity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict positions and counts to the requested region
    region_requested = not (start is None and stop is None)
    if region_requested:
        region = pos.locate_range(start, stop)
        pos, ac = pos[region], ac[region]

    # a missing bound falls back to the outermost remaining variant
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # per-variant differences; fill=0 so that variants without any
    # comparable pair add nothing to the sum
    total_diff = np.sum(mean_pairwise_difference(ac, fill=0))

    # size of the region in bases, N.B., positions are 1-based so the
    # accessibility array is sliced from start - 1
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1:stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
Ejemplo n.º 49
0
def plot_sfs(s, yscale='log', bins=None, n=None,
             clip_endpoints=True, label=None, plot_kwargs=None,
             ax=None):
    """Plot a site frequency spectrum.

    Parameters
    ----------
    s : array_like, int, shape (n_chromosomes,)
        Site frequency spectrum.
    yscale : string, optional
        Y axis scale.
    bins : int or array_like, int, optional
        Allele count bins. If given, site counts are summed within each bin
        and plotted at the bin midpoints.
    n : int, optional
        Number of chromosomes sampled. If provided, X axis will be plotted
        as allele frequency, otherwise as allele count.
    clip_endpoints : bool, optional
        If True, do not plot first and last values from frequency spectrum.
    label : string, optional
        Label for data series in plot.
    plot_kwargs : dict-like
        Additional keyword arguments, passed through to ax.plot().
    ax : axes, optional
        Axes on which to draw. If not provided, a new figure will be created.

    Returns
    -------
    ax : axes
        The axes on which the plot was drawn.

    """

    import matplotlib.pyplot as plt
    # import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is available as an attribute
    import scipy.stats

    # check inputs
    s = asarray_ndim(s, 1)

    # setup axes
    if ax is None:
        _, ax = plt.subplots()

    # allele counts (x) and site counts (y), optionally without the first
    # and last classes of the spectrum
    if clip_endpoints:
        x = np.arange(1, s.shape[0]-1)
        y = s[1:-1]
    else:
        x = np.arange(s.shape[0])
        y = s

    if bins is not None:
        # sum site counts within allele count bins
        y, b, _ = scipy.stats.binned_statistic(
            x,
            values=y,
            bins=bins,
            statistic='sum')
        # use bin midpoints for plotting
        x = (b[:-1] + b[1:]) / 2

    if n:
        # convert allele counts to allele frequencies
        x = x / n
        ax.set_xlabel('derived allele frequency')
    else:
        ax.set_xlabel('derived allele count')

    # do plotting
    if plot_kwargs is None:
        plot_kwargs = dict()
    ax.plot(x, y, label=label, **plot_kwargs)

    # tidy
    ax.set_yscale(yscale)
    ax.set_ylabel('site frequency')
    ax.autoscale(axis='x', tight=True)

    return ax
Ejemplo n.º 50
0
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Compute, for each variant, the mean number of pairwise differences
    between chromosomes drawn from a single population.

    The calculation works on allele counts and generalises to any number of
    alleles.

    Parameters
    ----------

    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.stats.mean_pairwise_difference(ac)
    array([ 0.        ,  0.5       ,  0.66666667,  0.5       ,  0.        ,
            0.83333333,  0.83333333,  1.        ])

    See Also
    --------

    sequence_diversity, windowed_diversity

    """

    # normalise the allele counts array
    ac = asarray_ndim(ac, 2)

    # haplotypes sampled per variant
    if an is not None:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)
    else:
        an = np.sum(ac, axis=1)

    # every unordered pair of haplotypes: (an choose 2)
    total_pairs = an * (an - 1) / 2

    # pairs carrying the same allele: sum over alleles of (ac choose 2)
    identical_pairs = np.sum(ac * (ac - 1) / 2, axis=1)

    # pairs carrying different alleles
    differing_pairs = total_pairs - identical_pairs

    # divide, substituting `fill` where there is nothing to compare; the
    # division is still evaluated for those variants, hence the guard
    with ignore_invalid():
        mpd = np.where(total_pairs > 0, differing_pairs / total_pairs, fill)

    return mpd
Ejemplo n.º 51
0
def standardize(score):
    """Centre scores on zero and scale them to unit variance, ignoring NaN
    values when computing the mean and standard deviation.

    Parameters
    ----------
    score : array_like, float, shape (n_variants,)
        Scores to standardize.

    Returns
    -------
    score_standardized : ndarray, float, shape (n_variants,)

    """
    score = asarray_ndim(score, 1)
    centre = np.nanmean(score)
    spread = np.nanstd(score)
    return (score - centre) / spread
Ejemplo n.º 52
0
def sequence_diversity(pos, ac, start=None, stop=None, is_accessible=None):
    """Estimate nucleotide diversity (pi) over a single genomic region.

    The mean pairwise difference is computed for every variant in the
    region, summed, and divided by the number of (accessible) bases.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        First position of the region (1-based). Defaults to the first
        variant position.
    stop : int, optional
        Last position of the region (1-based, inclusive). Defaults to the
        last variant position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig. If given, only accessible bases within the region
        are counted in the denominator.

    Returns
    -------

    pi : float
        Nucleotide diversity per base for the region.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> pi = allel.stats.sequence_diversity(pos, ac, start=1, stop=31)
    >>> pi
    0.13978494623655915

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac = asarray_ndim(ac, 2)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict to the requested region, if any bound was given
    if not (start is None and stop is None):
        loc = pos.locate_range(start, stop)
        pos, ac = pos[loc], ac[loc]

    # missing bounds default to the outermost variant positions
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # total of per-variant mean pairwise differences; variants with no
    # pairs contribute zero
    total_diff = np.sum(mean_pairwise_difference(ac, fill=0))

    # size of the region in bases; positions are 1-based and inclusive, hence
    # the shift when slicing the 0-based accessibility mask
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1 : stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
Ejemplo n.º 53
0
def fig_voight_painting(h,
                        index=None,
                        palette='colorblind',
                        height_factor=0.01,
                        fig=None):
    """Draw a two-by-two figure of shared haplotype prefixes and EHH decay
    for the left and right flanks around a chosen variant.

    The top row holds the Voight paintings of each flank, the bottom row the
    corresponding EHH decay curves.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    index : int, optional
        Index of the variant within the haplotype array to centre on. If not
        provided, the middle variant will be used. The centre variant is
        included in both flanks.
    palette : string, optional
        A Seaborn palette name.
    height_factor : float, optional
        The height given to the painting row is this number multiplied by
        the number of haplotypes. It sets the height of a newly created
        figure and the row height ratios of the grid.
    fig : figure
        The figure on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    fig : figure

    Notes
    -----
    N.B., the ordering of haplotypes on the left and right flanks will be
    different. This means that haplotypes on the right flank **will not**
    correspond to haplotypes on the left flank at the same vertical position.

    """

    import matplotlib.pyplot as plt
    from matplotlib.gridspec import GridSpec
    import seaborn as sns

    # normalise inputs; default to the midpoint variant
    h = asarray_ndim(h, 2)
    if index is None:
        index = h.shape[0] // 2

    # split into flanks, the left one reversed so that both run outwards
    # from the centre variant
    left_flank = h[:index + 1][::-1]
    right_flank = h[index:]

    # paintings (the second return value is not needed here)
    painting_left, _ = voight_painting(left_flank)
    painting_right, _ = voight_painting(right_flank)

    # EHH decay along each flank
    ehh_left = ehh_decay(left_flank, truncate=False)
    ehh_right = ehh_decay(right_flank, truncate=False)

    # row heights: a fixed share of the default figure height for the EHH
    # row, plus a painting row that grows with the number of haplotypes
    default_figsize = plt.rcParams['figure.figsize']
    ehh_height = default_figsize[1] // 3
    painting_height = height_factor * h.shape[1]
    if fig is None:
        fig = plt.figure(
            figsize=(default_figsize[0], ehh_height + painting_height)
        )

    # column widths follow the number of variants in each flank
    gs = GridSpec(2,
                  2,
                  width_ratios=[left_flank.shape[0], right_flank.shape[0]],
                  height_ratios=[painting_height, ehh_height])

    # top row: paintings
    panels = ((painting_left, 'left'), (painting_right, 'right'))
    for col, (painting, flank) in enumerate(panels):
        ax = fig.add_subplot(gs[0, col])
        sns.despine(ax=ax, left=True, bottom=True)
        plot_voight_painting(painting, palette=palette, flank=flank, ax=ax)

    # bottom left: EHH decay, x axis inverted so distance grows leftwards
    ax_left = fig.add_subplot(gs[1, 0])
    sns.despine(ax=ax_left, offset=3)
    ax_left.fill_between(np.arange(ehh_left.shape[0]), 0, ehh_left)
    ax_left.set_ylim(0, 1)
    ax_left.set_yticks([0, 1])
    ax_left.set_ylabel('EHH')
    ax_left.invert_xaxis()

    # bottom right: EHH decay with the y axis drawn on the right-hand side
    ax_right = fig.add_subplot(gs[1, 1])
    sns.despine(ax=ax_right, left=True, right=False, offset=3)
    ax_right.yaxis.tick_right()
    ax_right.set_ylim(0, 1)
    ax_right.set_yticks([0, 1])
    ax_right.fill_between(np.arange(ehh_right.shape[0]), 0, ehh_right)

    # tidy up
    fig.tight_layout()

    return fig
Ejemplo n.º 54
0
def sequence_divergence(pos, ac1, ac2, an1=None, an2=None, start=None, stop=None, is_accessible=None):
    """Estimate nucleotide divergence (Dxy) between two populations over a
    single genomic region.

    The mean pairwise difference between the two populations is computed for
    every variant in the region, summed, and divided by the number of
    (accessible) bases.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. Restricted to the region
        along with `ac1` and forwarded to `mean_pairwise_difference_between`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. Restricted to the region
        along with `ac2` and forwarded to `mean_pairwise_difference_between`.
    start : int, optional
        First position of the region (1-based). Defaults to the first
        variant position.
    stop : int, optional
        Last position of the region (1-based, inclusive). Defaults to the
        last variant position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig. If given, only accessible bases within the region
        are counted in the denominator.

    Returns
    -------

    Dxy : float
        Nucleotide divergence per base for the region.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy = sequence_divergence(pos, ac1, ac2, start=1, stop=31)
        >>> dxy
        0.12096774193548387

    """

    # normalise inputs
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    an1 = None if an1 is None else asarray_ndim(an1, 1)
    an2 = None if an2 is None else asarray_ndim(an2, 1)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # restrict every per-variant input to the requested region
    if not (start is None and stop is None):
        loc = pos.locate_range(start, stop)
        pos, ac1, ac2 = pos[loc], ac1[loc], ac2[loc]
        an1 = None if an1 is None else an1[loc]
        an2 = None if an2 is None else an2[loc]

    # missing bounds default to the outermost variant positions
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # total of per-variant mean pairwise differences between populations
    between = mean_pairwise_difference_between(ac1, ac2, an1=an1, an2=an2, fill=0)
    total_diff = np.sum(between)

    # size of the region in bases; positions are 1-based and inclusive, hence
    # the shift when slicing the 0-based accessibility mask
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1 : stop])
    else:
        n_bases = stop - start + 1

    return total_diff / n_bases
Ejemplo n.º 55
0
def ihs(h,
        pos,
        map_pos=None,
        min_ehh=0.05,
        min_maf=0.05,
        include_edges=False,
        gap_scale=20000,
        max_gap=200000,
        is_accessible=None,
        use_threads=True):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Integrated haplotype homozygosity is obtained from a forward and a
    backward scan over the variants (`ihh01_scan`); the two are summed per
    allele and the score is ``log(ihh1 / ihh0)``.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozygosity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True, and more than one CPU is available, run the forward and
        backward scans concurrently in a pool of two threads.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----

    This function computes IHS comparing the reference and alternate alleles.
    These can be polarised by switching the sign for any variant where the
    reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Variants can additionally be excluded by minor allele frequency by
    filtering the input haplotype array before passing it to this function.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # normalise and validate inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # distances between adjacent variants, used as integration weights
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # options shared by both scan directions
    scan_options = {
        'min_ehh': min_ehh,
        'min_maf': min_maf,
        'include_edges': include_edges,
    }

    threaded = use_threads and multiprocessing.cpu_count() > 1
    if not threaded:
        # sequential: forward scan, then backward scan over reversed inputs
        ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **scan_options)
        ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **scan_options)

    else:
        # one worker thread per scan direction
        pool = ThreadPool(2)
        pending_fwd = pool.apply_async(ihh01_scan, (h, gaps), scan_options)
        pending_rev = pool.apply_async(ihh01_scan, (h[::-1], gaps[::-1]),
                                       scan_options)

        # no further tasks; block until both scans have completed
        pool.close()
        pool.join()

        ihh0_fwd, ihh1_fwd = pending_fwd.get()
        ihh0_rev, ihh1_rev = pending_rev.get()

        pool.terminate()

    # the backward scan ran over reversed data, so flip its results back
    # into variant order before combining
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # combine both directions per allele and take the log ratio
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    return np.log(ihh1 / ihh0)
Ejemplo n.º 56
0
def windowed_divergence(
    pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Estimate nucleotide divergence (Dxy) between two populations in
    windows along a single chromosome/contig.

    The per-variant mean pairwise difference between the populations is
    summed within each window and then expressed per (accessible) base.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    Dxy : ndarray, float, shape (n_windows,)
        Nucleotide divergence in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    Examples
    --------

    Simplest case, two haplotypes in each population::

        >>> import allel
        >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
        ...                           [0, 0, 0, 1],
        ...                           [0, 0, 1, 1],
        ...                           [0, 1, 1, 1],
        ...                           [1, 1, 1, 1],
        ...                           [0, 0, 1, 2],
        ...                           [0, 1, 1, 2],
        ...                           [0, 1, -1, -1],
        ...                           [-1, -1, -1, -1]])
        >>> ac1 = h.count_alleles(subpop=[0, 1])
        >>> ac2 = h.count_alleles(subpop=[2, 3])
        >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
        >>> dxy, windows, n_bases, counts = windowed_divergence(
        ...     pos, ac1, ac2, size=10, start=1, stop=31
        ... )
        >>> dxy
        array([ 0.15 ,  0.225,  0.   ])
        >>> windows
        array([[ 1, 10],
               [11, 20],
               [21, 31]])
        >>> n_bases
        array([10, 10, 11])
        >>> counts
        array([3, 4, 2])

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant divergence; variants with no pairs contribute zero
    per_variant = mean_pairwise_difference_between(ac1, ac2, fill=0)

    # sum the per-variant values inside each window (empty windows sum to 0)
    window_spec = dict(size=size, start=start, stop=stop, step=step, windows=windows)
    window_sums, windows, counts = windowed_statistic(
        pos, values=per_variant, statistic=np.sum, fill=0, **window_spec
    )

    # convert window totals into per-base values
    dxy, n_bases = per_base(window_sums, windows, is_accessible=is_accessible, fill=fill)

    return dxy, windows, n_bases, counts
Ejemplo n.º 57
0
def nsl(h, use_threads=True):
    """Compute the unstandardized number of segregating sites by length (nSl)
    for each variant, comparing the reference and alternate alleles,
    after Ferrer-Admetlla et al. (2014).

    A forward and a backward scan (`nsl01_scan`) are run over the variants;
    their results are summed per allele and the score is
    ``log(nsl1 / nsl0)``.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    use_threads : bool, optional
        If True, and more than one CPU is available, run the forward and
        backward scans concurrently in a pool of two threads.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized nSl scores.

    Notes
    -----
    This function will calculate nSl for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function. Invariant sites are not checked for
    here.

    This function computes nSl by comparing the reference and alternate
    alleles. These can be polarised by switching the sign for any variant where
    the reference allele is derived.

    This function does nothing about nSl calculations where haplotype
    homozygosity extends up to the first or last variant. There may be edge
    effects.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # normalise and validate inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)

    threaded = use_threads and multiprocessing.cpu_count() > 1
    if not threaded:
        # sequential: forward scan, then backward scan over reversed data
        nsl0_fwd, nsl1_fwd = nsl01_scan(h)
        nsl0_rev, nsl1_rev = nsl01_scan(h[::-1])

    else:
        # one worker thread per scan direction
        pool = ThreadPool(2)
        pending_fwd = pool.apply_async(nsl01_scan, (h,))
        pending_rev = pool.apply_async(nsl01_scan, (h[::-1],))

        # no further tasks; block until both scans have completed
        pool.close()
        pool.join()

        nsl0_fwd, nsl1_fwd = pending_fwd.get()
        nsl0_rev, nsl1_rev = pending_rev.get()

    # the backward scan ran over reversed data, so flip its results back
    # into variant order before combining
    nsl0_rev = nsl0_rev[::-1]
    nsl1_rev = nsl1_rev[::-1]

    # combine both directions per allele and take the log ratio
    nsl0 = nsl0_fwd + nsl0_rev
    nsl1 = nsl1_fwd + nsl1_rev
    return np.log(nsl1 / nsl0)
Ejemplo n.º 58
0
def windowed_df(
    pos, ac1, ac2, size=None, start=None, stop=None, step=None, windows=None, is_accessible=None, fill=np.nan
):
    """Calculate the per-base density of fixed differences between two
    populations in windows along a single chromosome/contig.

    Fixed differences are located with `locate_fixed_differences`, counted
    within each window, and then expressed per (accessible) base.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array for the second population.
    size : int, optional
        The window size (number of bases).
    start : int, optional
        The position at which to start (1-based).
    stop : int, optional
        The position at which to stop (1-based).
    step : int, optional
        The distance between start positions of windows. If not given,
        defaults to the window size, i.e., non-overlapping windows.
    windows : array_like, int, shape (n_windows, 2), optional
        Manually specify the windows to use as a sequence of (window_start,
        window_stop) positions, using 1-based coordinates. Overrides the
        size/start/stop/step parameters.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig.
    fill : object, optional
        The value to use where a window is completely inaccessible.

    Returns
    -------

    df : ndarray, float, shape (n_windows,)
        Per-base density of fixed differences in each window.
    windows : ndarray, int, shape (n_windows, 2)
        The windows used, as an array of (window_start, window_stop) positions,
        using 1-based coordinates.
    n_bases : ndarray, int, shape (n_windows,)
        Number of (accessible) bases in each window.
    counts : ndarray, int, shape (n_windows,)
        Number of variants in each window.

    See Also
    --------

    allel.model.locate_fixed_differences

    """

    # normalise inputs
    pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)

    # per-variant flags marking fixed differences between the populations
    is_fixed = locate_fixed_differences(ac1, ac2)

    # count the flagged variants inside each window (empty windows count 0)
    window_spec = dict(size=size, start=start, stop=stop, step=step, windows=windows)
    n_fixed, windows, counts = windowed_statistic(
        pos, values=is_fixed, statistic=np.count_nonzero, fill=0, **window_spec
    )

    # convert window counts into per-base densities
    df, n_bases = per_base(n_fixed, windows, is_accessible=is_accessible, fill=fill)

    return df, windows, n_bases, counts
Ejemplo n.º 59
0
def recarray_from_hdf5_group(*args, **kwargs):
    """Load a structured array from columns stored as separate datasets
    within an HDF5 group.

    Either provide an h5py group as a single positional argument,
    or provide two positional arguments giving the HDF5 file path and the
    group node path within the file. In the latter case the file is opened
    read-only and closed again before returning.

    The following optional parameters may be given.

    Parameters
    ----------
    names : sequence of strings, optional
        Names of the datasets to load as columns. Defaults to every dataset
        found directly within the group.
    start : int, optional
        Index to start loading from.
    stop : int, optional
        Index to finish loading at.
    condition : array_like, bool, optional
        A 1-dimensional boolean array of the same length as the columns of the
        table to load, indicating a selection of rows to load. It is applied
        to the rows within `start`:`stop`.

    Returns
    -------
    ra : ndarray
        Structured array with one field per loaded dataset and one item per
        selected row.

    Raises
    ------
    ValueError
        If the positional arguments are malformed, the node is not a group,
        a requested name is missing or is not a dataset, the datasets differ
        in length, or `condition` does not match the dataset length.

    """

    import h5py

    h5f = None

    if len(args) == 1:
        group = args[0]

    elif len(args) == 2:
        file_path, node_path = args
        h5f = h5py.File(file_path, mode='r')
        try:
            group = h5f[node_path]
        except Exception:
            # do not leak the file handle if the node cannot be resolved
            h5f.close()
            raise

    else:
        raise ValueError('bad arguments; expected group or (file_path, '
                         'node_path), found %s' % repr(args))

    try:

        if not isinstance(group, h5py.Group):
            raise ValueError('expected group, found %r' % group)

        # determine dataset names to load
        available_dataset_names = [
            n for n in group.keys() if isinstance(group[n], h5py.Dataset)
        ]
        names = kwargs.pop('names', available_dataset_names)
        names = [str(n) for n in names]  # needed for PY2
        group_keys = set(group.keys())
        for n in names:
            if n not in group_keys:
                raise ValueError('name not found: %s' % n)
            if not isinstance(group[n], h5py.Dataset):
                raise ValueError('name does not refer to a dataset: %s, %r' %
                                 (n, group[n]))

        # check datasets are aligned
        datasets = [group[n] for n in names]
        length = datasets[0].shape[0]
        for d in datasets[1:]:
            if d.shape[0] != length:
                raise ValueError('datasets must be of equal length')

        # determine start and stop parameters for load
        start = kwargs.pop('start', 0)
        stop = kwargs.pop('stop', length)

        # check condition
        condition = kwargs.pop('condition', None)  # type: np.ndarray
        condition = asarray_ndim(condition, 1, allow_none=True)
        if condition is not None and condition.size != length:
            raise ValueError('length of condition does not match length '
                             'of datasets')

        # the part of the condition that lines up with the loaded rows
        selection = None if condition is None else condition[start:stop]

        # setup output data
        dtype = [(n, d.dtype, d.shape[1:]) for n, d in zip(names, datasets)]
        ra = None

        for n, d in zip(names, datasets):
            a = d[start:stop]
            if selection is not None:
                a = np.compress(selection, a, axis=0)
            if ra is None:
                # Size the output from the rows actually selected rather than
                # from the full dataset length, otherwise any start/stop/
                # condition selection cannot be assigned into the output.
                # All datasets have equal length, so every column yields the
                # same number of rows.
                ra = np.empty(a.shape[0], dtype=dtype)
            ra[n] = a

        return ra

    finally:
        if h5f is not None:
            h5f.close()
Ejemplo n.º 60
0
def watterson_theta(pos, ac, start=None, stop=None, is_accessible=None):
    """Calculate Watterson's estimator of theta, per base, over a single
    genomic region.

    The number of segregating variants in the region is divided by the
    (n-1)th harmonic number, where n is the largest per-variant total allele
    count, and then by the number of (accessible) bases.

    Parameters
    ----------

    pos : array_like, int, shape (n_items,)
        Variant positions, using 1-based coordinates, in ascending order.
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    start : int, optional
        First position of the region (1-based). Defaults to the first
        variant position.
    stop : int, optional
        Last position of the region (1-based, inclusive). Defaults to the
        last variant position.
    is_accessible : array_like, bool, shape (len(contig),), optional
        Boolean array indicating accessibility status for all positions in the
        chromosome/contig. If given, only accessible bases within the region
        are counted in the denominator.

    Returns
    -------

    theta_hat_w : float
        Watterson's estimator (theta hat per base).

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1]],
    ...                          [[0, 0], [1, 1]],
    ...                          [[0, 1], [1, 1]],
    ...                          [[1, 1], [1, 1]],
    ...                          [[0, 0], [1, 2]],
    ...                          [[0, 1], [1, 2]],
    ...                          [[0, 1], [-1, -1]],
    ...                          [[-1, -1], [-1, -1]]])
    >>> ac = g.count_alleles()
    >>> pos = [2, 4, 7, 14, 15, 18, 19, 25, 27]
    >>> theta_hat_w = allel.stats.watterson_theta(pos, ac, start=1, stop=31)
    >>> theta_hat_w
    0.10557184750733138

    """

    # normalise inputs; wrap plain arrays so count_segregating is available
    if not isinstance(pos, SortedIndex):
        pos = SortedIndex(pos, copy=False)
    is_accessible = asarray_ndim(is_accessible, 1, allow_none=True)
    if not hasattr(ac, "count_segregating"):
        ac = AlleleCountsArray(ac, copy=False)

    # restrict to the requested region, if any bound was given
    if not (start is None and stop is None):
        loc = pos.locate_range(start, stop)
        pos, ac = pos[loc], ac[loc]

    # missing bounds default to the outermost variant positions
    start = pos[0] if start is None else start
    stop = pos[-1] if stop is None else stop

    # number of segregating variants in the region
    n_segregating = ac.count_segregating()

    # number of chromosomes sampled, assumed constant across variants, so the
    # largest per-variant allele total is used
    n_chromosomes = ac.sum(axis=1).max()

    # (n-1)th harmonic number: 1/1 + 1/2 + ... + 1/(n-1)
    harmonic = np.sum(1 / np.arange(1, n_chromosomes))

    # estimator for the whole region
    theta_abs = n_segregating / harmonic

    # size of the region in bases; positions are 1-based and inclusive, hence
    # the shift when slicing the 0-based accessibility mask
    if is_accessible is not None:
        n_bases = np.count_nonzero(is_accessible[start - 1 : stop])
    else:
        n_bases = stop - start + 1

    return theta_abs / n_bases