Example #1
def locate_unlinked(gn, size=100, step=20, threshold=.1, chunked=False,
                    blen=None):
    """Locate variants in approximate linkage equilibrium, where r**2 is
    below the given `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Window size (number of variants).
    step : int
        Number of variants to advance to the next window.
    threshold : float
        Maximum value of r**2 to include variants.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array where True items locate variants in approximate
        linkage equilibrium.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """

    from allel.opt.stats import gn_locate_unlinked_int8

    # check inputs
    if not hasattr(gn, 'shape') or not hasattr(gn, 'dtype'):
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    # setup output
    loc = np.ones(gn.shape[0], dtype='u1')

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(gn, blen)
    blen = max(blen, 10*size)  # avoid too small chunks
    n_variants = gn.shape[0]
    for i in range(0, n_variants, blen):
        # N.B., ensure overlap with next window
        j = min(n_variants, i+blen+size)
        gnb = np.asarray(gn[i:j], dtype='i1')
        locb = loc[i:j]
        gn_locate_unlinked_int8(gnb, locb, size, step, threshold)

    return loc.astype('b1')
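A minimal usage sketch (not part of the library's docstring), assuming the function above is exposed as allel.locate_unlinked; the genotype matrix is simulated purely for illustration:

import numpy as np
import allel

# simulate alt-allele counts for 1000 biallelic variants x 50 diploid samples
rng = np.random.default_rng(42)
gn = rng.integers(0, 3, size=(1000, 50), dtype='i1')

# boolean mask locating variants in approximate linkage equilibrium
loc = allel.locate_unlinked(gn, size=100, step=20, threshold=.1)

# LD-prune by keeping only the unlinked variants
gn_pruned = gn[loc]
print(gn_pruned.shape)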
Example #2
def locate_unlinked(gn, size=100, step=20, threshold=.1, blen=None):
    """Locate variants in approximate linkage equilibrium, where r**2 is
    below the given `threshold`.

    Parameters
    ----------
    gn : array_like, int8, shape (n_variants, n_samples)
        Diploid genotypes at biallelic variants, coded as the number of
        alternate alleles per call (i.e., 0 = hom ref, 1 = het, 2 = hom alt).
    size : int
        Window size (number of variants).
    step : int
        Number of variants to advance to the next window.
    threshold : float
        Maximum value of r**2 to include variants.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants)
        Boolean array where True items locate variants in approximate
        linkage equilibrium.

    Notes
    -----
    The value of r**2 between each pair of variants is calculated using the
    method of Rogers and Huff (2008).

    """

    # check inputs
    if not hasattr(gn, 'shape') or not hasattr(gn, 'dtype'):
        gn = np.asarray(gn, dtype='i1')
    if gn.ndim != 2:
        raise ValueError('gn must have two dimensions')

    # setup output
    loc = np.ones(gn.shape[0], dtype='u1')

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(gn, blen)
    blen = max(blen, 10 * size)  # avoid too small chunks
    n_variants = gn.shape[0]
    for i in range(0, n_variants, blen):
        # N.B., ensure overlap with next window
        j = min(n_variants, i + blen + size)
        gnb = np.asarray(gn[i:j], dtype='i1')
        gnb = memoryview_safe(gnb)
        locb = loc[i:j]
        gn_locate_unlinked_int8(gnb, locb, size, step, threshold)

    return loc.astype('b1')
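In practice the function is often applied in several passes until a pass removes no further variants. A hedged sketch of such a loop; the helper name ld_prune, its default arguments, and the top-level allel.locate_unlinked entry point are assumptions made for illustration:

import numpy as np
import allel

def ld_prune(gn, size=100, step=20, threshold=.1, n_iter=5):
    # repeat LD pruning until a pass removes nothing (or n_iter is reached)
    for _ in range(n_iter):
        loc = allel.locate_unlinked(gn, size=size, step=step, threshold=threshold)
        if np.count_nonzero(~loc) == 0:
            break
        gn = gn[loc]
    return gn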
Example #3
def pairwise_distance(x, metric, chunked=False, blen=None):
    """Compute pairwise distance between individuals (e.g., samples or
    haplotypes).

    Parameters
    ----------
    x : array_like, shape (n, m, ...)
        Array of m observations (e.g., samples or haplotypes) in a space
        with n dimensions (e.g., variants). Note that the order of the first
        two dimensions is **swapped** compared to what is expected by
        scipy.spatial.distance.pdist.
    metric : string or function
        Distance metric. See documentation for the function
        :func:`scipy.spatial.distance.pdist` for a list of built-in
        distance metrics.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory. This means that a distance matrix will be
        calculated for each block of the input array, and the results will
        be summed to produce the final output. For some distance metrics
        this will return a different result from the standard implementation.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    dist : ndarray, shape (m * (m - 1) / 2,)
        Distance matrix in condensed form.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 1], [1, 1], [1, 2]],
    ...                          [[0, 2], [2, 2], [-1, -1]]])
    >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock')
    >>> d
    array([ 3.,  4.,  3.])
    >>> import scipy.spatial
    >>> scipy.spatial.distance.squareform(d)
    array([[ 0.,  3.,  4.],
           [ 3.,  0.,  3.],
           [ 4.,  3.,  0.]])

    """

    import scipy.spatial

    # check inputs
    if not hasattr(x, "ndim"):
        x = np.asarray(x)
    if x.ndim < 2:
        raise ValueError("array with at least 2 dimensions expected")

    if x.ndim == 2:
        # use scipy to calculate distance, it's most efficient

        def f(b):

            # transpose as pdist expects (m, n) for m observations in an
            # n-dimensional space
            t = b.T

            # compute the distance matrix
            return scipy.spatial.distance.pdist(t, metric=metric)

    else:
        # use our own implementation, it handles multidimensional observations

        def f(b):
            return pdist(b, metric=metric)

    if chunked:
        # use block-wise implementation
        blen = get_blen_array(x, blen)
        dist = None
        for i in range(0, x.shape[0], blen):
            j = min(x.shape[0], i + blen)
            block = x[i:j]
            if dist is None:
                dist = f(block)
            else:
                dist += f(block)

    else:
        # standard implementation
        dist = f(x)

    return dist
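To illustrate the caveat about the chunked option, a small sketch with made-up data: 'cityblock' distances are plain sums over variants, so summing per-block distance matrices reproduces the standard result exactly, whereas 'euclidean' involves a square root per block and does not:

import numpy as np
import allel

rng = np.random.default_rng(0)
x = rng.integers(0, 3, size=(200, 10))  # (n_variants, n_samples)

# cityblock is additive over variants, so block-wise summation is exact
d_std = allel.stats.pairwise_distance(x, metric='cityblock')
d_chu = allel.stats.pairwise_distance(x, metric='cityblock', chunked=True, blen=50)
print(np.allclose(d_std, d_chu))   # True

# euclidean takes a square root within each block, so the block sums differ
e_std = allel.stats.pairwise_distance(x, metric='euclidean')
e_chu = allel.stats.pairwise_distance(x, metric='euclidean', chunked=True, blen=50)
print(np.allclose(e_std, e_chu))   # generally False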
Example #4
def weir_cockerham_fst(g, subpops, max_allele=None, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[0.        , 0.        , 0.        ],
               [0.5       , 0.5       , 0.        ],
               [0.        , 0.        , 0.        ],
               [0.125     , 0.25      , 0.125     ],
               [0.16666667, 0.16666667, 0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> import numpy as np
        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.36809058868914e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]
    shape = (n_variants, max_allele + 1)
    a = np.zeros(shape, dtype='f8')
    b = np.zeros(shape, dtype='f8')
    c = np.zeros(shape, dtype='f8')
    for i in range(0, n_variants, blen):
        j = min(n_variants, i + blen)
        gb = g[i:j]
        ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
        a[i:j] = ab
        b[i:j] = bb
        c[i:j] = cb

    return a, b, c
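A short follow-on sketch reusing the docstring's own example data, showing one way to compute per-variant Fst from the returned components and drop the undefined (nan) estimate at the monomorphic variant; the use of np.errstate and np.isnan here is just one reasonable choice, not part of the library API:

import numpy as np
import allel

g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
     [[0, 1], [0, 1], [0, 1], [0, 1]],
     [[0, 0], [0, 0], [0, 0], [0, 0]],
     [[0, 1], [1, 2], [1, 1], [2, 2]],
     [[0, 0], [1, 1], [0, 1], [-1, -1]]]
subpops = [[0, 1], [2, 3]]
a, b, c = allel.weir_cockerham_fst(g, subpops)

# per-variant Fst, summing components over alleles; the monomorphic variant yields nan
with np.errstate(invalid='ignore'):
    fst_variant = a.sum(axis=1) / (a.sum(axis=1) + b.sum(axis=1) + c.sum(axis=1))

# keep only the variants where the estimate is defined
is_defined = ~np.isnan(fst_variant)
print(fst_variant[is_defined])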
Example #5
def pairwise_distance(x, metric, chunked=False, blen=None):
    """Compute pairwise distance between individuals (e.g., samples or
    haplotypes).

    Parameters
    ----------
    x : array_like, shape (n, m, ...)
        Array of m observations (e.g., samples or haplotypes) in a space
        with n dimensions (e.g., variants). Note that the order of the first
        two dimensions is **swapped** compared to what is expected by
        scipy.spatial.distance.pdist.
    metric : string or function
        Distance metric. See documentation for the function
        :func:`scipy.spatial.distance.pdist` for a list of built-in
        distance metrics.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory. This means that a distance matrix will be
        calculated for each block of the input array, and the results will
        be summed to produce the final output. For some distance metrics
        this will return a different result from the standard implementation.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    dist : ndarray, shape (m * (m - 1) / 2,)
        Distance matrix in condensed form.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 1], [1, 1], [1, 2]],
    ...                          [[0, 2], [2, 2], [-1, -1]]])
    >>> d = allel.stats.pairwise_distance(g.to_n_alt(), metric='cityblock')
    >>> d
    array([ 3.,  4.,  3.])
    >>> import scipy.spatial
    >>> scipy.spatial.distance.squareform(d)
    array([[ 0.,  3.,  4.],
           [ 3.,  0.,  3.],
           [ 4.,  3.,  0.]])

    """

    import scipy.spatial

    # check inputs
    if not hasattr(x, 'ndim'):
        x = np.asarray(x)
    if x.ndim < 2:
        raise ValueError('array with at least 2 dimensions expected')

    if x.ndim == 2:
        # use scipy to calculate distance, it's most efficient

        def f(b):

            # transpose as pdist expects (m, n) for m observations in an
            # n-dimensional space
            t = b.T

            # compute the distance matrix
            return scipy.spatial.distance.pdist(t, metric=metric)

    else:
        # use our own implementation, it handles multidimensional observations

        def f(b):
            return pdist(b, metric=metric)

    if chunked:
        # use block-wise implementation
        blen = get_blen_array(x, blen)
        dist = None
        for i in range(0, x.shape[0], blen):
            j = min(x.shape[0], i + blen)
            block = x[i:j]
            if dist is None:
                dist = f(block)
            else:
                dist += f(block)

    else:
        # standard implementation
        dist = f(x)

    return dist
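As the docstring notes, metric may also be a callable, which for 2-D input is passed straight through to scipy.spatial.distance.pdist. A hedged sketch with an invented toy metric and simulated data:

import numpy as np
import allel

rng = np.random.default_rng(1)
gn = rng.integers(0, 3, size=(100, 8))  # (n_variants, n_samples)

def mean_abs_diff(u, v):
    # toy metric: mean absolute genotype difference between two samples
    return np.mean(np.abs(u - v))

d = allel.stats.pairwise_distance(gn, metric=mean_abs_diff)
print(d.shape)  # condensed form: 8 * 7 // 2 == 28 pairs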
Example #6
def weir_cockerham_fst(g, subpops, max_allele=None, chunked=False, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> import numpy as np
        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    if chunked:
        # use a block-wise implementation
        blen = get_blen_array(g, blen)
        n_variants = g.shape[0]
        shape = (n_variants, max_allele + 1)
        a = np.zeros(shape, dtype='f8')
        b = np.zeros(shape, dtype='f8')
        c = np.zeros(shape, dtype='f8')
        for i in range(0, n_variants, blen):
            j = min(n_variants, i+blen)
            gb = g[i:j]
            ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
            a[i:j] = ab
            b[i:j] = bb
            c[i:j] = cb

    else:
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)

    return a, b, c
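Unlike pairwise_distance, chunking here does not change the result: each block's per-variant components are simply written into the output arrays, so chunked and standard runs should agree exactly. A quick check reusing the docstring's example data, with blen=2 chosen arbitrarily small for demonstration:

import numpy as np
import allel

g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
     [[0, 1], [0, 1], [0, 1], [0, 1]],
     [[0, 0], [0, 0], [0, 0], [0, 0]],
     [[0, 1], [1, 2], [1, 1], [2, 2]],
     [[0, 0], [1, 1], [0, 1], [-1, -1]]]
subpops = [[0, 1], [2, 3]]

a1, b1, c1 = allel.stats.weir_cockerham_fst(g, subpops)
a2, b2, c2 = allel.stats.weir_cockerham_fst(g, subpops, chunked=True, blen=2)
print(np.array_equal(a1, a2), np.array_equal(b1, b2), np.array_equal(c1, c2))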