コード例 #1
0
ファイル: fst.py プロジェクト: yangmqglobe/scikit-allel
def weir_cockerham_fst(g, subpops, max_allele=None, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[0.        , 0.        , 0.        ],
               [0.5       , 0.5       , 0.        ],
               [0.        , 0.        , 0.        ],
               [0.125     , 0.25      , 0.125     ],
               [0.16666667, 0.16666667, 0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.36809058868914e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]
    shape = (n_variants, max_allele + 1)
    a = np.zeros(shape, dtype='f8')
    b = np.zeros(shape, dtype='f8')
    c = np.zeros(shape, dtype='f8')
    for i in range(0, n_variants, blen):
        j = min(n_variants, i + blen)
        gb = g[i:j]
        ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
        a[i:j] = ab
        b[i:j] = bb
        c[i:j] = cb

    return a, b, c
コード例 #2
0
ファイル: mendel.py プロジェクト: yangmqglobe/scikit-allel
def mendel_errors(parent_genotypes, progeny_genotypes):
    """Locate genotype calls not consistent with Mendelian transmission of
    alleles.

    Parameters
    ----------
    parent_genotypes : array_like, int, shape (n_variants, 2, 2)
        Genotype calls for the two parents.
    progeny_genotypes : array_like, int, shape (n_variants, n_progeny, 2)
        Genotype calls for the progeny.

    Returns
    -------
    me : ndarray, int, shape (n_variants, n_progeny)
        Count of Mendel errors for each progeny genotype call.

    Examples
    --------
    The following are all consistent with Mendelian transmission. Note that a
    value of 0 is returned for missing calls::

        >>> import allel
        >>> import numpy as np
        >>> genotypes = np.array([
        ...     # aa x aa -> aa
        ...     [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [1, 1], [1, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[2, 2], [2, 2], [2, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x ab -> aa or ab
        ...     [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [0, 2], [0, 0], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 1], [1, 1], [0, 1], [-1, -1], [-1, -1]],
        ...     # aa x bb -> ab
        ...     [[0, 0], [1, 1], [0, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [2, 2], [0, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [2, 2], [1, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x bc -> ab or ac
        ...     [[0, 0], [1, 2], [0, 1], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 2], [0, 1], [1, 2], [-1, -1], [-1, -1]],
        ...     # ab x ab -> aa or ab or bb
        ...     [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[1, 2], [1, 2], [1, 1], [1, 2], [2, 2], [-1, -1]],
        ...     [[0, 2], [0, 2], [0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     # ab x bc -> ab or ac or bb or bc
        ...     [[0, 1], [1, 2], [0, 1], [0, 2], [1, 1], [1, 2]],
        ...     [[0, 1], [0, 2], [0, 0], [0, 1], [0, 1], [1, 2]],
        ...     # ab x cd -> ac or ad or bc or bd
        ...     [[0, 1], [2, 3], [0, 2], [0, 3], [1, 2], [1, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0]])

    The following are cases of 'non-parental' inheritance where one or two
    alleles are found in the progeny that are not present in either parent.
    Note that the number of errors may be 1 or 2 depending on the number of
    non-parental alleles::

        >>> genotypes = np.array([
        ...     # aa x aa -> ab or ac or bb or cc
        ...     [[0, 0], [0, 0], [0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[1, 1], [1, 1], [0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[2, 2], [2, 2], [0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # aa x ab -> ac or bc or cc
        ...     [[0, 0], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [0, 1], [1, 2], [0, 2], [2, 2], [2, 2]],
        ...     # aa x bb -> ac or bc or cc
        ...     [[0, 0], [1, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [2, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [2, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x ab -> ac or bc or cc
        ...     [[0, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 2], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 2], [1, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x bc -> ad or bd or cd or dd
        ...     [[0, 1], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 1], [0, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 2], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     # ab x cd -> ae or be or ce or de
        ...     [[0, 1], [2, 3], [0, 4], [1, 4], [2, 4], [3, 4]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 1]])

    The following are cases of 'hemi-parental' inheritance, where progeny
    appear to have inherited two copies of an allele found only once in one of
    the parents::

        >>> genotypes = np.array([
        ...     # aa x ab -> bb
        ...     [[0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     [[1, 1], [0, 1], [0, 0], [-1, -1]],
        ...     # ab x bc -> aa or cc
        ...     [[0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # ab x cd -> aa or bb or cc or dd
        ...     [[0, 1], [2, 3], [0, 0], [1, 1]],
        ...     [[0, 1], [2, 3], [2, 2], [3, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 0],
               [1, 0],
               [1, 0],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    The following are cases of 'uni-parental' inheritance, where progeny
    appear to have inherited both alleles from a single parent::

        >>> genotypes = np.array([
        ...     # aa x bb -> aa or bb
        ...     [[0, 0], [1, 1], [0, 0], [1, 1]],
        ...     [[0, 0], [2, 2], [0, 0], [2, 2]],
        ...     [[1, 1], [2, 2], [1, 1], [2, 2]],
        ...     # aa x bc -> aa or bc
        ...     [[0, 0], [1, 2], [0, 0], [1, 2]],
        ...     [[1, 1], [0, 2], [1, 1], [0, 2]],
        ...     # ab x cd -> ab or cd
        ...     [[0, 1], [2, 3], [0, 1], [2, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    """

    # setup
    parent_genotypes = GenotypeArray(parent_genotypes)
    progeny_genotypes = GenotypeArray(progeny_genotypes)
    check_ploidy(parent_genotypes.ploidy, 2)
    check_ploidy(progeny_genotypes.ploidy, 2)

    # transform into per-call allele counts
    max_allele = max(parent_genotypes.max(), progeny_genotypes.max())
    parent_gc = parent_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1')
    progeny_gc = progeny_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1')

    # detect nonparental and hemiparental inheritance by comparing allele
    # counts between parents and progeny
    max_progeny_gc = parent_gc.clip(max=1).sum(axis=1)
    max_progeny_gc = max_progeny_gc[:, np.newaxis, :]
    me = (progeny_gc - max_progeny_gc).clip(min=0).sum(axis=2)

    # detect uniparental inheritance by finding cases where no alleles are
    # shared between parents, then comparing progeny allele counts to each
    # parent
    p1_gc = parent_gc[:, 0, np.newaxis, :]
    p2_gc = parent_gc[:, 1, np.newaxis, :]
    # find variants where parents don't share any alleles
    is_shared_allele = (p1_gc > 0) & (p2_gc > 0)
    no_shared_alleles = ~np.any(is_shared_allele, axis=2)
    # find calls where progeny genotype is identical to one or the other parent
    me[no_shared_alleles &
       (np.all(progeny_gc == p1_gc, axis=2) |
        np.all(progeny_gc == p2_gc, axis=2))] = 1

    # retrofit where either or both parent has a missing call
    me[np.any(parent_genotypes.is_missing(), axis=1)] = 0

    return me
コード例 #3
0
ファイル: fst.py プロジェクト: podpearson/scikit-allel
def weir_cockerham_fst(g, subpops, max_allele=None, chunked=False, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    if chunked:
        # use a block-wise implementation
        blen = get_blen_array(g, blen)
        n_variants = g.shape[0]
        shape = (n_variants, max_allele + 1)
        a = np.zeros(shape, dtype='f8')
        b = np.zeros(shape, dtype='f8')
        c = np.zeros(shape, dtype='f8')
        for i in range(0, n_variants, blen):
            j = min(n_variants, i+blen)
            gb = g[i:j]
            ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
            a[i:j] = ab
            b[i:j] = bb
            c[i:j] = cb

    else:
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)

    return a, b, c