コード例 #1
0
ファイル: test_stats.py プロジェクト: oxpeter/scikit-allel
    def test_heterozygosity_expected(self):

        def refimpl(af, ploidy, fill=0):
            """Limited reference implementation for testing purposes."""

            # check allele frequencies sum to 1
            af_sum = np.sum(af, axis=1)

            # assume three alleles
            p = af[:, 0]
            q = af[:, 1]
            r = af[:, 2]

            out = 1 - p**ploidy - q**ploidy - r**ploidy
            with ignore_invalid():
                out[(af_sum < 1) | np.isnan(af_sum)] = fill

            return out

        # diploid
        g = GenotypeArray([[[0, 0], [0, 0]],
                           [[1, 1], [1, 1]],
                           [[1, 1], [2, 2]],
                           [[0, 0], [0, 1]],
                           [[0, 0], [0, 2]],
                           [[1, 1], [1, 2]],
                           [[0, 1], [0, 1]],
                           [[0, 1], [1, 2]],
                           [[0, 0], [-1, -1]],
                           [[0, 1], [-1, -1]],
                           [[-1, -1], [-1, -1]]], dtype='i1')
        expect1 = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, -1]
        af = g.count_alleles().to_frequencies()
        expect2 = refimpl(af, ploidy=g.ploidy, fill=-1)
        actual = allel.stats.heterozygosity_expected(af, ploidy=g.ploidy,
                                                     fill=-1)
        assert_array_close(expect1, actual)
        assert_array_close(expect2, actual)
        expect3 = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, 0]
        actual = allel.stats.heterozygosity_expected(af, ploidy=g.ploidy,
                                                     fill=0)
        assert_array_close(expect3, actual)

        # polyploid
        g = GenotypeArray([[[0, 0, 0], [0, 0, 0]],
                           [[1, 1, 1], [1, 1, 1]],
                           [[1, 1, 1], [2, 2, 2]],
                           [[0, 0, 0], [0, 0, 1]],
                           [[0, 0, 0], [0, 0, 2]],
                           [[1, 1, 1], [0, 1, 2]],
                           [[0, 0, 1], [0, 1, 1]],
                           [[0, 1, 1], [0, 1, 2]],
                           [[0, 0, 0], [-1, -1, -1]],
                           [[0, 0, 1], [-1, -1, -1]],
                           [[-1, -1, -1], [-1, -1, -1]]], dtype='i1')
        af = g.count_alleles().to_frequencies()
        expect = refimpl(af, ploidy=g.ploidy, fill=-1)
        actual = allel.stats.heterozygosity_expected(af, ploidy=g.ploidy,
                                                     fill=-1)
        assert_array_close(expect, actual)
コード例 #2
0
ファイル: hw.py プロジェクト: quank/scikit-allel
def inbreeding_coefficient(g, fill=np.nan):
    """Calculate the inbreeding coefficient for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where the expected heterozygosity is
        zero.

    Returns
    -------

    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----

    The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is
    the observed heterozygosity and *He* is the expected heterozygosity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # calculate observed and expected heterozygosity
    ho = heterozygosity_observed(g)
    af = g.count_alleles().to_frequencies()
    he = heterozygosity_expected(af, ploidy=g.shape[-1], fill=0)

    # calculate inbreeding coefficient, accounting for variants with no
    # expected heterozygosity
    with ignore_invalid():
        f = np.where(he > 0, 1 - (ho / he), fill)

    return f
コード例 #3
0
ファイル: hw.py プロジェクト: oxpeter/scikit-allel
def inbreeding_coefficient(g, fill=np.nan):
    """Calculate the inbreeding coefficient for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where the expected heterozygosity is
        zero.

    Returns
    -------

    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----

    The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is
    the observed heterozygosity and *He* is the expected heterozygosity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # calculate observed and expected heterozygosity
    ho = heterozygosity_observed(g)
    af = g.count_alleles().to_frequencies()
    he = heterozygosity_expected(af, ploidy=g.shape[-1], fill=0)

    # calculate inbreeding coefficient, accounting for variants with no
    # expected heterozygosity
    with ignore_invalid():
        f = np.where(he > 0, 1 - (ho / he), fill)

    return f
コード例 #4
0
ファイル: fst.py プロジェクト: yangmqglobe/scikit-allel
def _weir_cockerham_fst(g, subpops, max_allele):

    # check inputs
    g = GenotypeArray(g, copy=False)
    n_variants, n_samples, ploidy = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    r = len(subpops)
    n_populations = r
    debug('r: %r', r)

    # count alleles within each subpopulation
    ac = [g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops]

    # stack allele counts from each sub-population into a single array
    ac = np.dstack(ac)
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # count number of alleles called within each population by summing
    # allele counts along the alleles dimension
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # compute number of individuals sampled from each population
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # compute the total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants, )
    debug('n_total: %s, %r', n_total.shape, n_total)

    # compute the average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants, )
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # compute the term n sub C incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == (n_variants, )
    debug('n_C: %s, %r', n_C.shape, n_C)

    # compute allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # compute the average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # add in some extra dimensions to enable broadcasting
    n_bar = n_bar[:, np.newaxis]
    n_C = n_C[:, np.newaxis]
    n = n[:, np.newaxis, :]
    p_bar = p_bar[:, :, np.newaxis]

    # compute the sample variance of allele frequencies over populations
    s_squared = (np.sum(n * ((p - p_bar)**2), axis=2) / (n_bar * (r - 1)))
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # remove extra dimensions for correct broadcasting
    p_bar = p_bar[:, :, 0]

    # compute the average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = [
        gs.count_het(allele=allele, axis=1) / n_total
        for allele in range(n_alleles)
    ]
    h_bar = np.column_stack(h_bar)
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # component of variance between populations
    a = ((n_bar / n_C) * (s_squared -
                          ((1 / (n_bar - 1)) *
                           ((p_bar * (1 - p_bar)) - ((r - 1) * s_squared / r) -
                            (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar / (n_bar - 1)) * ((p_bar *
                                   (1 - p_bar)) - ((r - 1) * s_squared / r) -
                                  (((2 * n_bar) - 1) * h_bar / (4 * n_bar))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c
コード例 #5
0
ファイル: fst.py プロジェクト: podpearson/scikit-allel
def _weir_cockerham_fst(g, subpops, max_allele):

    # check inputs
    g = GenotypeArray(g, copy=False)
    n_variants, n_samples, ploidy = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    r = len(subpops)
    n_populations = r
    debug('r: %r', r)

    # count alleles within each subpopulation
    ac = [g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops]

    # stack allele counts from each sub-population into a single array
    ac = np.dstack(ac)
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # count number of alleles called within each population by summing
    # allele counts along the alleles dimension
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # compute number of individuals sampled from each population
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # compute the total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants,)
    debug('n_total: %s, %r', n_total.shape, n_total)

    # compute the average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants,)
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # compute the term n sub C incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == (n_variants,)
    debug('n_C: %s, %r', n_C.shape, n_C)

    # compute allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # compute the average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # add in some extra dimensions to enable broadcasting
    n_bar = n_bar[:, np.newaxis]
    n_C = n_C[:, np.newaxis]
    n = n[:, np.newaxis, :]
    p_bar = p_bar[:, :, np.newaxis]

    # compute the sample variance of allele frequencies over populations
    s_squared = (
        np.sum(n * ((p - p_bar) ** 2),
               axis=2) /
        (n_bar * (r - 1))
    )
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # remove extra dimensions for correct broadcasting
    p_bar = p_bar[:, :, 0]

    # compute the average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = [gs.count_het(allele=allele, axis=1) / n_total
             for allele in range(n_alleles)]
    h_bar = np.column_stack(h_bar)
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # component of variance between populations
    a = ((n_bar / n_C) *
         (s_squared -
          ((1 / (n_bar - 1)) *
           ((p_bar * (1 - p_bar)) -
            ((r - 1) * s_squared / r) -
            (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar / (n_bar - 1)) *
         ((p_bar * (1 - p_bar)) -
          ((r - 1) * s_squared / r) -
          (((2 * n_bar) - 1) * h_bar / (4 * n_bar))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c