Python GenotypeArrayの例、allel.model.ndarray.GenotypeArray Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_stats.py プロジェクト: oxpeter/scikit-allel

    def test_heterozygosity_expected(self):

        def refimpl(af, ploidy, fill=0):
            """Limited reference implementation for testing purposes."""

            # check allele frequencies sum to 1
            af_sum = np.sum(af, axis=1)

            # assume three alleles
            p = af[:, 0]
            q = af[:, 1]
            r = af[:, 2]

            out = 1 - p**ploidy - q**ploidy - r**ploidy
            with ignore_invalid():
                out[(af_sum < 1) | np.isnan(af_sum)] = fill

            return out

        # diploid
        g = GenotypeArray([[[0, 0], [0, 0]],
                           [[1, 1], [1, 1]],
                           [[1, 1], [2, 2]],
                           [[0, 0], [0, 1]],
                           [[0, 0], [0, 2]],
                           [[1, 1], [1, 2]],
                           [[0, 1], [0, 1]],
                           [[0, 1], [1, 2]],
                           [[0, 0], [-1, -1]],
                           [[0, 1], [-1, -1]],
                           [[-1, -1], [-1, -1]]], dtype='i1')
        expect1 = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, -1]
        af = g.count_alleles().to_frequencies()
        expect2 = refimpl(af, ploidy=g.ploidy, fill=-1)
        actual = allel.stats.heterozygosity_expected(af, ploidy=g.ploidy,
                                                     fill=-1)
        assert_array_close(expect1, actual)
        assert_array_close(expect2, actual)
        expect3 = [0, 0, 0.5, .375, .375, .375, .5, .625, 0, .5, 0]
        actual = allel.stats.heterozygosity_expected(af, ploidy=g.ploidy,
                                                     fill=0)
        assert_array_close(expect3, actual)

        # polyploid
        g = GenotypeArray([[[0, 0, 0], [0, 0, 0]],
                           [[1, 1, 1], [1, 1, 1]],
                           [[1, 1, 1], [2, 2, 2]],
                           [[0, 0, 0], [0, 0, 1]],
                           [[0, 0, 0], [0, 0, 2]],
                           [[1, 1, 1], [0, 1, 2]],
                           [[0, 0, 1], [0, 1, 1]],
                           [[0, 1, 1], [0, 1, 2]],
                           [[0, 0, 0], [-1, -1, -1]],
                           [[0, 0, 1], [-1, -1, -1]],
                           [[-1, -1, -1], [-1, -1, -1]]], dtype='i1')
        af = g.count_alleles().to_frequencies()
        expect = refimpl(af, ploidy=g.ploidy, fill=-1)
        actual = allel.stats.heterozygosity_expected(af, ploidy=g.ploidy,
                                                     fill=-1)
        assert_array_close(expect, actual)

コード例 #2

0

ファイルを表示

ファイル: test_model_ndarray.py プロジェクト: hardingnj/scikit-allel

    def test_haploidify_samples(self):

        # diploid
        g = GenotypeArray([[[0, 1], [2, 3]],
                           [[4, 5], [6, 7]],
                           [[8, 9], [10, 11]]], dtype='i1')
        h = g.haploidify_samples()
        eq(2, h.ndim)
        eq(3, h.n_variants)
        eq(2, h.n_haplotypes)
        eq(np.int8, h.dtype)
        for i in range(g.n_variants):
            for j in range(g.n_samples):
                self.assertIn(h[i, j], set(g[i, j]))
        print(repr(h))

        # triploid
        g = GenotypeArray([[[0, 1, 2], [3, 4, 5]],
                           [[6, 7, 8], [9, 10, 11]],
                           [[12, 13, 14], [15, 16, 17]]], dtype='i1')
        h = g.haploidify_samples()
        eq(2, h.ndim)
        eq(3, h.n_variants)
        eq(2, h.n_haplotypes)
        eq(np.int8, h.dtype)
        for i in range(g.n_variants):
            for j in range(g.n_samples):
                self.assertIn(h[i, j], set(g[i, j]))

コード例 #3

0

ファイルを表示

ファイル: hw.py プロジェクト: oxpeter/scikit-allel

def inbreeding_coefficient(g, fill=np.nan):
    """Calculate the inbreeding coefficient for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where the expected heterozygosity is
        zero.

    Returns
    -------

    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----

    The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is
    the observed heterozygosity and *He* is the expected heterozygosity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # calculate observed and expected heterozygosity
    ho = heterozygosity_observed(g)
    af = g.count_alleles().to_frequencies()
    he = heterozygosity_expected(af, ploidy=g.shape[-1], fill=0)

    # calculate inbreeding coefficient, accounting for variants with no
    # expected heterozygosity
    with ignore_invalid():
        f = np.where(he > 0, 1 - (ho / he), fill)

    return f

コード例 #4

0

ファイルを表示

ファイル: hw.py プロジェクト: quank/scikit-allel

def inbreeding_coefficient(g, fill=np.nan):
    """Calculate the inbreeding coefficient for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where the expected heterozygosity is
        zero.

    Returns
    -------

    f : ndarray, float, shape (n_variants,)
        Inbreeding coefficient.

    Notes
    -----

    The inbreeding coefficient is calculated as *1 - (Ho/He)* where *Ho* is
    the observed heterozygosity and *He* is the expected heterozygosity.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.inbreeding_coefficient(g)
    array([        nan,  0.33333333,  1.        , -0.33333333])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # calculate observed and expected heterozygosity
    ho = heterozygosity_observed(g)
    af = g.count_alleles().to_frequencies()
    he = heterozygosity_expected(af, ploidy=g.shape[-1], fill=0)

    # calculate inbreeding coefficient, accounting for variants with no
    # expected heterozygosity
    with ignore_invalid():
        f = np.where(he > 0, 1 - (ho / he), fill)

    return f

コード例 #5

0

ファイルを表示

    def test_from_hdf5_condition(self):

        # setup HDF5 file
        node_path = 'test'
        tf = tempfile.NamedTemporaryFile(delete=False)
        file_path = tf.name
        tf.close()
        with h5py.File(file_path, mode='w') as h5f:
            h5f.create_dataset(node_path,
                               data=diploid_genotype_data,
                               chunks=(2, 3, 2))

        # selection
        condition = [False, True, False, True, False]

        # file and node path
        g = GenotypeCArray.from_hdf5(file_path, node_path, condition=condition)
        expect = GenotypeArray(diploid_genotype_data).compress(condition,
                                                               axis=0)
        aeq(expect, g)

        # dataset
        with h5py.File(file_path, mode='r') as h5f:
            dataset = h5f[node_path]
            g = GenotypeCArray.from_hdf5(dataset, condition=condition)
            aeq(expect, g)

コード例 #6

0

ファイルを表示

    def test_slice_types(self):

        g = GenotypeArray(diploid_genotype_data, dtype='i1')

        # row slice
        s = g[1:]
        assert_is_instance(s, GenotypeArray)

        # col slice
        s = g[:, 1:]
        assert_is_instance(s, GenotypeArray)

        # row index
        s = g[0]
        assert_is_instance(s, np.ndarray)
        assert_not_is_instance(s, GenotypeArray)

        # col index
        s = g[:, 0]
        assert_is_instance(s, np.ndarray)
        assert_not_is_instance(s, GenotypeArray)

        # ploidy index
        s = g[:, :, 0]
        assert_is_instance(s, np.ndarray)
        assert_not_is_instance(s, GenotypeArray)

        # item
        s = g[0, 0, 0]
        assert_is_instance(s, np.int8)
        assert_not_is_instance(s, GenotypeArray)

コード例 #7

0

ファイルを表示

ファイル: hw.py プロジェクト: oxpeter/scikit-allel

def heterozygosity_observed(g, fill=np.nan):
    """Calculate the rate of observed heterozygosity for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where all calls are missing.

    Returns
    -------

    ho : ndarray, float, shape (n_variants,)
        Observed heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.heterozygosity_observed(g)
    array([ 0.        ,  0.33333333,  0.        ,  0.5       ])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # count hets
    n_het = np.asarray(g.count_het(axis=1))
    n_called = np.asarray(g.count_called(axis=1))

    # calculate rate of observed heterozygosity, accounting for variants
    # where all calls are missing
    with ignore_invalid():
        ho = np.where(n_called > 0, n_het / n_called, fill)

    return ho

コード例 #8

0

ファイルを表示

ファイル: hw.py プロジェクト: quank/scikit-allel

def heterozygosity_observed(g, fill=np.nan):
    """Calculate the rate of observed heterozygosity for each variant.

    Parameters
    ----------

    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    fill : float, optional
        Use this value for variants where all calls are missing.

    Returns
    -------

    ho : ndarray, float, shape (n_variants,)
        Observed heterozygosity

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [0, 0]],
    ...                          [[0, 0], [0, 1], [1, 1]],
    ...                          [[0, 0], [1, 1], [2, 2]],
    ...                          [[1, 1], [1, 2], [-1, -1]]])
    >>> allel.stats.heterozygosity_observed(g)
    array([ 0.        ,  0.33333333,  0.        ,  0.5       ])

    """

    # check inputs
    if not hasattr(g, 'count_het') or not hasattr(g, 'count_called'):
        g = GenotypeArray(g, copy=False)

    # count hets
    n_het = np.asarray(g.count_het(axis=1))
    n_called = np.asarray(g.count_called(axis=1))

    # calculate rate of observed heterozygosity, accounting for variants
    # where all calls are missing
    with ignore_invalid():
        ho = np.where(n_called > 0, n_het / n_called, fill)

    return ho

コード例 #9

0

ファイルを表示

ファイル: mendel.py プロジェクト: yangmqglobe/scikit-allel

def phase_by_transmission(g, window_size, copy=True):
    """Phase genotypes in a trio or cross where possible using Mendelian
    transmission.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.
    window_size : int
        Number of previous heterozygous sites to include when phasing each
        parent. A number somewhere between 10 and 100 may be appropriate,
        depending on levels of heterozygosity and quality of data.
    copy : bool, optional
        If False, attempt to phase genotypes in-place. Note that this is
        only possible if the input array has int8 dtype, otherwise a copy is
        always made regardless of this parameter.

    Returns
    -------
    g : GenotypeArray
        Genotype array with progeny phased where possible.

    """

    # setup
    g = np.asarray(g, dtype='i1')
    g = GenotypeArray(g, copy=copy)
    g._values = memoryview_safe(g.values)
    check_ploidy(g.ploidy, 2)
    check_min_samples(g.n_samples, 3)

    # phase the progeny
    is_phased = _opt_phase_progeny_by_transmission(g.values)
    g.is_phased = np.asarray(is_phased).view(bool)

    # phase the parents
    _opt_phase_parents_by_transmission(g.values, is_phased, window_size)

    return g

コード例 #10

0

ファイルを表示

    def test_haploidify_samples(self):

        # diploid
        g = GenotypeArray([[[0, 1], [2, 3]],
                           [[4, 5], [6, 7]],
                           [[8, 9], [10, 11]]], dtype='i1')
        h = g.haploidify_samples()
        eq(2, h.ndim)
        eq(3, h.n_variants)
        eq(2, h.n_haplotypes)
        eq(np.int8, h.dtype)
        for i in range(g.n_variants):
            for j in range(g.n_samples):
                self.assertIn(h[i, j], set(g[i, j]))
        print(repr(h))

        # triploid
        g = GenotypeArray([[[0, 1, 2], [3, 4, 5]],
                           [[6, 7, 8], [9, 10, 11]],
                           [[12, 13, 14], [15, 16, 17]]], dtype='i1')
        h = g.haploidify_samples()
        eq(2, h.ndim)
        eq(3, h.n_variants)
        eq(2, h.n_haplotypes)
        eq(np.int8, h.dtype)
        for i in range(g.n_variants):
            for j in range(g.n_samples):
                self.assertIn(h[i, j], set(g[i, j]))

コード例 #11

0

ファイルを表示

ファイル: test_stats.py プロジェクト: oxpeter/scikit-allel

    def test_pairwise_distance_multidim(self):
        g = GenotypeArray([[[0, 0], [0, 0]],
                           [[1, 1], [1, 1]],
                           [[1, 1], [2, 2]],
                           [[0, 0], [0, 1]],
                           [[0, 0], [0, 2]],
                           [[1, 1], [1, 2]],
                           [[0, 1], [0, 1]],
                           [[0, 1], [1, 2]],
                           [[0, 0], [-1, -1]],
                           [[0, 1], [-1, -1]],
                           [[-1, -1], [-1, -1]]], dtype='i1')
        gac = g.to_allele_counts()

        def metric(ac1, ac2):
            mpd = allel.stats.mean_pairwise_difference_between(ac1, ac2,
                                                               fill=0)
            return mpd.sum()

        expect = [
            allel.stats.mean_pairwise_difference_between(gac[:, 0], gac[:, 1],
                                                         fill=0).sum()]
        actual = allel.stats.pairwise_distance(gac, metric)
        aeq(expect, actual)

コード例 #12

0

ファイルを表示

    def test_constructor(self):

        # missing data arg
        with assert_raises(TypeError):
            # noinspection PyArgumentList
            GenotypeArray()

        # data has wrong dtype
        data = 'foo bar'
        with assert_raises(TypeError):
            GenotypeArray(data)

        # data has wrong dtype
        data = [4., 5., 3.7]
        with assert_raises(TypeError):
            GenotypeArray(data)

        # data has wrong dimensions
        data = [1, 2, 3]
        with assert_raises(TypeError):
            GenotypeArray(data)

        # data has wrong dimensions
        data = [[1, 2], [3, 4]]  # use HaplotypeArray instead
        with assert_raises(TypeError):
            GenotypeArray(data)

        # diploid data (typed)
        g = GenotypeArray(diploid_genotype_data, dtype='i1')
        aeq(diploid_genotype_data, g)
        eq(np.int8, g.dtype)

        # polyploid data (typed)
        g = GenotypeArray(triploid_genotype_data, dtype='i1')
        aeq(triploid_genotype_data, g)
        eq(np.int8, g.dtype)

コード例 #13

0

ファイルを表示

ファイル: fst.py プロジェクト: podpearson/scikit-allel

def _weir_cockerham_fst(g, subpops, max_allele):

    # check inputs
    g = GenotypeArray(g, copy=False)
    n_variants, n_samples, ploidy = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    r = len(subpops)
    n_populations = r
    debug('r: %r', r)

    # count alleles within each subpopulation
    ac = [g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops]

    # stack allele counts from each sub-population into a single array
    ac = np.dstack(ac)
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # count number of alleles called within each population by summing
    # allele counts along the alleles dimension
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # compute number of individuals sampled from each population
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # compute the total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants,)
    debug('n_total: %s, %r', n_total.shape, n_total)

    # compute the average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants,)
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # compute the term n sub C incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == (n_variants,)
    debug('n_C: %s, %r', n_C.shape, n_C)

    # compute allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # compute the average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # add in some extra dimensions to enable broadcasting
    n_bar = n_bar[:, np.newaxis]
    n_C = n_C[:, np.newaxis]
    n = n[:, np.newaxis, :]
    p_bar = p_bar[:, :, np.newaxis]

    # compute the sample variance of allele frequencies over populations
    s_squared = (
        np.sum(n * ((p - p_bar) ** 2),
               axis=2) /
        (n_bar * (r - 1))
    )
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # remove extra dimensions for correct broadcasting
    p_bar = p_bar[:, :, 0]

    # compute the average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = [gs.count_het(allele=allele, axis=1) / n_total
             for allele in range(n_alleles)]
    h_bar = np.column_stack(h_bar)
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # component of variance between populations
    a = ((n_bar / n_C) *
         (s_squared -
          ((1 / (n_bar - 1)) *
           ((p_bar * (1 - p_bar)) -
            ((r - 1) * s_squared / r) -
            (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar / (n_bar - 1)) *
         ((p_bar * (1 - p_bar)) -
          ((r - 1) * s_squared / r) -
          (((2 * n_bar) - 1) * h_bar / (4 * n_bar))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c

コード例 #14

0

ファイルを表示

ファイル: mendel.py プロジェクト: yangmqglobe/scikit-allel

def phase_progeny_by_transmission(g):
    """Phase progeny genotypes from a trio or cross using Mendelian
    transmission.

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, 2)
        Genotype array, with parents as first two columns and progeny as
        remaining columns.

    Returns
    -------
    g : ndarray, int8, shape (n_variants, n_samples, 2)
        Genotype array with progeny phased where possible.

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([
    ...     [[0, 0], [0, 0], [0, 0]],
    ...     [[1, 1], [1, 1], [1, 1]],
    ...     [[0, 0], [1, 1], [0, 1]],
    ...     [[1, 1], [0, 0], [0, 1]],
    ...     [[0, 0], [0, 1], [0, 0]],
    ...     [[0, 0], [0, 1], [0, 1]],
    ...     [[0, 1], [0, 0], [0, 1]],
    ...     [[0, 1], [0, 1], [0, 1]],
    ...     [[0, 1], [1, 2], [0, 1]],
    ...     [[1, 2], [0, 1], [1, 2]],
    ...     [[0, 1], [2, 3], [0, 2]],
    ...     [[2, 3], [0, 1], [1, 3]],
    ...     [[0, 0], [0, 0], [-1, -1]],
    ...     [[0, 0], [0, 0], [1, 1]],
    ... ], dtype='i1')
    >>> g = allel.phase_progeny_by_transmission(g)
    >>> print(g.to_str(row_threshold=None))
    0/0 0/0 0|0
    1/1 1/1 1|1
    0/0 1/1 0|1
    1/1 0/0 1|0
    0/0 0/1 0|0
    0/0 0/1 0|1
    0/1 0/0 1|0
    0/1 0/1 0/1
    0/1 1/2 0|1
    1/2 0/1 2|1
    0/1 2/3 0|2
    2/3 0/1 3|1
    0/0 0/0 ./.
    0/0 0/0 1/1
    >>> g.is_phased
    array([[False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False, False],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False,  True],
           [False, False, False],
           [False, False, False]])

    """

    # setup
    g = GenotypeArray(g, dtype='i1', copy=True)
    check_ploidy(g.ploidy, 2)
    check_min_samples(g.n_samples, 3)

    # run the phasing
    # N.B., a copy has already been made, so no need to make memoryview safe
    is_phased = _opt_phase_progeny_by_transmission(g.values)
    g.is_phased = np.asarray(is_phased).view(bool)

    # outputs
    return g

コード例 #15

0

ファイルを表示

ファイル: mendel.py プロジェクト: yangmqglobe/scikit-allel

def paint_transmission(parent_haplotypes, progeny_haplotypes):
    """Paint haplotypes inherited from a single diploid parent according to
    their allelic inheritance.

    Parameters
    ----------
    parent_haplotypes : array_like, int, shape (n_variants, 2)
        Both haplotypes from a single diploid parent.
    progeny_haplotypes : array_like, int, shape (n_variants, n_progeny)
        Haplotypes found in progeny of the given parent, inherited from the
        given parent. I.e., haplotypes from gametes of the given parent.

    Returns
    -------
    painting : ndarray, uint8, shape (n_variants, n_progeny)
        An array of integers coded as follows: 1 = allele inherited from
        first parental haplotype; 2 = allele inherited from second parental
        haplotype; 3 = reference allele, also carried by both parental
        haplotypes; 4 = non-reference allele, also carried by both parental
        haplotypes; 5 = non-parental allele; 6 = either or both parental
        alleles missing; 7 = missing allele; 0 = undetermined.

    Examples
    --------
    >>> import allel
    >>> haplotypes = allel.HaplotypeArray([
    ...     [0, 0, 0, 1, 2, -1],
    ...     [0, 1, 0, 1, 2, -1],
    ...     [1, 0, 0, 1, 2, -1],
    ...     [1, 1, 0, 1, 2, -1],
    ...     [0, 2, 0, 1, 2, -1],
    ...     [0, -1, 0, 1, 2, -1],
    ...     [-1, 1, 0, 1, 2, -1],
    ...     [-1, -1, 0, 1, 2, -1],
    ... ], dtype='i1')
    >>> painting = allel.paint_transmission(haplotypes[:, :2],
    ...                                           haplotypes[:, 2:])
    >>> painting
    array([[3, 5, 5, 7],
           [1, 2, 5, 7],
           [2, 1, 5, 7],
           [5, 4, 5, 7],
           [1, 5, 2, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7]], dtype=uint8)

    """

    # check inputs
    parent_haplotypes = HaplotypeArray(parent_haplotypes)
    progeny_haplotypes = HaplotypeArray(progeny_haplotypes)
    if parent_haplotypes.n_haplotypes != 2:
        raise ValueError('exactly two parental haplotypes should be provided')

    # convenience variables
    parent1 = parent_haplotypes[:, 0, np.newaxis]
    parent2 = parent_haplotypes[:, 1, np.newaxis]
    progeny_is_missing = progeny_haplotypes < 0
    parent_is_missing = np.any(parent_haplotypes < 0, axis=1)
    # need this for broadcasting, but also need to retain original for later
    parent_is_missing_bc = parent_is_missing[:, np.newaxis]
    parent_diplotype = GenotypeArray(parent_haplotypes[:, np.newaxis, :])
    parent_is_hom_ref = parent_diplotype.is_hom_ref()
    parent_is_het = parent_diplotype.is_het()
    parent_is_hom_alt = parent_diplotype.is_hom_alt()

    # identify allele calls where inheritance can be determined
    is_callable = ~progeny_is_missing & ~parent_is_missing_bc
    is_callable_seg = is_callable & parent_is_het

    # main inheritance states
    inherit_parent1 = is_callable_seg & (progeny_haplotypes == parent1)
    inherit_parent2 = is_callable_seg & (progeny_haplotypes == parent2)
    nonseg_ref = (is_callable & parent_is_hom_ref & (progeny_haplotypes == parent1))
    nonseg_alt = (is_callable & parent_is_hom_alt & (progeny_haplotypes == parent1))
    nonparental = (
        is_callable & (progeny_haplotypes != parent1) & (progeny_haplotypes != parent2)
    )

    # record inheritance states
    # N.B., order in which these are set matters
    painting = np.zeros(progeny_haplotypes.shape, dtype='u1')
    painting[inherit_parent1] = INHERIT_PARENT1
    painting[inherit_parent2] = INHERIT_PARENT2
    painting[nonseg_ref] = INHERIT_NONSEG_REF
    painting[nonseg_alt] = INHERIT_NONSEG_ALT
    painting[nonparental] = INHERIT_NONPARENTAL
    painting[parent_is_missing] = INHERIT_PARENT_MISSING
    painting[progeny_is_missing] = INHERIT_MISSING

    return painting

コード例 #16

0

ファイルを表示

ファイル: mendel.py プロジェクト: yangmqglobe/scikit-allel

def mendel_errors(parent_genotypes, progeny_genotypes):
    """Locate genotype calls not consistent with Mendelian transmission of
    alleles.

    Parameters
    ----------
    parent_genotypes : array_like, int, shape (n_variants, 2, 2)
        Genotype calls for the two parents.
    progeny_genotypes : array_like, int, shape (n_variants, n_progeny, 2)
        Genotype calls for the progeny.

    Returns
    -------
    me : ndarray, int, shape (n_variants, n_progeny)
        Count of Mendel errors for each progeny genotype call.

    Examples
    --------
    The following are all consistent with Mendelian transmission. Note that a
    value of 0 is returned for missing calls::

        >>> import allel
        >>> import numpy as np
        >>> genotypes = np.array([
        ...     # aa x aa -> aa
        ...     [[0, 0], [0, 0], [0, 0], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [1, 1], [1, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[2, 2], [2, 2], [2, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x ab -> aa or ab
        ...     [[0, 0], [0, 1], [0, 0], [0, 1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [0, 2], [0, 0], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 1], [1, 1], [0, 1], [-1, -1], [-1, -1]],
        ...     # aa x bb -> ab
        ...     [[0, 0], [1, 1], [0, 1], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[0, 0], [2, 2], [0, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     [[1, 1], [2, 2], [1, 2], [-1, -1], [-1, -1], [-1, -1]],
        ...     # aa x bc -> ab or ac
        ...     [[0, 0], [1, 2], [0, 1], [0, 2], [-1, -1], [-1, -1]],
        ...     [[1, 1], [0, 2], [0, 1], [1, 2], [-1, -1], [-1, -1]],
        ...     # ab x ab -> aa or ab or bb
        ...     [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[1, 2], [1, 2], [1, 1], [1, 2], [2, 2], [-1, -1]],
        ...     [[0, 2], [0, 2], [0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     # ab x bc -> ab or ac or bb or bc
        ...     [[0, 1], [1, 2], [0, 1], [0, 2], [1, 1], [1, 2]],
        ...     [[0, 1], [0, 2], [0, 0], [0, 1], [0, 1], [1, 2]],
        ...     # ab x cd -> ac or ad or bc or bd
        ...     [[0, 1], [2, 3], [0, 2], [0, 3], [1, 2], [1, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0],
               [0, 0, 0, 0]])

    The following are cases of 'non-parental' inheritance where one or two
    alleles are found in the progeny that are not present in either parent.
    Note that the number of errors may be 1 or 2 depending on the number of
    non-parental alleles::

        >>> genotypes = np.array([
        ...     # aa x aa -> ab or ac or bb or cc
        ...     [[0, 0], [0, 0], [0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[1, 1], [1, 1], [0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[2, 2], [2, 2], [0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # aa x ab -> ac or bc or cc
        ...     [[0, 0], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [0, 1], [1, 2], [0, 2], [2, 2], [2, 2]],
        ...     # aa x bb -> ac or bc or cc
        ...     [[0, 0], [1, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 0], [2, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 1], [2, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x ab -> ac or bc or cc
        ...     [[0, 1], [0, 1], [0, 2], [1, 2], [2, 2], [2, 2]],
        ...     [[0, 2], [0, 2], [0, 1], [1, 2], [1, 1], [1, 1]],
        ...     [[1, 2], [1, 2], [0, 1], [0, 2], [0, 0], [0, 0]],
        ...     # ab x bc -> ad or bd or cd or dd
        ...     [[0, 1], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 1], [0, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     [[0, 2], [1, 2], [0, 3], [1, 3], [2, 3], [3, 3]],
        ...     # ab x cd -> ae or be or ce or de
        ...     [[0, 1], [2, 3], [0, 4], [1, 4], [2, 4], [3, 4]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 2, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 2],
               [1, 1, 1, 1]])

    The following are cases of 'hemi-parental' inheritance, where progeny
    appear to have inherited two copies of an allele found only once in one of
    the parents::

        >>> genotypes = np.array([
        ...     # aa x ab -> bb
        ...     [[0, 0], [0, 1], [1, 1], [-1, -1]],
        ...     [[0, 0], [0, 2], [2, 2], [-1, -1]],
        ...     [[1, 1], [0, 1], [0, 0], [-1, -1]],
        ...     # ab x bc -> aa or cc
        ...     [[0, 1], [1, 2], [0, 0], [2, 2]],
        ...     [[0, 1], [0, 2], [1, 1], [2, 2]],
        ...     [[0, 2], [1, 2], [0, 0], [1, 1]],
        ...     # ab x cd -> aa or bb or cc or dd
        ...     [[0, 1], [2, 3], [0, 0], [1, 1]],
        ...     [[0, 1], [2, 3], [2, 2], [3, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 0],
               [1, 0],
               [1, 0],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    The following are cases of 'uni-parental' inheritance, where progeny
    appear to have inherited both alleles from a single parent::

        >>> genotypes = np.array([
        ...     # aa x bb -> aa or bb
        ...     [[0, 0], [1, 1], [0, 0], [1, 1]],
        ...     [[0, 0], [2, 2], [0, 0], [2, 2]],
        ...     [[1, 1], [2, 2], [1, 1], [2, 2]],
        ...     # aa x bc -> aa or bc
        ...     [[0, 0], [1, 2], [0, 0], [1, 2]],
        ...     [[1, 1], [0, 2], [1, 1], [0, 2]],
        ...     # ab x cd -> ab or cd
        ...     [[0, 1], [2, 3], [0, 1], [2, 3]],
        ... ])
        >>> me = allel.mendel_errors(genotypes[:, :2], genotypes[:, 2:])
        >>> me
        array([[1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1],
               [1, 1]])

    """

    # setup
    parent_genotypes = GenotypeArray(parent_genotypes)
    progeny_genotypes = GenotypeArray(progeny_genotypes)
    check_ploidy(parent_genotypes.ploidy, 2)
    check_ploidy(progeny_genotypes.ploidy, 2)

    # transform into per-call allele counts
    max_allele = max(parent_genotypes.max(), progeny_genotypes.max())
    parent_gc = parent_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1')
    progeny_gc = progeny_genotypes.to_allele_counts(max_allele=max_allele, dtype='i1')

    # detect nonparental and hemiparental inheritance by comparing allele
    # counts between parents and progeny
    max_progeny_gc = parent_gc.clip(max=1).sum(axis=1)
    max_progeny_gc = max_progeny_gc[:, np.newaxis, :]
    me = (progeny_gc - max_progeny_gc).clip(min=0).sum(axis=2)

    # detect uniparental inheritance by finding cases where no alleles are
    # shared between parents, then comparing progeny allele counts to each
    # parent
    p1_gc = parent_gc[:, 0, np.newaxis, :]
    p2_gc = parent_gc[:, 1, np.newaxis, :]
    # find variants where parents don't share any alleles
    is_shared_allele = (p1_gc > 0) & (p2_gc > 0)
    no_shared_alleles = ~np.any(is_shared_allele, axis=2)
    # find calls where progeny genotype is identical to one or the other parent
    me[no_shared_alleles &
       (np.all(progeny_gc == p1_gc, axis=2) |
        np.all(progeny_gc == p2_gc, axis=2))] = 1

    # retrofit where either or both parent has a missing call
    me[np.any(parent_genotypes.is_missing(), axis=1)] = 0

    return me

コード例 #17

0

ファイルを表示

ファイル: fst.py プロジェクト: yangmqglobe/scikit-allel

def weir_cockerham_fst(g, subpops, max_allele=None, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.
    blen : int, optional
        Block length to use for chunked computation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[0.        , 0.        , 0.        ],
               [0.5       , 0.5       , 0.        ],
               [0.        , 0.        , 0.        ],
               [0.125     , 0.25      , 0.125     ],
               [0.16666667, 0.16666667, 0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.36809058868914e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    # compute in chunks to avoid loading big arrays into memory
    blen = get_blen_array(g, blen)
    n_variants = g.shape[0]
    shape = (n_variants, max_allele + 1)
    a = np.zeros(shape, dtype='f8')
    b = np.zeros(shape, dtype='f8')
    c = np.zeros(shape, dtype='f8')
    for i in range(0, n_variants, blen):
        j = min(n_variants, i + blen)
        gb = g[i:j]
        ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
        a[i:j] = ab
        b[i:j] = bb
        c[i:j] = cb

    return a, b, c

コード例 #18

0

ファイルを表示

ファイル: fst.py プロジェクト: yangmqglobe/scikit-allel

def _weir_cockerham_fst(g, subpops, max_allele):

    # check inputs
    g = GenotypeArray(g, copy=False)
    n_variants, n_samples, ploidy = g.shape
    n_alleles = max_allele + 1

    # number of populations sampled
    r = len(subpops)
    n_populations = r
    debug('r: %r', r)

    # count alleles within each subpopulation
    ac = [g.count_alleles(subpop=s, max_allele=max_allele) for s in subpops]

    # stack allele counts from each sub-population into a single array
    ac = np.dstack(ac)
    assert ac.shape == (n_variants, n_alleles, n_populations)
    debug('ac: %s, %r', ac.shape, ac)

    # count number of alleles called within each population by summing
    # allele counts along the alleles dimension
    an = np.sum(ac, axis=1)
    assert an.shape == (n_variants, n_populations)
    debug('an: %s, %r', an.shape, an)

    # compute number of individuals sampled from each population
    n = an // 2
    assert n.shape == (n_variants, n_populations)
    debug('n: %s, %r', n.shape, n)

    # compute the total number of individuals sampled across all populations
    n_total = np.sum(n, axis=1)
    assert n_total.shape == (n_variants, )
    debug('n_total: %s, %r', n_total.shape, n_total)

    # compute the average sample size across populations
    n_bar = np.mean(n, axis=1)
    assert n_bar.shape == (n_variants, )
    debug('n_bar: %s, %r', n_bar.shape, n_bar)

    # compute the term n sub C incorporating the coefficient of variation in
    # sample sizes
    n_C = (n_total - (np.sum(n**2, axis=1) / n_total)) / (r - 1)
    assert n_C.shape == (n_variants, )
    debug('n_C: %s, %r', n_C.shape, n_C)

    # compute allele frequencies within each population
    p = ac / an[:, np.newaxis, :]
    assert p.shape == (n_variants, n_alleles, n_populations)
    debug('p: %s, %r', p.shape, p)

    # compute the average sample frequency of each allele
    ac_total = np.sum(ac, axis=2)
    an_total = np.sum(an, axis=1)
    p_bar = ac_total / an_total[:, np.newaxis]
    assert p_bar.shape == (n_variants, n_alleles)
    debug('p_bar: %s, %r', p_bar.shape, p_bar)

    # add in some extra dimensions to enable broadcasting
    n_bar = n_bar[:, np.newaxis]
    n_C = n_C[:, np.newaxis]
    n = n[:, np.newaxis, :]
    p_bar = p_bar[:, :, np.newaxis]

    # compute the sample variance of allele frequencies over populations
    s_squared = (np.sum(n * ((p - p_bar)**2), axis=2) / (n_bar * (r - 1)))
    assert s_squared.shape == (n_variants, n_alleles)
    debug('s_squared: %s, %r', s_squared.shape, s_squared)

    # remove extra dimensions for correct broadcasting
    p_bar = p_bar[:, :, 0]

    # compute the average heterozygosity over all populations
    # N.B., take only samples in subpops of interest
    gs = g.take(list(itertools.chain(*subpops)), axis=1)
    h_bar = [
        gs.count_het(allele=allele, axis=1) / n_total
        for allele in range(n_alleles)
    ]
    h_bar = np.column_stack(h_bar)
    assert h_bar.shape == (n_variants, n_alleles)
    debug('h_bar: %s, %r', h_bar.shape, h_bar)

    # now comes the tricky bit...

    # component of variance between populations
    a = ((n_bar / n_C) * (s_squared -
                          ((1 / (n_bar - 1)) *
                           ((p_bar * (1 - p_bar)) - ((r - 1) * s_squared / r) -
                            (h_bar / 4)))))
    assert a.shape == (n_variants, n_alleles)

    # component of variance between individuals within populations
    b = ((n_bar / (n_bar - 1)) * ((p_bar *
                                   (1 - p_bar)) - ((r - 1) * s_squared / r) -
                                  (((2 * n_bar) - 1) * h_bar / (4 * n_bar))))
    assert b.shape == (n_variants, n_alleles)

    # component of variance between gametes within individuals
    c = h_bar / 2
    assert c.shape == (n_variants, n_alleles)

    return a, b, c

コード例 #19

0

ファイルを表示

ファイル: fst.py プロジェクト: podpearson/scikit-allel

def weir_cockerham_fst(g, subpops, max_allele=None, chunked=False, blen=None):
    """Compute the variance components from the analyses of variance of
    allele frequencies according to Weir and Cockerham (1984).

    Parameters
    ----------
    g : array_like, int, shape (n_variants, n_samples, ploidy)
        Genotype array.
    subpops : sequence of sequences of ints
        Sample indices for each subpopulation.
    max_allele : int, optional
        The highest allele index to consider.
    chunked : bool, optional
        If True, use a block-wise implementation to avoid loading the entire
        input array into memory.
    blen : int, optional
        Block length to use for chunked implementation.

    Returns
    -------
    a : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between populations.
    b : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between individuals within populations.
    c : ndarray, float, shape (n_variants, n_alleles)
        Component of variance between gametes within individuals.

    Examples
    --------
    Calculate variance components from some genotype data::

        >>> import allel
        >>> g = [[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...      [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...      [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...      [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...      [[0, 0], [1, 1], [0, 1], [-1, -1]]]
        >>> subpops = [[0, 1], [2, 3]]
        >>> a, b, c = allel.stats.weir_cockerham_fst(g, subpops)
        >>> a
        array([[ 0.5  ,  0.5  ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   ,  0.   ,  0.   ],
               [ 0.   , -0.125, -0.125],
               [-0.375, -0.375,  0.   ]])
        >>> b
        array([[ 0.        ,  0.        ,  0.        ],
               [-0.25      , -0.25      ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.        ,  0.125     ,  0.25      ],
               [ 0.41666667,  0.41666667,  0.        ]])
        >>> c
        array([[ 0.        ,  0.        ,  0.        ],
               [ 0.5       ,  0.5       ,  0.        ],
               [ 0.        ,  0.        ,  0.        ],
               [ 0.125     ,  0.25      ,  0.125     ],
               [ 0.16666667,  0.16666667,  0.        ]])

    Estimate the parameter theta (a.k.a., Fst) for each variant
    and each allele individually::

        >>> fst = a / (a + b + c)
        >>> fst
        array([[ 1. ,  1. ,  nan],
               [ 0. ,  0. ,  nan],
               [ nan,  nan,  nan],
               [ 0. , -0.5, -0.5],
               [-1.8, -1.8,  nan]])

    Estimate Fst for each variant individually (averaging over alleles)::

        >>> fst = (np.sum(a, axis=1) /
        ...        (np.sum(a, axis=1) + np.sum(b, axis=1) + np.sum(c, axis=1)))
        >>> fst
        array([ 1. ,  0. ,  nan, -0.4, -1.8])

    Estimate Fst averaging over all variants and alleles::

        >>> fst = np.sum(a) / (np.sum(a) + np.sum(b) + np.sum(c))
        >>> fst
        -4.3680905886891398e-17

    Note that estimated Fst values may be negative.

    """

    # check inputs
    if not hasattr(g, 'shape') or not hasattr(g, 'ndim'):
        g = GenotypeArray(g, copy=False)
    if g.ndim != 3:
        raise ValueError('g must have three dimensions')
    if g.shape[2] != 2:
        raise NotImplementedError('only diploid genotypes are supported')

    # determine highest allele index
    if max_allele is None:
        max_allele = g.max()

    if chunked:
        # use a block-wise implementation
        blen = get_blen_array(g, blen)
        n_variants = g.shape[0]
        shape = (n_variants, max_allele + 1)
        a = np.zeros(shape, dtype='f8')
        b = np.zeros(shape, dtype='f8')
        c = np.zeros(shape, dtype='f8')
        for i in range(0, n_variants, blen):
            j = min(n_variants, i+blen)
            gb = g[i:j]
            ab, bb, cb = _weir_cockerham_fst(gb, subpops, max_allele)
            a[i:j] = ab
            b[i:j] = bb
            c[i:j] = cb

    else:
        a, b, c = _weir_cockerham_fst(g, subpops, max_allele)

    return a, b, c

コード例 #20

0

ファイルを表示

 def setup_instance(self, data):
     return GenotypeArray(data)