Python HaplotypeArray Examples, allel.model.ndarray.HaplotypeArray Python Examples

Example #1

0

Show file

File: selection.py Project: obestwalter/scikit-allel

def voight_painting(h):
    """Paint haplotypes, giving every shared haplotype prefix its own
    integer label.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    painting : ndarray, int, shape (n_variants, n_haplotypes)
        Painting array, computed on the prefix-sorted haplotypes.
    indices : ndarray, int, shape (n_haplotypes,)
        Haplotype indices after sorting by prefix.

    Raises
    ------
    NotImplementedError
        If any allele is greater than 1 (multiallelic) or less than 0
        (missing call).

    """

    # wrap the input without forcing a copy
    # NOTE(review): no dtype is forced here; confirm that
    # paint_shared_prefixes accepts whatever dtype the caller supplies
    haps = HaplotypeArray(np.asarray(h), copy=False)

    # reject inputs the painting routine does not handle
    if haps.max() > 1:
        raise NotImplementedError('only biallelic variants are supported')
    if haps.min() < 0:
        raise NotImplementedError('missing calls are not supported')

    # reorder the haplotype columns by prefix
    indices = haps.prefix_argsort()
    ordered = np.take(haps, indices, axis=1)

    # paint the reordered haplotypes
    return paint_shared_prefixes(np.asarray(ordered)), indices

Example #2

0

Show file

File: selection.py Project: hardingnj/scikit-allel

def haplotype_diversity(h):
    """Estimate haplotype diversity.

    Computed as ``(1 - sum(f ** 2)) * n / (n - 1)``, where ``f`` holds the
    frequencies of the distinct haplotypes and ``n`` is the number of
    haplotypes.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    hd : float
        Haplotype diversity.

    """

    haplotypes = HaplotypeArray(h, copy=False)

    # sample size used for the n / (n - 1) correction
    n = haplotypes.n_haplotypes

    # sum of squared frequencies of the distinct haplotypes
    freqs = haplotypes.distinct_frequencies()
    sum_sq = np.sum(freqs ** 2)

    return (1 - sum_sq) * n / (n - 1)

Example #3

0

Show file

File: test_stats.py Project: oxpeter/scikit-allel

    def test_mean_pairwise_diversity(self):
        """Check mean_pairwise_difference against hand-computed values."""

        # two haplotypes give exactly one pairwise comparison per variant
        haps = HaplotypeArray([[0, 0],
                               [1, 1],
                               [0, 1],
                               [1, 2],
                               [0, -1],
                               [-1, -1]])
        counts = haps.count_alleles()
        expected = [0, 0, 1, 1, -1, -1]
        observed = allel.stats.mean_pairwise_difference(counts, fill=-1)
        aeq(expected, observed)

        # four haplotypes give six pairwise comparisons per variant
        haps = HaplotypeArray([[0, 0, 0, 0],
                               [0, 0, 0, 1],
                               [0, 0, 1, 1],
                               [0, 1, 1, 1],
                               [1, 1, 1, 1],
                               [0, 0, 1, 2],
                               [0, 1, 1, 2],
                               [0, 1, -1, -1],
                               [-1, -1, -1, -1]])
        counts = haps.count_alleles()
        expected = [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
        observed = allel.stats.mean_pairwise_difference(counts, fill=-1)
        assert_array_close(expected, observed)

Example #4

0

Show file

File: selection.py Project: obestwalter/scikit-allel

def haplotype_diversity(h):
    """Estimate haplotype diversity from a haplotype array.

    The estimate is ``(1 - sum(f**2)) * n / (n - 1)``, with ``f`` the
    frequencies of the distinct haplotypes and ``n`` the number of
    haplotypes.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    hd : float
        Haplotype diversity.

    """

    hap_array = HaplotypeArray(h, copy=False)

    # number of haplotypes (sample size)
    n = hap_array.n_haplotypes

    # frequencies of the distinct haplotypes
    hap_freqs = hap_array.distinct_frequencies()

    # scale (1 - sum of squared frequencies) by n / (n - 1)
    squared_total = np.sum(hap_freqs**2)
    return (1 - squared_total) * n / (n - 1)

Example #5

0

Show file

File: selection.py Project: obestwalter/scikit-allel

def plot_haplotype_frequencies(h,
                               palette='Paired',
                               singleton_color='w',
                               ax=None):
    """Plot haplotype frequencies.

    Each distinct haplotype is drawn as a vertical span whose width equals
    the number of times it occurs. Haplotypes occurring more than once take
    successive colours from the palette; haplotypes occurring once are drawn
    in `singleton_color`.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes

    """

    import matplotlib.pyplot as plt
    import seaborn as sns

    # check inputs
    h = HaplotypeArray(h, copy=False)

    # setup figure: default width, one tenth of that as height
    if ax is None:
        width = plt.rcParams['figure.figsize'][0]
        height = width / 10
        _, ax = plt.subplots(figsize=(width, height))
        sns.despine(ax=ax, left=True)

    # count distinct haplotypes
    hc = h.distinct_counts()

    # setup palette: one colour per haplotype seen more than once
    n_colors = np.count_nonzero(hc > 1)
    colors = iter(sns.color_palette(palette, n_colors))

    # paint frequencies
    # N.B., colours are consumed one at a time rather than looked up by the
    # position in hc, so the palette (which has only n_colors entries) cannot
    # be over-indexed if a singleton precedes a multi-copy haplotype
    x1 = 0
    for c in hc:
        x2 = x1 + c
        color = next(colors) if c > 1 else singleton_color
        ax.axvspan(x1, x2, color=color)
        x1 = x2

    # tidy up
    ax.set_xlim(0, h.shape[1])
    ax.set_yticks([])

    return ax

Example #6

0

Show file

File: selection.py Project: oxpeter/scikit-allel

def plot_haplotype_frequencies(h, palette='Paired', singleton_color='w',
                               ax=None):
    """Plot haplotype frequencies.

    Each distinct haplotype is drawn as a vertical span whose width equals
    the number of times it occurs. Haplotypes occurring more than once take
    successive colours from the palette; haplotypes occurring once are drawn
    in `singleton_color`.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    palette : string, optional
        A Seaborn palette name.
    singleton_color : string, optional
        Color to paint singleton haplotypes.
    ax : axes, optional
        The axes on which to draw. If not provided, a new figure will be
        created.

    Returns
    -------
    ax : axes

    """

    import matplotlib.pyplot as plt
    import seaborn as sns

    # check inputs
    h = HaplotypeArray(h, copy=False)

    # setup figure: default width, one tenth of that as height
    if ax is None:
        width = plt.rcParams['figure.figsize'][0]
        height = width / 10
        _, ax = plt.subplots(figsize=(width, height))
        sns.despine(ax=ax, left=True)

    # count distinct haplotypes
    hc = h.distinct_counts()

    # setup palette: one colour per haplotype seen more than once
    n_colors = np.count_nonzero(hc > 1)
    colors = iter(sns.color_palette(palette, n_colors))

    # paint frequencies
    # N.B., colours are consumed one at a time rather than looked up by the
    # position in hc, so the palette (which has only n_colors entries) cannot
    # be over-indexed if a singleton precedes a multi-copy haplotype
    x1 = 0
    for c in hc:
        x2 = x1 + c
        color = next(colors) if c > 1 else singleton_color
        ax.axvspan(x1, x2, color=color)
        x1 = x2

    # tidy up
    ax.set_xlim(0, h.shape[1])
    ax.set_yticks([])

    return ax

Example #7

0

Show file

    def test_slice_types(self):
        """Slices stay HaplotypeArray; integer indexing yields plain numpy."""

        haps = HaplotypeArray(haplotype_data, dtype='i1')

        # slicing rows keeps the HaplotypeArray type
        row_slice = haps[1:]
        assert_is_instance(row_slice, HaplotypeArray)

        # slicing columns keeps the HaplotypeArray type
        col_slice = haps[:, 1:]
        assert_is_instance(col_slice, HaplotypeArray)

        # a single row is a plain ndarray
        row = haps[0]
        assert_is_instance(row, np.ndarray)
        assert_not_is_instance(row, HaplotypeArray)

        # a single column is a plain ndarray
        col = haps[:, 0]
        assert_is_instance(col, np.ndarray)
        assert_not_is_instance(col, HaplotypeArray)

        # a single element is an int8 scalar
        item = haps[0, 0]
        assert_is_instance(item, np.int8)
        assert_not_is_instance(item, HaplotypeArray)

Example #8

0

Show file

File: selection.py Project: hardingnj/scikit-allel

def garud_h(h):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    h1 : float
        H1 statistic (sum of squares of haplotype frequencies).
    h12 : float
        H12 statistic (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : float
        H123 statistic (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : float
        H2/H1 statistic, indicating the "softness" of a sweep.

    """

    # frequencies of the distinct haplotypes
    # NOTE(review): the slices below treat the leading entries as the most
    # common haplotypes; confirm distinct_frequencies() sorts descending
    freqs = HaplotypeArray(h, copy=False).distinct_frequencies()

    # H1: sum of squared frequencies
    h1 = np.sum(freqs ** 2)

    # H12: first two frequencies pooled before squaring
    h12 = np.sum(freqs[:2]) ** 2 + np.sum(freqs[2:] ** 2)

    # H123: first three frequencies pooled before squaring
    h123 = np.sum(freqs[:3]) ** 2 + np.sum(freqs[3:] ** 2)

    # H2/H1: H1 without the first haplotype's contribution, relative to H1
    h2_h1 = (h1 - freqs[0] ** 2) / h1

    return h1, h12, h123, h2_h1

Example #9

0

Show file

File: selection.py Project: hardingnj/scikit-allel

def ehh_decay(h, truncate=False):
    """Compute the decay of extended haplotype homozygosity (EHH)
    moving away from the first variant.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    truncate : bool, optional
        If True, the return array will exclude trailing zeros.

    Returns
    -------
    ehh : ndarray, float, shape (n_variants, )
        EHH at successive variants from the first variant. May be shorter
        than n_variants when `truncate` is True.

    Raises
    ------
    NotImplementedError
        If any allele is greater than 1 (multiallelic) or less than 0
        (missing call).

    """

    from allel.opt.stats import pairwise_shared_prefix_lengths_int8

    # check inputs
    # N.B., ensure int8 so we can use cython optimisation
    h = HaplotypeArray(np.asarray(h, dtype="i1"), copy=False)
    if h.max() > 1:
        raise NotImplementedError("only biallelic variants are supported")
    if h.min() < 0:
        raise NotImplementedError("missing calls are not supported")

    # initialise
    n_variants = h.n_variants  # number of rows, i.e., variants
    n_haplotypes = h.n_haplotypes  # number of columns, i.e., haplotypes
    n_pairs = (n_haplotypes * (n_haplotypes - 1)) // 2

    # compute the shared prefix length between all pairs of haplotypes
    spl = pairwise_shared_prefix_lengths_int8(h)

    # compute EHH by counting the number of shared prefixes extending beyond
    # each variant
    # N.B., use 0 (the documented bincount default) rather than None when no
    # padding is wanted; minlength=None is deprecated since NumPy 1.14
    minlength = 0 if truncate else n_variants + 1
    b = np.bincount(spl, minlength=minlength)
    # reverse cumulative sum counts pairs whose shared prefix reaches at
    # least each length; the last element (length >= 0, all pairs) is dropped
    c = np.cumsum(b[::-1])[:-1]
    ehh = (c / n_pairs)[::-1]

    return ehh

Example #10

0

Show file

def ehh_decay(h, truncate=False):
    """Compute the decay of extended haplotype homozygosity (EHH)
    moving away from the first variant.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    truncate : bool, optional
        If True, the return array will exclude trailing zeros.

    Returns
    -------
    ehh : ndarray, float, shape (n_variants, )
        EHH at successive variants from the first variant. May be shorter
        than n_variants when `truncate` is True.

    Raises
    ------
    NotImplementedError
        If any allele is greater than 1 (multiallelic) or less than 0
        (missing call).

    """

    from allel.opt.stats import pairwise_shared_prefix_lengths_int8

    # check inputs
    # N.B., ensure int8 so we can use cython optimisation
    h = HaplotypeArray(np.asarray(h, dtype='i1'), copy=False)
    if h.max() > 1:
        raise NotImplementedError('only biallelic variants are supported')
    if h.min() < 0:
        raise NotImplementedError('missing calls are not supported')

    # initialise
    n_variants = h.n_variants  # number of rows, i.e., variants
    n_haplotypes = h.n_haplotypes  # number of columns, i.e., haplotypes
    n_pairs = (n_haplotypes * (n_haplotypes - 1)) // 2

    # compute the shared prefix length between all pairs of haplotypes
    spl = pairwise_shared_prefix_lengths_int8(h)

    # compute EHH by counting the number of shared prefixes extending beyond
    # each variant
    # N.B., use 0 (the documented bincount default) rather than None when no
    # padding is wanted; minlength=None is deprecated since NumPy 1.14
    minlength = 0 if truncate else n_variants + 1
    b = np.bincount(spl, minlength=minlength)
    # reverse cumulative sum counts pairs whose shared prefix reaches at
    # least each length; the last element (length >= 0, all pairs) is dropped
    c = np.cumsum(b[::-1])[:-1]
    ehh = (c / n_pairs)[::-1]

    return ehh

Example #11

0

Show file

File: selection.py Project: obestwalter/scikit-allel

def garud_h(h):
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    h1 : float
        H1 statistic (sum of squares of haplotype frequencies).
    h12 : float
        H12 statistic (sum of squares of haplotype frequencies, combining
        the two most common haplotypes into a single frequency).
    h123 : float
        H123 statistic (sum of squares of haplotype frequencies, combining
        the three most common haplotypes into a single frequency).
    h2_h1 : float
        H2/H1 statistic, indicating the "softness" of a sweep.

    """

    hap_array = HaplotypeArray(h, copy=False)

    # frequencies of the distinct haplotypes
    # NOTE(review): the leading entries are taken to be the most common
    # haplotypes; confirm distinct_frequencies() returns them sorted
    hap_freqs = hap_array.distinct_frequencies()

    # H1 is the plain sum of squared frequencies
    h1 = np.sum(hap_freqs**2)

    # H12 pools the first two frequencies before squaring
    top_two = np.sum(hap_freqs[:2])
    h12 = top_two**2 + np.sum(hap_freqs[2:]**2)

    # H123 pools the first three frequencies before squaring
    top_three = np.sum(hap_freqs[:3])
    h123 = top_three**2 + np.sum(hap_freqs[3:]**2)

    # H2 is H1 with the first haplotype's contribution removed
    h2 = h1 - hap_freqs[0]**2

    return h1, h12, h123, h2 / h1

Example #12

0

Show file

File: test_stats.py Project: oxpeter/scikit-allel

    def test_windowed_diversity(self):
        """Check windowed_diversity over three windows of four haplotypes."""

        # four haplotypes give six pairwise comparisons per variant
        haps = HaplotypeArray([[0, 0, 0, 0],
                               [0, 0, 0, 1],
                               [0, 0, 1, 1],
                               [0, 1, 1, 1],
                               [1, 1, 1, 1],
                               [0, 0, 1, 2],
                               [0, 1, 1, 2],
                               [0, 1, -1, -1],
                               [-1, -1, -1, -1]])
        counts = haps.count_alleles()

        # per-variant mean pairwise diversity, for reference:
        # [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
        positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])

        # windows of size 10 starting at 1: the first two span 10 positions,
        # the last runs to the stop at 31 and spans 11
        expected = [(7/6)/10, (13/6)/10, 1/11]
        observed, _, _, _ = allel.stats.windowed_diversity(
            positions, counts, size=10, start=1, stop=31
        )
        assert_array_close(expected, observed)

Example #13

0

Show file

File: test_stats.py Project: oxpeter/scikit-allel

    def test_mean_pairwise_divergence(self):
        """Check mean_pairwise_difference_between on two 2-haplotype groups."""

        # simplest case: each population contributes two haplotypes
        haps = HaplotypeArray([[0, 0, 0, 0],
                               [0, 0, 0, 1],
                               [0, 0, 1, 1],
                               [0, 1, 1, 1],
                               [1, 1, 1, 1],
                               [0, 0, 1, 2],
                               [0, 1, 1, 2],
                               [0, 1, -1, -1],
                               [-1, -1, -1, -1]])

        # first two columns form population 1, last two population 2
        pop1 = haps.take([0, 1], axis=1)
        pop2 = haps.take([2, 3], axis=1)
        counts1 = pop1.count_alleles()
        counts2 = pop2.count_alleles()

        # four between-population comparisons per variant; -1 is the fill
        expected = [0/4, 2/4, 4/4, 2/4, 0/4, 4/4, 3/4, -1, -1]
        observed = allel.stats.mean_pairwise_difference_between(
            counts1, counts2, fill=-1
        )
        aeq(expected, observed)

Example #14

0

Show file

    def test_constructor(self):
        """Constructor rejects bad input and honours an explicit dtype."""

        # calling without the data argument is an error
        with self.assertRaises(TypeError):
            # noinspection PyArgumentList
            HaplotypeArray()

        # inputs that must each be rejected with TypeError, in order:
        # a string (wrong dtype), floats (wrong dtype), a 1-d list (wrong
        # dimensions), and diploid genotype data (wrong dimensions; use
        # GenotypeArray instead)
        rejected = (
            'foo bar',
            [4., 5., 3.7],
            [1, 2, 3],
            diploid_genotype_data,
        )
        for data in rejected:
            with self.assertRaises(TypeError):
                HaplotypeArray(data)

        # haploid data with an explicit dtype is accepted
        haps = HaplotypeArray(haplotype_data, dtype='i1')
        aeq(haplotype_data, haps)
        eq(np.int8, haps.dtype)

Example #15

0

Show file

File: test_stats.py Project: oxpeter/scikit-allel

    def test_windowed_divergence(self):
        """Check windowed_divergence over three windows, two populations."""

        # simplest case: each population contributes two haplotypes
        haps = HaplotypeArray([[0, 0, 0, 0],
                               [0, 0, 0, 1],
                               [0, 0, 1, 1],
                               [0, 1, 1, 1],
                               [1, 1, 1, 1],
                               [0, 0, 1, 2],
                               [0, 1, 1, 2],
                               [0, 1, -1, -1],
                               [-1, -1, -1, -1]])

        # first two columns form population 1, last two population 2
        pop1 = haps.take([0, 1], axis=1)
        pop2 = haps.take([2, 3], axis=1)
        counts1 = pop1.count_alleles()
        counts2 = pop2.count_alleles()

        # per-variant mean pairwise divergence, for reference:
        # [0/4, 2/4, 4/4, 2/4, 0/4, 4/4, 3/4, -1, -1]
        positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])

        # windows of size 10 starting at 1: the first two span 10 positions,
        # the last runs to the stop at 31 and spans 11
        expected = [(6/4)/10, (9/4)/10, 0/11]
        observed, _, _, _ = allel.stats.windowed_divergence(
            positions, counts1, counts2, size=10, start=1, stop=31
        )
        assert_array_close(expected, observed)

Example #16

0

Show file

File: selection.py Project: hardingnj/scikit-allel

def voight_painting(h):
    """Paint haplotypes, giving every shared haplotype prefix its own
    integer label.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.

    Returns
    -------
    painting : ndarray, int, shape (n_variants, n_haplotypes)
        Painting array, computed on the prefix-sorted haplotypes.
    indices : ndarray, int, shape (n_haplotypes,)
        Haplotype indices after sorting by prefix.

    Raises
    ------
    NotImplementedError
        If any allele is greater than 1 (multiallelic) or less than 0
        (missing call).

    """

    from allel.opt.stats import paint_shared_prefixes_int8

    # coerce to int8 up front because the painting routine is the
    # int8-specific optimised implementation
    haps = HaplotypeArray(np.asarray(h, dtype="i1"), copy=False)

    # reject inputs the painting routine does not handle
    if haps.max() > 1:
        raise NotImplementedError("only biallelic variants are supported")
    if haps.min() < 0:
        raise NotImplementedError("missing calls are not supported")

    # reorder the haplotype columns by prefix
    indices = haps.prefix_argsort()
    ordered = np.take(haps, indices, axis=1)

    # paint the reordered haplotypes
    return paint_shared_prefixes_int8(ordered), indices

Example #17

0

Show file

File: mendel.py Project: yangmqglobe/scikit-allel

def paint_transmission(parent_haplotypes, progeny_haplotypes):
    """Paint haplotypes inherited from a single diploid parent according to
    their allelic inheritance.

    Parameters
    ----------
    parent_haplotypes : array_like, int, shape (n_variants, 2)
        Both haplotypes from a single diploid parent.
    progeny_haplotypes : array_like, int, shape (n_variants, n_progeny)
        Haplotypes found in progeny of the given parent, inherited from the
        given parent. I.e., haplotypes from gametes of the given parent.

    Returns
    -------
    painting : ndarray, uint8, shape (n_variants, n_progeny)
        An array of integers coded as follows: 1 = allele inherited from
        first parental haplotype; 2 = allele inherited from second parental
        haplotype; 3 = reference allele, also carried by both parental
        haplotypes; 4 = non-reference allele, also carried by both parental
        haplotypes; 5 = non-parental allele; 6 = either or both parental
        alleles missing; 7 = missing allele; 0 = undetermined.

    Raises
    ------
    ValueError
        If `parent_haplotypes` does not contain exactly two haplotypes.

    Examples
    --------
    >>> import allel
    >>> haplotypes = allel.HaplotypeArray([
    ...     [0, 0, 0, 1, 2, -1],
    ...     [0, 1, 0, 1, 2, -1],
    ...     [1, 0, 0, 1, 2, -1],
    ...     [1, 1, 0, 1, 2, -1],
    ...     [0, 2, 0, 1, 2, -1],
    ...     [0, -1, 0, 1, 2, -1],
    ...     [-1, 1, 0, 1, 2, -1],
    ...     [-1, -1, 0, 1, 2, -1],
    ... ], dtype='i1')
    >>> painting = allel.paint_transmission(haplotypes[:, :2],
    ...                                           haplotypes[:, 2:])
    >>> painting
    array([[3, 5, 5, 7],
           [1, 2, 5, 7],
           [2, 1, 5, 7],
           [5, 4, 5, 7],
           [1, 5, 2, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7],
           [6, 6, 6, 7]], dtype=uint8)

    """

    # validate and wrap both inputs
    parents = HaplotypeArray(parent_haplotypes)
    progeny = HaplotypeArray(progeny_haplotypes)
    if parents.n_haplotypes != 2:
        raise ValueError('exactly two parental haplotypes should be provided')

    # each parental haplotype as a column, so it broadcasts across progeny
    first_parent = parents[:, 0, np.newaxis]
    second_parent = parents[:, 1, np.newaxis]

    # missingness masks; the 1-d parental mask is kept for row-wise painting
    # below, the column form is only needed for broadcasting
    progeny_missing = progeny < 0
    parent_missing = np.any(parents < 0, axis=1)
    parent_missing_col = parent_missing[:, np.newaxis]

    # view the two parental haplotypes as one diploid genotype per variant
    diplotype = GenotypeArray(parents[:, np.newaxis, :])
    hom_ref = diplotype.is_hom_ref()
    het = diplotype.is_het()
    hom_alt = diplotype.is_hom_alt()

    # calls where inheritance can be determined, and the subset of those
    # where the parent is heterozygous (segregating)
    callable_ = ~progeny_missing & ~parent_missing_col
    callable_seg = callable_ & het

    # main inheritance states
    matches_first = progeny == first_parent
    from_first = callable_seg & matches_first
    from_second = callable_seg & (progeny == second_parent)
    nonseg_ref = (callable_ & hom_ref & matches_first)
    nonseg_alt = (callable_ & hom_alt & matches_first)
    nonparental = (
        callable_ & (progeny != first_parent) & (progeny != second_parent)
    )

    # record inheritance states
    # N.B., later entries overwrite earlier ones, so the order matters
    painting = np.zeros(progeny.shape, dtype='u1')
    for mask, code in (
        (from_first, INHERIT_PARENT1),
        (from_second, INHERIT_PARENT2),
        (nonseg_ref, INHERIT_NONSEG_REF),
        (nonseg_alt, INHERIT_NONSEG_ALT),
        (nonparental, INHERIT_NONPARENTAL),
        (parent_missing, INHERIT_PARENT_MISSING),
        (progeny_missing, INHERIT_MISSING),
    ):
        painting[mask] = code

    return painting

Example #18

0

Show file

 def setup_instance(self, data):
     """Wrap ``data`` in a HaplotypeArray and return it."""
     return HaplotypeArray(data)