예제 #1
0
def locate_fixed_differences(ac1, ac2):
    """Locate variants with no shared alleles between two populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants,)

    See Also
    --------
    allel.stats.diversity.windowed_df

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
    ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
    ...                          [[0, 1], [0, 1], [1, 1], [1, 1]],
    ...                          [[0, 0], [0, 0], [1, 1], [2, 2]],
    ...                          [[0, 0], [-1, -1], [1, 1], [-1, -1]]])
    >>> ac1 = g.count_alleles(subpop=[0, 1])
    >>> ac2 = g.count_alleles(subpop=[2, 3])
    >>> loc_df = allel.locate_fixed_differences(ac1, ac2)
    >>> loc_df
    array([ True, False, False,  True,  True])

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # stack allele counts for convenience
    pac = np.dstack([ac1, ac2])

    # count numbers of alleles called in each population
    pan = np.sum(pac, axis=1)

    # count the numbers of populations with each allele
    npa = np.sum(pac > 0, axis=2)

    # locate variants with allele calls in both populations
    non_missing = np.all(pan > 0, axis=1)

    # locate variants where all alleles are only found in a single population
    no_shared_alleles = np.all(npa <= 1, axis=1)

    return non_missing & no_shared_alleles
예제 #2
0
def patterson_f3(acc, aca, acb):
    """Unbiased estimator for F3(C; A, B), the three-population test for
    admixture in population C.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).

    Returns
    -------
    T : ndarray, float, shape (n_variants,)
        Un-normalized f3 estimates per variant.
    B : ndarray, float, shape (n_variants,)
        Estimates for heterozygosity in population C.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f3 statistics, ignore the `B` return value.

    To compute the f3* statistic, which is normalized by heterozygosity in
    population C to remove numerical dependence on the allele frequency
    spectrum, compute ``np.sum(T) / np.sum(B)``.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc)

    # compute allele number and heterozygosity in test population
    sc = acc.sum(axis=1)
    hc = h_hat(acc)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]

    # compute estimator
    T = ((c - a) * (c - b)) - (hc / sc)
    B = 2 * hc

    return T, B
예제 #3
0
def patterson_f3(acc, aca, acb):
    """Unbiased estimator for F3(C; A, B), the three-population test for
    admixture in population C.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).

    Returns
    -------
    T : ndarray, float, shape (n_variants,)
        Un-normalized f3 estimates per variant.
    B : ndarray, float, shape (n_variants,)
        Estimates for heterozygosity in population C.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f3 statistics, ignore the `B` return value.

    To compute the f3* statistic, which is normalized by heterozygosity in
    population C to remove numerical dependence on the allele frequency
    spectrum, compute ``np.sum(T) / np.sum(B)``.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc)

    # compute allele number and heterozygosity in test population
    sc = acc.sum(axis=1)
    hc = h_hat(acc)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]

    # compute estimator
    T = ((c - a) * (c - b)) - (hc / sc)
    B = 2 * hc

    return T, B
예제 #4
0
def locate_private_alleles(*acs):
    """Locate alleles that are found only in a single population.

    Parameters
    ----------
    *acs : array_like, int, shape (n_variants, n_alleles)
        Allele counts arrays from each population.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants, n_alleles)
        Boolean array where elements are True if allele is private to a
        single population.

    Examples
    --------

    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
    ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
    ...                          [[0, 1], [0, 1], [1, 1], [1, 1]],
    ...                          [[0, 0], [0, 0], [1, 1], [2, 2]],
    ...                          [[0, 0], [-1, -1], [1, 1], [-1, -1]]])
    >>> ac1 = g.count_alleles(subpop=[0, 1])
    >>> ac2 = g.count_alleles(subpop=[2])
    >>> ac3 = g.count_alleles(subpop=[3])
    >>> loc_private_alleles = allel.locate_private_alleles(ac1, ac2, ac3)
    >>> loc_private_alleles
    array([[ True, False, False],
           [False, False, False],
           [ True, False, False],
           [ True,  True,  True],
           [ True,  True, False]])
    >>> loc_private_variants = np.any(loc_private_alleles, axis=1)
    >>> loc_private_variants
    array([ True, False,  True,  True,  True])

    """

    # check inputs
    acs = [asarray_ndim(ac, 2) for ac in acs]
    check_dim0_aligned(*acs)
    acs = ensure_dim1_aligned(*acs)

    # stack allele counts for convenience
    pac = np.dstack(acs)

    # count the numbers of populations with each allele
    npa = np.sum(pac > 0, axis=2)

    # locate alleles found only in a single population
    loc_pa = npa == 1

    return loc_pa
예제 #5
0
def patterson_d(aca, acb, acc, acd):
    """Unbiased estimator for D(A, B; C, D), the normalised four-population
    test for admixture between (A or B) and (C or D), also known as the
    "ABBA BABA" test.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Numerator (un-normalised f4 estimates).
    den : ndarray, float, shape (n_variants,)
        Denominator.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f4 statistics, ignore the `den` return value.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    acd = AlleleCountsArray(acd, copy=False)
    assert acd.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc, acd)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]
    d = acd.to_frequencies()[:, 1]

    # compute estimator
    num = (a - b) * (c - d)
    den = (a + b - (2 * a * b)) * (c + d - (2 * c * d))

    return num, den
예제 #6
0
def patterson_d(aca, acb, acc, acd):
    """Unbiased estimator for D(A, B; C, D), the normalised four-population
    test for admixture between (A or B) and (C or D), also known as the
    "ABBA BABA" test.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2),
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Numerator (un-normalised f4 estimates).
    den : ndarray, float, shape (n_variants,)
        Denominator.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f4 statistics, ignore the `den` return value.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    acd = AlleleCountsArray(acd, copy=False)
    assert acd.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc, acd)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]
    d = acd.to_frequencies()[:, 1]

    # compute estimator
    num = (a - b) * (c - d)
    den = (a + b - (2 * a * b)) * (c + d - (2 * c * d))

    return num, den
예제 #7
0
def patterson_f2(aca, acb):
    """Unbiased estimator for F2(A, B), the branch length between populations
    A and B.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.

    Returns
    -------
    f2 : ndarray, float, shape (n_variants,)

    Notes
    -----
    See Patterson (2012), Appendix A.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb)

    # compute allele numbers
    sa = aca.sum(axis=1)
    sb = acb.sum(axis=1)

    # compute heterozygosities
    ha = h_hat(aca)
    hb = h_hat(acb)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]

    # compute estimator
    x = ((a - b) ** 2) - (ha / sa) - (hb / sb)

    return x
예제 #8
0
def patterson_f2(aca, acb):
    """Unbiased estimator for F2(A, B), the branch length between populations
    A and B.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.

    Returns
    -------
    f2 : ndarray, float, shape (n_variants,)

    Notes
    -----
    See Patterson (2012), Appendix A.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb)

    # compute allele numbers
    sa = aca.sum(axis=1)
    sb = acb.sum(axis=1)

    # compute heterozygosities
    ha = h_hat(aca)
    hb = h_hat(acb)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]

    # compute estimator
    x = ((a - b)**2) - (ha / sa) - (hb / sb)

    return x
예제 #9
0
def compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible):
    """Compute spacing between variants for integrating haplotype
    homozygosity.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.

    Returns
    -------
    gaps : ndarray, float, shape (n_variants - 1,)

    """

    # check inputs
    if map_pos is None:
        # integrate over physical distance
        map_pos = pos
    else:
        map_pos = asarray_ndim(map_pos, 1)
        check_dim0_aligned(pos, map_pos)

    # compute physical gaps
    physical_gaps = np.diff(pos)

    # compute genetic gaps
    gaps = np.diff(map_pos).astype('f8')

    if is_accessible is not None:

        # compute accessible gaps
        is_accessible = asarray_ndim(is_accessible, 1)
        assert is_accessible.shape[0] > pos[-1], \
            'accessibility array too short'
        accessible_gaps = np.zeros_like(physical_gaps)
        for i in range(1, len(pos)):
            # N.B., expect pos is 1-based
            n_access = np.count_nonzero(is_accessible[pos[i - 1] - 1:pos[i] -
                                                      1])
            accessible_gaps[i - 1] = n_access

        # adjust using accessibility
        scaling = accessible_gaps / physical_gaps
        gaps = gaps * scaling

    elif gap_scale is not None and gap_scale > 0:

        scaling = np.ones(gaps.shape, dtype='f8')
        loc_scale = physical_gaps > gap_scale
        scaling[loc_scale] = gap_scale / physical_gaps[loc_scale]
        gaps = gaps * scaling

    if max_gap is not None and max_gap > 0:

        # deal with very large gaps
        gaps[physical_gaps > max_gap] = -1

    return gaps
예제 #10
0
def standardize_by_allele_count(score,
                                aac,
                                bins=None,
                                n_bins=None,
                                diagnostics=True):
    """Standardize `score` within allele frequency bins.

    Parameters
    ----------
    score : array_like, float
        The score to be standardized, e.g., IHS or NSL.
    aac : array_like, int
        An array of alternate allele counts.
    bins : array_like, int, optional
        Allele count bins, overrides `n_bins`.
    n_bins : int, optional
        Number of allele count bins to use.
    diagnostics : bool, optional
        If True, plot some diagnostic information about the standardization.

    Returns
    -------
    score_standardized : ndarray, float
        Standardized scores.
    bins : ndarray, int
        Allele count bins used for standardization.

    """

    from scipy.stats import binned_statistic

    # check inputs
    score = asarray_ndim(score, 1)
    aac = asarray_ndim(aac, 1)
    check_dim0_aligned(score, aac)

    # remove nans
    nonan = ~np.isnan(score)
    score_nonan = score[nonan]
    aac_nonan = aac[nonan]

    if bins is None:
        # make our own similar sized bins

        # how many bins to make?
        if n_bins is None:
            # something vaguely reasonable
            n_bins = np.max(aac) // 2

        # make bins
        bins = make_similar_sized_bins(aac_nonan, n_bins)

    else:
        # user-provided bins
        bins = asarray_ndim(bins, 1)

    mean_score, _, _ = binned_statistic(aac_nonan,
                                        score_nonan,
                                        statistic=np.mean,
                                        bins=bins)
    std_score, _, _ = binned_statistic(aac_nonan,
                                       score_nonan,
                                       statistic=np.std,
                                       bins=bins)

    if diagnostics:
        import matplotlib.pyplot as plt
        x = (bins[:-1] + bins[1:]) / 2
        plt.figure()
        plt.fill_between(x,
                         mean_score - std_score,
                         mean_score + std_score,
                         alpha=.5,
                         label='std')
        plt.plot(x, mean_score, marker='o', label='mean')
        plt.grid(axis='y')
        plt.xlabel('Alternate allele count')
        plt.ylabel('Unstandardized score')
        plt.title('Standardization diagnostics')
        plt.legend()

    # apply standardization
    score_standardized = np.empty_like(score)
    for i in range(len(bins) - 1):
        x1 = bins[i]
        x2 = bins[i + 1]
        if i == 0:
            # first bin
            loc = (aac < x2)
        elif i == len(bins) - 2:
            # last bin
            loc = (aac >= x1)
        else:
            # middle bins
            loc = (aac >= x1) & (aac < x2)
        m = mean_score[i]
        s = std_score[i]
        score_standardized[loc] = (score[loc] - m) / s

    return score_standardized, bins
예제 #11
0
def hudson_fst(ac1, ac2, fill=np.nan):
    """Calculate the numerator and denominator for Fst estimation using the
    method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Divergence between the two populations minus average
        of diversity within each population.
    den : ndarray, float, shape (n_variants,)
        Divergence between the two populations.

    Examples
    --------
    Calculate numerator and denominator for Fst estimation::

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...                          [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...                          [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...                          [[0, 0], [1, 1], [0, 1], [-1, -1]]])
        >>> subpops = [[0, 1], [2, 3]]
        >>> ac1 = g.count_alleles(subpop=subpops[0])
        >>> ac2 = g.count_alleles(subpop=subpops[1])
        >>> num, den = allel.hudson_fst(ac1, ac2)
        >>> num
        array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333])
        >>> den
        array([1.   , 0.5  , 0.   , 0.625, 0.5  ])

    Estimate Fst for each variant individually::

        >>> fst = num / den
        >>> fst
        array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

    Estimate Fst averaging over variants::

        >>> fst = np.sum(num) / np.sum(den)
        >>> fst
        0.1428571428571429

    """  # flake8: noqa

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # calculate these once only
    an1 = np.sum(ac1, axis=1)
    an2 = np.sum(ac2, axis=1)

    # calculate average diversity (a.k.a. heterozygosity) within each
    # population
    within = (mean_pairwise_difference(ac1, an1, fill=fill) +
              mean_pairwise_difference(ac2, an2, fill=fill)) / 2

    # calculate divergence (a.k.a. heterozygosity) between each population
    between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill)

    # define numerator and denominator for Fst calculations
    num = between - within
    den = between

    return num, den
예제 #12
0
def tabulate_state_blocks(x, states, pos=None):
    """Construct a dataframe where each row provides information about continuous state blocks.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2})
    >>> df
       state  support  start_lidx     ...       size_min  size_max  is_marginal
    0      1        4          -1     ...              5        -1         True
    1      2        3           4     ...              4         4        False
    2      1        2           8     ...              2        -1         True
    [3 rows x 9 columns]
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos)
    >>> df
       state  support  start_lidx     ...      stop_rpos  length_min  length_max
    0      1        4          -1     ...             14           9          -1
    1      2        3           4     ...             30          15          19
    2      1        2           8     ...             -1           2          -1
    [3 rows x 15 columns]

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # find state transitions
    switch_points, transitions, observations = state_transitions(x, states)

    # setup some helpers
    t = transitions[1:, 0]
    o = observations[1:]
    s1 = switch_points[:-1]
    s2 = switch_points[1:]
    is_marginal = (s1[:, 0] < 0) | (s2[:, 1] < 0)
    size_min = s2[:, 0] - s1[:, 1] + 1
    size_max = s2[:, 1] - s1[:, 0] - 1
    size_max[is_marginal] = -1

    # start to build a dataframe
    items = [
        ('state', t),
        ('support', o),
        ('start_lidx', s1[:, 0]),
        ('start_ridx', s1[:, 1]),
        ('stop_lidx', s2[:, 0]),
        ('stop_ridx', s2[:, 1]),
        ('size_min', size_min),
        ('size_max', size_max),
        ('is_marginal', is_marginal)
    ]

    # deal with optional positions
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # obtain switch positions
        switch_positions = np.take(pos, switch_points)
        # deal with boundary transitions
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        # setup helpers
        p1 = switch_positions[:-1]
        p2 = switch_positions[1:]
        length_min = p2[:, 0] - p1[:, 1] + 1
        length_max = p2[:, 1] - p1[:, 0] - 1
        length_max[is_marginal] = -1

        items += [
            ('start_lpos', p1[:, 0]),
            ('start_rpos', p1[:, 1]),
            ('stop_lpos', p2[:, 0]),
            ('stop_rpos', p2[:, 1]),
            ('length_min', length_min),
            ('length_max', length_max),
        ]

    import pandas
    return pandas.DataFrame.from_dict(OrderedDict(items))
예제 #13
0
def tabulate_state_transitions(x, states, pos=None):
    """Construct a dataframe where each row provides information about a state transition.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Notes
    -----
    The resulting dataframe includes one row at the start representing the first state
    observation and one row at the end representing the last state observation.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2})
    >>> df
       lstate  rstate  lidx  ridx
    0      -1       1    -1     0
    1       1       2     4     5
    2       2       1     8     9
    3       1      -1    10    -1
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos)
    >>> df
       lstate  rstate  lidx  ridx  lpos  rpos
    0      -1       1    -1     0    -1     2
    1       1       2     4     5    10    14
    2       2       1     8     9    28    30
    3       1      -1    10    -1    31    -1

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # find state transitions
    switch_points, transitions, _ = state_transitions(x, states)

    # start to build a dataframe
    items = [('lstate', transitions[:, 0]),
             ('rstate', transitions[:, 1]),
             ('lidx', switch_points[:, 0]),
             ('ridx', switch_points[:, 1])]

    # deal with optional positions
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # find switch positions
        switch_positions = np.take(pos, switch_points)
        # deal with boundary transitions
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        # add columns into dataframe
        items += [('lpos', switch_positions[:, 0]),
                  ('rpos', switch_positions[:, 1])]

    import pandas
    return pandas.DataFrame.from_dict(OrderedDict(items))
예제 #14
0
def xpehh(
    h1,
    h2,
    pos,
    map_pos=None,
    min_ehh=0.05,
    include_edges=False,
    gap_scale=20000,
    max_gap=200000,
    is_accessible=None,
    use_threads=True,
):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    from allel.opt.stats import ihh_scan_int8

    # check inputs
    h1 = HaplotypeArray(np.asarray(h1, dtype="i1"))
    h2 = HaplotypeArray(np.asarray(h2, dtype="i1"))
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(ihh_scan_int8, (h1, gaps), kwargs)
        res2_fwd = pool.apply_async(ihh_scan_int8, (h2, gaps), kwargs)

        # scan backward
        res1_rev = pool.apply_async(ihh_scan_int8, (h1[::-1], gaps[::-1]), kwargs)
        res2_rev = pool.apply_async(ihh_scan_int8, (h2[::-1], gaps[::-1]), kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh1_fwd = res1_fwd.get()
        ihh2_fwd = res2_fwd.get()
        ihh1_rev = res1_rev.get()
        ihh2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan_int8(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan_int8(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan_int8(h1[::-1], gaps[::-1], **kwargs)
        ihh2_rev = ihh_scan_int8(h2[::-1], gaps[::-1], **kwargs)

    # handle reverse scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score
예제 #15
0
def create_allele_mapping(ref, alt, alleles, dtype='i1'):
    """Create an array mapping variant alleles into a different allele index
    system.

    Parameters
    ----------
    ref : array_like, S1, shape (n_variants,)
        Reference alleles.
    alt : array_like, S1, shape (n_variants, n_alt_alleles)
        Alternate alleles.
    alleles : array_like, S1, shape (n_variants, n_alleles)
        Alleles defining the new allele indexing.
    dtype : dtype, optional
        Output dtype.

    Returns
    -------
    mapping : ndarray, int8, shape (n_variants, n_alt_alleles + 1)

    Examples
    --------
    Example with biallelic variants::

        >>> import allel
        >>> ref = [b'A', b'C', b'T', b'G']
        >>> alt = [b'T', b'G', b'C', b'A']
        >>> alleles = [[b'A', b'T'],  # no transformation
        ...            [b'G', b'C'],  # swap
        ...            [b'T', b'A'],  # 1 missing
        ...            [b'A', b'C']]  # 1 missing
        >>> mapping = allel.create_allele_mapping(ref, alt, alleles)
        >>> mapping
        array([[ 0,  1],
               [ 1,  0],
               [ 0, -1],
               [-1,  0]], dtype=int8)

    Example with multiallelic variants::

        >>> ref = [b'A', b'C', b'T']
        >>> alt = [[b'T', b'G'],
        ...        [b'A', b'T'],
        ...        [b'G', b'.']]
        >>> alleles = [[b'A', b'T'],
        ...            [b'C', b'T'],
        ...            [b'G', b'A']]
        >>> mapping = create_allele_mapping(ref, alt, alleles)
        >>> mapping
        array([[ 0,  1, -1],
               [ 0, -1,  1],
               [-1,  0, -1]], dtype=int8)

    See Also
    --------
    GenotypeArray.map_alleles, HaplotypeArray.map_alleles, AlleleCountsArray.map_alleles

    """

    ref = asarray_ndim(ref, 1)
    alt = asarray_ndim(alt, 1, 2)
    alleles = asarray_ndim(alleles, 1, 2)
    check_dim0_aligned(ref, alt, alleles)

    # reshape for convenience
    ref = ref[:, None]
    if alt.ndim == 1:
        alt = alt[:, None]
    if alleles.ndim == 1:
        alleles = alleles[:, None]
    source_alleles = np.append(ref, alt, axis=1)

    # setup output array
    out = np.empty(source_alleles.shape, dtype=dtype)
    out.fill(-1)

    # find matches
    for ai in range(source_alleles.shape[1]):
        match = source_alleles[:, ai, None] == alleles
        match_i, match_j = match.nonzero()
        out[match_i, ai] = match_j

    return out
예제 #16
0
def xpnsl(h1, h2, use_threads=True):
    """Cross-population version of the NSL statistic.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPNSL scores.

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    check_dim0_aligned(h1, h2)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(nsl_scan, args=(h1, ))
        res2_fwd = pool.apply_async(nsl_scan, args=(h2, ))

        # scan backward
        res1_rev = pool.apply_async(nsl_scan, args=(h1[::-1], ))
        res2_rev = pool.apply_async(nsl_scan, args=(h2[::-1], ))

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        nsl1_fwd = res1_fwd.get()
        nsl2_fwd = res2_fwd.get()
        nsl1_rev = res1_rev.get()
        nsl2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        nsl1_fwd = nsl_scan(h1)
        nsl2_fwd = nsl_scan(h2)

        # scan backward
        nsl1_rev = nsl_scan(h1[::-1])
        nsl2_rev = nsl_scan(h2[::-1])

    # handle reverse scans
    nsl1_rev = nsl1_rev[::-1]
    nsl2_rev = nsl2_rev[::-1]

    # compute unstandardized score
    nsl1 = nsl1_fwd + nsl1_rev
    nsl2 = nsl2_fwd + nsl2_rev
    score = np.log(nsl1 / nsl2)

    return score
예제 #17
0
def hudson_fst(ac1, ac2, fill=np.nan):
    """Calculate the numerator and denominator for Fst estimation using the
    method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Divergence between the two populations minus average
        of diversity within each population.
    den : ndarray, float, shape (n_variants,)
        Divergence between the two populations.

    Examples
    --------
    Calculate numerator and denominator for Fst estimation::

        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...                          [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...                          [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...                          [[0, 0], [1, 1], [0, 1], [-1, -1]]])
        >>> subpops = [[0, 1], [2, 3]]
        >>> ac1 = g.count_alleles(subpop=subpops[0])
        >>> ac2 = g.count_alleles(subpop=subpops[1])
        >>> num, den = allel.stats.hudson_fst(ac1, ac2)
        >>> num
        array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333])
        >>> den
        array([ 1.   ,  0.5  ,  0.   ,  0.625,  0.5  ])

    Estimate Fst for each variant individually::

        >>> fst = num / den
        >>> fst
        array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

    Estimate Fst averaging over variants::

        >>> fst = np.sum(num) / np.sum(den)
        >>> fst
        0.1428571428571429

    """  # flake8: noqa

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # calculate these once only
    an1 = np.sum(ac1, axis=1)
    an2 = np.sum(ac2, axis=1)

    # calculate average diversity (a.k.a. heterozygosity) within each
    # population
    within = (mean_pairwise_difference(ac1, an1, fill=fill) +
              mean_pairwise_difference(ac2, an2, fill=fill)) / 2

    # calculate divergence (a.k.a. heterozygosity) between each population
    between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill)

    # define numerator and denominator for Fst calculations
    num = between - within
    den = between

    return num, den
예제 #18
0
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from within a single population.

    Parameters
    ----------

    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.mean_pairwise_difference(ac)
    array([0.        , 0.5       , 0.66666667, 0.5       , 0.        ,
           0.83333333, 0.83333333, 1.        ])

    See Also
    --------

    sequence_diversity, windowed_diversity

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes within a single population, generalising to any number
    # of alleles.

    # check inputs
    ac = asarray_ndim(ac, 2)

    # total number of haplotypes
    if an is None:
        an = np.sum(ac, axis=1)
    else:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)

    # total number of pairwise comparisons for each variant:
    # (an choose 2)
    n_pairs = an * (an - 1) / 2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac choose 2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac * (ac - 1) / 2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd
예제 #19
0
def mean_pairwise_difference_between(ac1,
                                     ac2,
                                     an1=None,
                                     an2=None,
                                     fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from two different populations.

    Parameters
    ----------

    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac1 = h.count_alleles(subpop=[0, 1])
    >>> ac2 = h.count_alleles(subpop=[2, 3])
    >>> allel.mean_pairwise_difference_between(ac1, ac2)
    array([0.  , 0.5 , 1.  , 0.5 , 0.  , 1.  , 0.75,  nan])

    See Also
    --------

    sequence_divergence, windowed_divergence

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes from two different populations, generalising to any
    # number of alleles.

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # total number of haplotypes sampled from each population
    if an1 is None:
        an1 = np.sum(ac1, axis=1)
    else:
        an1 = asarray_ndim(an1, 1)
        check_dim0_aligned(ac1, an1)
    if an2 is None:
        an2 = np.sum(ac2, axis=1)
    else:
        an2 = asarray_ndim(an2, 1)
        check_dim0_aligned(ac2, an2)

    # total number of pairwise comparisons for each variant
    n_pairs = an1 * an2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac1 * ac2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac1 * ac2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd
예제 #20
0
def standardize_by_allele_count(score, aac, bins=None, n_bins=None, diagnostics=True):
    """Standardize `score` within allele frequency bins.

    Parameters
    ----------
    score : array_like, float
        The score to be standardized, e.g., IHS or NSL.
    aac : array_like, int
        An array of alternate allele counts.
    bins : array_like, int, optional
        Allele count bins, overrides `n_bins`.
    n_bins : int, optional
        Number of allele count bins to use.
    diagnostics : bool, optional
        If True, plot some diagnostic information about the standardization.

    Returns
    -------
    score_standardized : ndarray, float
        Standardized scores.
    bins : ndarray, int
        Allele count bins used for standardization.

    """

    from scipy.stats import binned_statistic

    # check inputs
    score = asarray_ndim(score, 1)
    aac = asarray_ndim(aac, 1)
    check_dim0_aligned(score, aac)

    # remove nans
    nonan = ~np.isnan(score)
    score_nonan = score[nonan]
    aac_nonan = aac[nonan]

    if bins is None:
        # make our own similar sized bins

        # how many bins to make?
        if n_bins is None:
            # something vaguely reasonable
            n_bins = np.max(aac) // 2

        # make bins
        bins = make_similar_sized_bins(aac_nonan, n_bins)

    else:
        # user-provided bins
        bins = asarray_ndim(bins, 1)

    mean_score, _, _ = binned_statistic(aac_nonan, score_nonan, statistic=np.mean, bins=bins)
    std_score, _, _ = binned_statistic(aac_nonan, score_nonan, statistic=np.std, bins=bins)

    if diagnostics:
        import matplotlib.pyplot as plt

        x = (bins[:-1] + bins[1:]) / 2
        plt.figure()
        plt.fill_between(x, mean_score - std_score, mean_score + std_score, alpha=0.5, label="std")
        plt.plot(x, mean_score, marker="o", label="mean")
        plt.grid(axis="y")
        plt.xlabel("Alternate allele count")
        plt.ylabel("Unstandardized score")
        plt.title("Standardization diagnostics")
        plt.legend()

    # apply standardization
    score_standardized = np.empty_like(score)
    for i in range(len(bins) - 1):
        x1 = bins[i]
        x2 = bins[i + 1]
        if i == 0:
            # first bin
            loc = aac < x2
        elif i == len(bins) - 2:
            # last bin
            loc = aac >= x1
        else:
            # middle bins
            loc = (aac >= x1) & (aac < x2)
        m = mean_score[i]
        s = std_score[i]
        score_standardized[loc] = (score[loc] - m) / s

    return score_standardized, bins
예제 #21
0
def compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible):
    """Compute spacing between variants for integrating haplotype
    homozygosity.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.

    Returns
    -------
    gaps : ndarray, float, shape (n_variants - 1,)

    """

    # check inputs
    if map_pos is None:
        # integrate over physical distance
        map_pos = pos
    else:
        map_pos = asarray_ndim(map_pos, 1)
        check_dim0_aligned(pos, map_pos)

    # compute physical gaps
    physical_gaps = np.diff(pos)

    # compute genetic gaps
    gaps = np.diff(map_pos).astype("f8")

    if is_accessible is not None:

        # compute accessible gaps
        is_accessible = asarray_ndim(is_accessible, 1)
        assert is_accessible.shape[0] > pos[-1], "accessibility array too short"
        accessible_gaps = np.zeros_like(physical_gaps)
        for i in range(1, len(pos)):
            # N.B., expect pos is 1-based
            n_access = np.count_nonzero(is_accessible[pos[i - 1] - 1 : pos[i] - 1])
            accessible_gaps[i - 1] = n_access

        # adjust using accessibility
        scaling = accessible_gaps / physical_gaps
        gaps = gaps * scaling

    elif gap_scale is not None and gap_scale > 0:

        scaling = np.ones(gaps.shape, dtype="f8")
        loc_scale = physical_gaps > gap_scale
        scaling[loc_scale] = gap_scale / physical_gaps[loc_scale]
        gaps = gaps * scaling

    if max_gap is not None and max_gap > 0:

        # deal with very large gaps
        gaps[physical_gaps > max_gap] = -1

    return gaps
예제 #22
0
def ihs(h,
        pos,
        map_pos=None,
        min_ehh=0.05,
        min_maf=0.05,
        include_edges=False,
        gap_scale=20000,
        max_gap=200000,
        is_accessible=None,
        use_threads=True):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozogysity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----

    This function will calculate IHS for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes IHS comparing the reference and alternate alleles.
    These can be polarised by switching the sign for any variant where the
    reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # check inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh,
                  min_maf=min_maf,
                  include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # run with threads

        # create pool
        pool = ThreadPool(2)

        # scan forward
        result_fwd = pool.apply_async(ihh01_scan, (h, gaps), kwargs)

        # scan backward
        result_rev = pool.apply_async(ihh01_scan, (h[::-1], gaps[::-1]),
                                      kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh0_fwd, ihh1_fwd = result_fwd.get()
        ihh0_rev, ihh1_rev = result_rev.get()

        # cleanup
        pool.terminate()

    else:
        # run without threads

        # scan forward
        ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **kwargs)

        # scan backward
        ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **kwargs)

    # handle reverse scan
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # compute unstandardized score
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    score = np.log(ihh1 / ihh0)

    return score
예제 #23
0
def xpehh(h1,
          h2,
          pos,
          map_pos=None,
          min_ehh=0.05,
          include_edges=False,
          gap_scale=20000,
          max_gap=200000,
          is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(ihh_scan, (h1, gaps), kwargs)
        res2_fwd = pool.apply_async(ihh_scan, (h2, gaps), kwargs)

        # scan backward
        res1_rev = pool.apply_async(ihh_scan, (h1[::-1], gaps[::-1]), kwargs)
        res2_rev = pool.apply_async(ihh_scan, (h2[::-1], gaps[::-1]), kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh1_fwd = res1_fwd.get()
        ihh2_fwd = res2_fwd.get()
        ihh1_rev = res1_rev.get()
        ihh2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan(h1[::-1], gaps[::-1], **kwargs)
        ihh2_rev = ihh_scan(h2[::-1], gaps[::-1], **kwargs)

    # handle reverse scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score
예제 #24
0
def pbs(ac1,
        ac2,
        ac3,
        window_size,
        window_start=0,
        window_stop=None,
        window_step=None,
        normed=True):
    """Compute the population branching statistic (PBS) which performs a comparison
    of allele frequencies between three populations to detect genome regions that are
    unusually differentiated in one population relative to the other two populations.

    Parameters
    ----------
    ac1 : array_like, int
        Allele counts from the first population.
    ac2 : array_like, int
        Allele counts from the second population.
    ac3 : array_like, int
        Allele counts from the third population.
    window_size : int
        The window size (number of variants) within which to compute PBS values.
    window_start : int, optional
        The variant index at which to start windowed calculations.
    window_stop : int, optional
        The variant index at which to stop windowed calculations.
    window_step : int, optional
        The number of variants between start positions of windows. If not given, defaults
        to the window size, i.e., non-overlapping windows.
    normed : bool, optional
        If True (default), use the normalised version of PBS, also known as PBSn1 [2]_.
        Otherwise, use the PBS statistic as originally defined in [1]_.

    Returns
    -------
    pbs : ndarray, float
        Windowed PBS values.

    Notes
    -----
    The F\ :sub:`ST` calculations use Hudson's estimator.

    References
    ----------
    .. [1] Yi et al., "Sequencing of Fifty Human Exomes Reveals Adaptation to High
       Altitude", Science, 329(5987): 75–78, 2 July 2010.
    .. [2] Malaspinas et al., "A genomic history of Aboriginal Australia", Nature. volume
       538, pages 207–214, 13 October 2016.

    """

    # normalise and check inputs
    ac1 = AlleleCountsArray(ac1)
    ac2 = AlleleCountsArray(ac2)
    ac3 = AlleleCountsArray(ac3)
    check_dim0_aligned(ac1, ac2, ac3)

    # compute fst
    fst12 = moving_hudson_fst(ac1,
                              ac2,
                              size=window_size,
                              start=window_start,
                              stop=window_stop,
                              step=window_step)
    fst13 = moving_hudson_fst(ac1,
                              ac3,
                              size=window_size,
                              start=window_start,
                              stop=window_stop,
                              step=window_step)
    fst23 = moving_hudson_fst(ac2,
                              ac3,
                              size=window_size,
                              start=window_start,
                              stop=window_stop,
                              step=window_step)

    # clip fst values to avoid infinite if fst is 1
    for x in fst12, fst13, fst23:
        np.clip(x, a_min=0, a_max=0.99999, out=x)

    # compute fst transform
    t12 = -np.log(1 - fst12)
    t13 = -np.log(1 - fst13)
    t23 = -np.log(1 - fst23)

    # compute pbs
    ret = (t12 + t13 - t23) / 2

    if normed:
        # compute pbs normalising constant
        norm = 1 + (t12 + t13 + t23) / 2
        ret = ret / norm

    return ret
예제 #25
0
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from within a single population.

    Parameters
    ----------

    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.stats.mean_pairwise_difference(ac)
    array([ 0.        ,  0.5       ,  0.66666667,  0.5       ,  0.        ,
            0.83333333,  0.83333333,  1.        ])

    See Also
    --------

    sequence_diversity, windowed_diversity

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes within a single population, generalising to any number
    # of alleles.

    # check inputs
    ac = asarray_ndim(ac, 2)

    # total number of haplotypes
    if an is None:
        an = np.sum(ac, axis=1)
    else:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)

    # total number of pairwise comparisons for each variant:
    # (an choose 2)
    n_pairs = an * (an - 1) / 2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac choose 2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac * (ac - 1) / 2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd
예제 #26
0
def roh_mhmm(gv,
             pos,
             phet_roh=0.001,
             phet_nonroh=(0.0025, 0.01),
             transition=1e-6,
             min_roh=0,
             is_accessible=None,
             contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Multinomial HMM model. There are 3
    observable states at each position in a chromosome/contig: 0 = Hom, 1 = Het,
    2 = inaccessible (i.e., unobserved).

    The model is provided with a probability of observing a het in a ROH (`phet_roh`) and one
    or more probabilities of observing a het in a non-ROH, as this probability may not be
    constant across the genome (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos: array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh: float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh: tuple of floats, optional
        One or more probabilites of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity within
        the population, but also on mutation rate and genotype error rate.
    transition: float, optional
        Probability of moving between states.
    min_roh: integer, optional
        Minimum size (bp) to condsider as a ROH. Will depend on contig size
        and recombination rate.
    is_accessible: array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether accessible
        or not.
    contig_size: int, optional
        If is_accessible not known/not provided, allows specification of
        total length of contig.

    Returns
    -------
    df_roh: DataFrame
        Data frame where each row describes a run of homozygosity. Columns are 'start',
        'stop', 'length' and 'is_marginal'. Start and stop are 1-based, stop-inclusive.
    froh: float
        Proportion of genome in a ROH.

    Notes
    -----
    This function requires `hmmlearn <http://hmmlearn.readthedocs.io/en/latest/>`_ to be
    installed.

    This function currently requires around 4GB memory for a contig size of ~50Mbp.

    """

    from hmmlearn import hmm

    # setup inputs
    if isinstance(phet_nonroh, float):
        phet_nonroh = phet_nonroh,
    gv = GenotypeVector(gv)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(gv, pos)
    is_accessible = asarray_ndim(is_accessible, 1, dtype=bool)

    # heterozygote probabilities
    het_px = np.concatenate([(phet_roh, ), phet_nonroh])

    # start probabilities (all equal)
    start_prob = np.repeat(1 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    # probability of inaccessible
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessibile argument is not provided, you must provide contig_size"
            )
        p_accessible = 1.0
    else:
        p_accessible = is_accessible.mean()
        contig_size = is_accessible.size

    emission_mx = _mhmm_derive_emission_matrix(het_px, p_accessible)

    # initialize HMM
    roh_hmm = hmm.MultinomialHMM(n_components=het_px.size)
    roh_hmm.n_symbols_ = 3
    roh_hmm.startprob_ = start_prob
    roh_hmm.transmat_ = transition_mx
    roh_hmm.emissionprob_ = emission_mx

    # locate heterozygous calls
    is_het = gv.is_het()

    # predict ROH state
    pred, obs = _mhmm_predict_roh_state(roh_hmm, is_het, pos, is_accessible,
                                        contig_size)

    # find ROH windows
    df_blocks = tabulate_state_blocks(pred, states=list(range(len(het_px))))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)
    # adapt the dataframe for ROH
    for col in 'state', 'support', 'start_lidx', 'stop_ridx', 'size_max':
        del df_roh[col]
    df_roh.rename(columns={
        'start_ridx': 'start',
        'stop_lidx': 'stop',
        'size_min': 'length'
    },
                  inplace=True)
    # make coordinates 1-based
    df_roh['start'] = df_roh['start'] + 1
    df_roh['stop'] = df_roh['stop'] + 1

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH
    froh = df_roh.length.sum() / contig_size

    return df_roh, froh
예제 #27
0
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from two different populations.

    Parameters
    ----------

    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------

    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----

    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------

    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac1 = h.count_alleles(subpop=[0, 1])
    >>> ac2 = h.count_alleles(subpop=[2, 3])
    >>> allel.stats.mean_pairwise_difference_between(ac1, ac2)
    array([ 0.  ,  0.5 ,  1.  ,  0.5 ,  0.  ,  1.  ,  0.75,   nan])

    See Also
    --------

    sequence_divergence, windowed_divergence

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes from two different populations, generalising to any
    # number of alleles.

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # total number of haplotypes sampled from each population
    if an1 is None:
        an1 = np.sum(ac1, axis=1)
    else:
        an1 = asarray_ndim(an1, 1)
        check_dim0_aligned(ac1, an1)
    if an2 is None:
        an2 = np.sum(ac2, axis=1)
    else:
        an2 = asarray_ndim(an2, 1)
        check_dim0_aligned(ac2, an2)

    # total number of pairwise comparisons for each variant
    n_pairs = an1 * an2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac1 * ac2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac1 * ac2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd
예제 #28
0
def ihs(
    h,
    pos,
    map_pos=None,
    min_ehh=0.05,
    min_maf=0.05,
    include_edges=False,
    gap_scale=20000,
    max_gap=200000,
    is_accessible=None,
    use_threads=True,
):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozogysity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----

    This function will calculate IHS for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes IHS comparing the reference and alternate alleles.
    These can be polarised by switching the sign for any variant where the
    reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    from allel.opt.stats import ihh01_scan_int8

    # check inputs
    h = HaplotypeArray(np.asarray(h, dtype="i1"))
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, min_maf=min_maf, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # run with threads

        # create pool
        pool = ThreadPool(2)

        # scan forward
        result_fwd = pool.apply_async(ihh01_scan_int8, (h, gaps), kwargs)

        # scan backward
        result_rev = pool.apply_async(ihh01_scan_int8, (h[::-1], gaps[::-1]), kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh0_fwd, ihh1_fwd = result_fwd.get()
        ihh0_rev, ihh1_rev = result_rev.get()

        # cleanup
        pool.terminate()

    else:
        # run without threads

        # scan forward
        ihh0_fwd, ihh1_fwd = ihh01_scan_int8(h, gaps, **kwargs)

        # scan backward
        ihh0_rev, ihh1_rev = ihh01_scan_int8(h[::-1], gaps[::-1], **kwargs)

    # handle reverse scan
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # compute unstandardized score
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    score = np.log(ihh1 / ihh0)

    return score