예제 #1
0
 def __init__(self, data, chunks=None, name=None, lock=False):
     super(AlleleCountsDaskArray, self).__init__(data,
                                                 chunks=chunks,
                                                 name=name,
                                                 lock=lock)
     check_ndim(self.values, 2)
     check_integer_dtype(self.values)
예제 #2
0
def sfs(dac):
    """Compute the site frequency spectrum given derived allele counts at
    a set of biallelic variants.

    Parameters
    ----------
    dac : array_like, int, shape (n_variants,)
        Array of derived allele counts.

    Returns
    -------
    sfs : ndarray, int, shape (n_chromosomes,)
        Array where the kth element is the number of variant sites with k
        derived alleles.

    """

    # check input
    dac = asarray_ndim(dac, 1)
    check_integer_dtype(dac)

    # need platform integer for bincount
    dac = dac.astype(int, copy=False)

    # compute site frequency spectrum
    s = np.bincount(dac)

    return s
예제 #3
0
def sfs_folded(ac):
    """Compute the folded site frequency spectrum given reference and
    alternate allele counts at a set of biallelic variants.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, 2)
        Allele counts array.

    Returns
    -------
    sfs_folded : ndarray, int, shape (n_chromosomes//2,)
        Array where the kth element is the number of variant sites with a
        minor allele count of k.

    """

    # check input
    ac = asarray_ndim(ac, 2)
    assert ac.shape[1] == 2, 'only biallelic variants are supported'
    check_integer_dtype(ac)

    # compute minor allele counts
    mac = np.amin(ac, axis=1)

    # need platform integer for bincount
    mac = mac.astype(int, copy=False)

    # compute folded site frequency spectrum
    s = np.bincount(mac)

    return s
예제 #4
0
def joint_sfs(dac1, dac2):
    """Compute the joint site frequency spectrum between two populations.

    Parameters
    ----------
    dac1 : array_like, int, shape (n_variants,)
        Derived allele counts for the first population.
    dac2 : array_like, int, shape (n_variants,)
        Derived allele counts for the second population.

    Returns
    -------
    joint_sfs : ndarray, int, shape (m_chromosomes, n_chromosomes)
        Array where the (i, j)th element is the number of variant sites with i
        derived alleles in the first population and j derived alleles in the
        second population.

    """

    # check inputs
    dac1 = asarray_ndim(dac1, 1)
    dac2 = asarray_ndim(dac2, 1)
    check_integer_dtype(dac1)
    check_integer_dtype(dac2)

    # compute site frequency spectrum
    n = int(np.max(dac1) + 1)
    m = int(np.max(dac2) + 1)

    # need platform integer for bincount
    tmp = (dac1 * m + dac2).astype(int, copy=False)

    s = np.bincount(tmp)
    s.resize(n, m)
    return s
예제 #5
0
 def __init__(self, data, chunks=None, name=None, lock=False):
     super(GenotypesDask, self).__init__(data,
                                         chunks=chunks,
                                         name=name,
                                         lock=lock)
     check_integer_dtype(self.values)
     self._mask = None
     self._is_phased = None
예제 #6
0
def _check_dac_n(dac, n):
    dac = asarray_ndim(dac, 1)
    check_integer_dtype(dac)
    mx = np.max(dac)
    if n is None:
        n = mx
    elif n < mx:
        raise ValueError(
            'number of chromosomes too small; expected {}, found {}'.format(
                n, mx))
    return dac, int(n)
예제 #7
0
def _check_ac_n(ac, n):
    ac = asarray_ndim(ac, 2)
    if ac.shape[1] != 2:
        raise ValueError('only biallelic variants are supported')
    check_integer_dtype(ac)
    mx = np.max(np.sum(ac, axis=1))
    if n is None:
        n = mx
    elif n < mx:
        raise ValueError(
            'number of chromosomes too small; expected {}, found {}'.format(
                n, mx))
    return ac, int(n)
예제 #8
0
def joint_sfs_folded(ac1, ac2):
    """Compute the joint folded site frequency spectrum between two
    populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, 2)
        Allele counts for the first population.
    ac2 : array_like, int, shape (n_variants, 2)
        Allele counts for the second population.

    Returns
    -------
    joint_sfs_folded : ndarray, int, shape (m_chromosomes//2, n_chromosomes//2)
        Array where the (i, j)th element is the number of variant sites with a
        minor allele count of i in the first population and j in the second
        population.

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    assert ac1.shape[1] == ac2.shape[1] == 2, \
        'only biallelic variants are supported'
    check_integer_dtype(ac1)
    check_integer_dtype(ac2)

    # compute minor allele counts
    mac1 = np.amin(ac1, axis=1)
    mac2 = np.amin(ac2, axis=1)

    # compute site frequency spectrum
    m = int(np.max(mac1) + 1)
    n = int(np.max(mac2) + 1)
    tmp = (mac1 * n + mac2).astype(int, copy=False)
    s = np.bincount(tmp)
    s.resize(m, n)
    return s
예제 #9
0
def xpehh(h1,
          h2,
          pos,
          map_pos=None,
          min_ehh=0.05,
          include_edges=False,
          gap_scale=20000,
          max_gap=200000,
          is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(ihh_scan, (h1, gaps), kwargs)
        res2_fwd = pool.apply_async(ihh_scan, (h2, gaps), kwargs)

        # scan backward
        res1_rev = pool.apply_async(ihh_scan, (h1[::-1], gaps[::-1]), kwargs)
        res2_rev = pool.apply_async(ihh_scan, (h2[::-1], gaps[::-1]), kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh1_fwd = res1_fwd.get()
        ihh2_fwd = res2_fwd.get()
        ihh1_rev = res1_rev.get()
        ihh2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan(h1[::-1], gaps[::-1], **kwargs)
        ihh2_rev = ihh_scan(h2[::-1], gaps[::-1], **kwargs)

    # handle reverse scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score
예제 #10
0
def ihs(h,
        pos,
        map_pos=None,
        min_ehh=0.05,
        min_maf=0.05,
        include_edges=False,
        gap_scale=20000,
        max_gap=200000,
        is_accessible=None,
        use_threads=True):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozogysity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----

    This function will calculate IHS for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes IHS comparing the reference and alternate alleles.
    These can be polarised by switching the sign for any variant where the
    reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # check inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh,
                  min_maf=min_maf,
                  include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # run with threads

        # create pool
        pool = ThreadPool(2)

        # scan forward
        result_fwd = pool.apply_async(ihh01_scan, (h, gaps), kwargs)

        # scan backward
        result_rev = pool.apply_async(ihh01_scan, (h[::-1], gaps[::-1]),
                                      kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh0_fwd, ihh1_fwd = result_fwd.get()
        ihh0_rev, ihh1_rev = result_rev.get()

        # cleanup
        pool.terminate()

    else:
        # run without threads

        # scan forward
        ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **kwargs)

        # scan backward
        ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **kwargs)

    # handle reverse scan
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # compute unstandardized score
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    score = np.log(ihh1 / ihh0)

    return score
예제 #11
0
 def __init__(self, data):
     super(GenotypeAlleleCountsChunkedArray, self).__init__(data)
     check_ndim(self.values, 3)
     check_integer_dtype(self.values)
예제 #12
0
 def __init__(self, data):
     super(HaplotypeChunkedArray, self).__init__(data)
     check_ndim(self.values, 2)
     check_integer_dtype(self.values)
예제 #13
0
 def __init__(self, data):
     super(GenotypeChunkedArray, self).__init__(data)
     check_ndim(self.values, 3)
     check_integer_dtype(self.values)
     self._mask = None
     self._is_phased = None
예제 #14
0
def tabulate_state_blocks(x, states, pos=None):
    """Construct a dataframe where each row provides information about continuous state blocks.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2})
    >>> df
       state  support  start_lidx     ...       size_min  size_max  is_marginal
    0      1        4          -1     ...              5        -1         True
    1      2        3           4     ...              4         4        False
    2      1        2           8     ...              2        -1         True
    [3 rows x 9 columns]
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos)
    >>> df
       state  support  start_lidx     ...      stop_rpos  length_min  length_max
    0      1        4          -1     ...             14           9          -1
    1      2        3           4     ...             30          15          19
    2      1        2           8     ...             -1           2          -1
    [3 rows x 15 columns]

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # find state transitions
    switch_points, transitions, observations = state_transitions(x, states)

    # setup some helpers
    t = transitions[1:, 0]
    o = observations[1:]
    s1 = switch_points[:-1]
    s2 = switch_points[1:]
    is_marginal = (s1[:, 0] < 0) | (s2[:, 1] < 0)
    size_min = s2[:, 0] - s1[:, 1] + 1
    size_max = s2[:, 1] - s1[:, 0] - 1
    size_max[is_marginal] = -1

    # start to build a dataframe
    items = [
        ('state', t),
        ('support', o),
        ('start_lidx', s1[:, 0]),
        ('start_ridx', s1[:, 1]),
        ('stop_lidx', s2[:, 0]),
        ('stop_ridx', s2[:, 1]),
        ('size_min', size_min),
        ('size_max', size_max),
        ('is_marginal', is_marginal)
    ]

    # deal with optional positions
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # obtain switch positions
        switch_positions = np.take(pos, switch_points)
        # deal with boundary transitions
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        # setup helpers
        p1 = switch_positions[:-1]
        p2 = switch_positions[1:]
        length_min = p2[:, 0] - p1[:, 1] + 1
        length_max = p2[:, 1] - p1[:, 0] - 1
        length_max[is_marginal] = -1

        items += [
            ('start_lpos', p1[:, 0]),
            ('start_rpos', p1[:, 1]),
            ('stop_lpos', p2[:, 0]),
            ('stop_rpos', p2[:, 1]),
            ('length_min', length_min),
            ('length_max', length_max),
        ]

    import pandas
    return pandas.DataFrame.from_dict(OrderedDict(items))
예제 #15
0
 def __init__(self, data, chunks=None, name=None, lock=False):
     super(GenotypeAlleleCountsDask, self).__init__(data,
                                                    chunks=chunks,
                                                    name=name,
                                                    lock=lock)
     check_integer_dtype(self.values)
예제 #16
0
def nsl(h, use_threads=True):
    """Compute the unstandardized number of segregating sites by length (nSl)
    for each variant, comparing the reference and alternate alleles,
    after Ferrer-Admetlla et al. (2014).

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)

    Notes
    -----
    This function will calculate nSl for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes nSl by comparing the reference and alternate
    alleles. These can be polarised by switching the sign for any variant where
    the reference allele is derived.

    This function does nothing about nSl calculations where haplotype
    homozygosity extends up to the first or last variant. There may be edge
    effects.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # check inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)

    # # check there are no invariant sites
    # ac = h.count_alleles()
    # assert np.all(ac.is_segregating()), 'please remove non-segregating sites'

    if use_threads and multiprocessing.cpu_count() > 1:

        # create pool
        pool = ThreadPool(2)

        # scan forward
        result_fwd = pool.apply_async(nsl01_scan, args=(h, ))

        # scan backward
        result_rev = pool.apply_async(nsl01_scan, args=(h[::-1], ))

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        nsl0_fwd, nsl1_fwd = result_fwd.get()
        nsl0_rev, nsl1_rev = result_rev.get()

    else:

        # scan forward
        nsl0_fwd, nsl1_fwd = nsl01_scan(h)

        # scan backward
        nsl0_rev, nsl1_rev = nsl01_scan(h[::-1])

    # handle backwards
    nsl0_rev = nsl0_rev[::-1]
    nsl1_rev = nsl1_rev[::-1]

    # compute unstandardized score
    nsl0 = nsl0_fwd + nsl0_rev
    nsl1 = nsl1_fwd + nsl1_rev
    score = np.log(nsl1 / nsl0)

    return score
예제 #17
0
def xpnsl(h1, h2, use_threads=True):
    """Cross-population version of the NSL statistic.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPNSL scores.

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    check_dim0_aligned(h1, h2)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(nsl_scan, args=(h1, ))
        res2_fwd = pool.apply_async(nsl_scan, args=(h2, ))

        # scan backward
        res1_rev = pool.apply_async(nsl_scan, args=(h1[::-1], ))
        res2_rev = pool.apply_async(nsl_scan, args=(h2[::-1], ))

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        nsl1_fwd = res1_fwd.get()
        nsl2_fwd = res2_fwd.get()
        nsl1_rev = res1_rev.get()
        nsl2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        nsl1_fwd = nsl_scan(h1)
        nsl2_fwd = nsl_scan(h2)

        # scan backward
        nsl1_rev = nsl_scan(h1[::-1])
        nsl2_rev = nsl_scan(h2[::-1])

    # handle reverse scans
    nsl1_rev = nsl1_rev[::-1]
    nsl2_rev = nsl2_rev[::-1]

    # compute unstandardized score
    nsl1 = nsl1_fwd + nsl1_rev
    nsl2 = nsl2_fwd + nsl2_rev
    score = np.log(nsl1 / nsl2)

    return score
예제 #18
0
def tabulate_state_transitions(x, states, pos=None):
    """Construct a dataframe where each row provides information about a state transition.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Notes
    -----
    The resulting dataframe includes one row at the start representing the first state
    observation and one row at the end representing the last state observation.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2})
    >>> df
       lstate  rstate  lidx  ridx
    0      -1       1    -1     0
    1       1       2     4     5
    2       2       1     8     9
    3       1      -1    10    -1
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos)
    >>> df
       lstate  rstate  lidx  ridx  lpos  rpos
    0      -1       1    -1     0    -1     2
    1       1       2     4     5    10    14
    2       2       1     8     9    28    30
    3       1      -1    10    -1    31    -1

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # find state transitions
    switch_points, transitions, _ = state_transitions(x, states)

    # start to build a dataframe
    items = [('lstate', transitions[:, 0]),
             ('rstate', transitions[:, 1]),
             ('lidx', switch_points[:, 0]),
             ('ridx', switch_points[:, 1])]

    # deal with optional positions
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # find switch positions
        switch_positions = np.take(pos, switch_points)
        # deal with boundary transitions
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        # add columns into dataframe
        items += [('lpos', switch_positions[:, 0]),
                  ('rpos', switch_positions[:, 1])]

    import pandas
    return pandas.DataFrame.from_dict(OrderedDict(items))