Example #1
0
def xpehh(h1, h2, pos, min_ehh=0.05):
    """Unstandardized cross-population extended haplotype homozygosity
    (XPEHH) score, computed at every variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotypes of the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotypes of the second population.
    pos : array_like, int, shape (n_variants,)
        Positions of the variants, on either a physical or a genetic map.
    min_ehh: float, optional
        EHH value beyond which the integrated haplotype homozygosity
        calculation is truncated.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores: the natural log of the ratio between
        the integrated haplotype homozygosity (forward scan plus backward
        scan) of the first and of the second population.

    Notes
    -----

    XPEHH is calculated for all variants. If variants below some minor
    allele frequency should be excluded, filter the haplotype arrays before
    calling this function.

    NaN is returned for any EHH calculation in which haplotype homozygosity
    has not decayed below `min_ehh` by the time the first or last variant is
    reached. Set `min_ehh` to None to disable this behaviour.

    Nothing is currently done to account for large gaps between variants,
    so expect edge effects near any large gap.

    The score returned is unstandardized. Usually these scores are then
    normalised in different allele frequency bins.

    The two haplotype arrays need not contain the same number of
    haplotypes.

    """

    from allel.opt.stats import ihh_scan_int8

    populations = (h1, h2)

    # integrate EHH walking from the first variant towards the last
    fwd_a, fwd_b = [
        ihh_scan_int8(h, pos, min_ehh=min_ehh)
        for h in populations
    ]

    # same scan over the reversed inputs; flip each result back so that it
    # lines up with the original variant order
    rev_a, rev_b = [
        ihh_scan_int8(h[::-1], pos[::-1], min_ehh=min_ehh)[::-1]
        for h in populations
    ]

    # log ratio of the two-sided integrated haplotype homozygosities
    return np.log((fwd_a + rev_a) / (fwd_b + rev_b))
Example #2
0
def test_ihh_scan_int8_d():
    """Edge case: the scan starts from 0 haplotype homozygosity."""
    gaps = np.array([10], dtype='f8')
    h = np.array([[0, 1], [1, 0]], dtype='i1')

    # the same all-zero result is expected with and without edges
    for edges in (False, True):
        expect = [0, 0]
        actual = ihh_scan_int8(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
Example #3
0
def test_ihh_scan_int8_d():
    """Check the scan when haplotype homozygosity is 0 from the start."""
    # two variants, one haplotype pair that differs at both variants
    h = np.array([[0, 1],
                  [1, 0]], dtype='i1')
    gaps = np.array([10], dtype='f8')
    common = dict(min_ehh=0)

    # without edges
    zeros = [0, 0]
    assert_array_nanclose(
        zeros, ihh_scan_int8(h, gaps, include_edges=False, **common)
    )

    # with edges: same expectation
    zeros = [0, 0]
    assert_array_nanclose(
        zeros, ihh_scan_int8(h, gaps, include_edges=True, **common)
    )
Example #4
0
def test_ihh_scan_int8_c():
    """Simple case: one haplotype pair whose homozygosity decays."""
    gaps = np.array([10, 10], dtype='f8')
    h = np.array([[0, 1], [0, 0], [0, 0]], dtype='i1')

    # the expected values do not depend on whether edges are included
    for edges in (False, True):
        expect = [0, 5, 15]
        actual = ihh_scan_int8(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
Example #5
0
def test_ihh_scan_int8_b():
    """One haplotype pair, homozygous over all variants, with a large gap.

    The large gap is encoded as -1 in the gaps array.
    """
    gaps = np.array([10, -1], dtype='f8')
    h = np.array([[0, 0], [0, 0], [0, 0]], dtype='i1')

    # (include_edges, expected scan result)
    cases = [
        (False, [np.nan, np.nan, np.nan]),
        (True, [0, 10, np.nan]),
    ]
    for edges, expect in cases:
        actual = ihh_scan_int8(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
Example #6
0
def test_ihh_scan_int8_c():
    """Check a single haplotype pair with decaying haplotype homozygosity."""
    # three variants; the pair differs only at the first one
    h = np.array([[0, 1],
                  [0, 0],
                  [0, 0]], dtype='i1')
    gaps = np.array([10, 10], dtype='f8')
    common = dict(min_ehh=0)

    # edges excluded
    without_edges = ihh_scan_int8(h, gaps, include_edges=False, **common)
    assert_array_nanclose([0, 5, 15], without_edges)

    # edges included
    with_edges = ihh_scan_int8(h, gaps, include_edges=True, **common)
    assert_array_nanclose([0, 5, 15], with_edges)
Example #7
0
def test_ihh_scan_int8_b():
    """Check handling of a large gap (encoded as -1) for one haplotype pair
    that is homozygous over all variants."""
    h = np.array([[0, 0],
                  [0, 0],
                  [0, 0]], dtype='i1')
    gaps = np.array([10, -1], dtype='f8')

    # expected scan result keyed by the include_edges flag
    expected_by_flag = {
        False: [np.nan, np.nan, np.nan],
        True: [0, 10, np.nan],
    }

    # run the edge-excluding case first, then the edge-including case
    for flag in (False, True):
        observed = ihh_scan_int8(h, gaps, min_ehh=0, include_edges=flag)
        assert_array_nanclose(expected_by_flag[flag], observed)
Example #8
0
def xpehh(
    h1,
    h2,
    pos,
    map_pos=None,
    min_ehh=0.05,
    include_edges=False,
    gap_scale=20000,
    max_gap=200000,
    is_accessible=None,
    use_threads=True,
):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,), optional
        Variant positions (genetic map distance).
    min_ehh: float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True and more than one CPU is available, run the four scans
        (forward and backward for each population) in a pool of at most
        4 threads; otherwise run them sequentially.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores, computed as the natural log of the
        ratio of the two populations' integrated haplotype homozygosity
        (forward scan plus backward scan).

    Notes
    -----

    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    from allel.opt.stats import ihh_scan_int8

    # check inputs
    h1 = HaplotypeArray(np.asarray(h1, dtype="i1"))
    h2 = HaplotypeArray(np.asarray(h2, dtype="i1"))
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs shared by every scan
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        try:
            # scan forward
            res1_fwd = pool.apply_async(ihh_scan_int8, (h1, gaps), kwargs)
            res2_fwd = pool.apply_async(ihh_scan_int8, (h2, gaps), kwargs)

            # scan backward
            res1_rev = pool.apply_async(
                ihh_scan_int8, (h1[::-1], gaps[::-1]), kwargs
            )
            res2_rev = pool.apply_async(
                ihh_scan_int8, (h2[::-1], gaps[::-1]), kwargs
            )

            # wait for all four scans to finish
            pool.close()
            pool.join()

            # obtain results; .get() re-raises any exception raised by the
            # scan inside the worker thread
            ihh1_fwd = res1_fwd.get()
            ihh2_fwd = res2_fwd.get()
            ihh1_rev = res1_rev.get()
            ihh2_rev = res2_rev.get()

        finally:
            # cleanup, also when submitting a task or a scan itself failed,
            # so that the pool's threads are never left behind
            pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan_int8(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan_int8(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan_int8(h1[::-1], gaps[::-1], **kwargs)
        ihh2_rev = ihh_scan_int8(h2[::-1], gaps[::-1], **kwargs)

    # handle reverse scans: flip them back into the original variant order
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score