def test_ihh_scan_d():
    # Edge case: the single haplotype pair already differs at the first
    # variant, so the scan starts from zero haplotype homozygosity.
    gaps = np.array([10], dtype='f8')
    h = np.array([[0, 1],
                  [1, 0]])

    # The expected result is the same with and without edge reporting.
    for edges in (False, True):
        expect = [0, 0]
        actual = ihh_scan(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
def test_ihh_scan_c():
    # Simple case: one haplotype pair whose homozygosity decays, because
    # the pair differs at the first variant and matches afterwards.
    gaps = np.array([10, 10], dtype='f8')
    h = np.array([[0, 1],
                  [0, 0],
                  [0, 0]])

    # First without edges, then with edges; the expectation is identical.
    for edges in (False, True):
        expect = [0, 5, 15]
        actual = ihh_scan(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
def test_ihh_scan_a():
    # Simple case: one haplotype pair that is homozygous over all variants.
    gaps = np.array([10, 10], dtype='f8')
    h = np.array([[0, 0],
                  [0, 0],
                  [0, 0]])

    # (include_edges, expected result) pairs, checked in order.
    cases = [
        # do not include edges
        (False, [np.nan, np.nan, np.nan]),
        # include edges
        (True, [0, 10, 20]),
    ]
    for edges, expect in cases:
        actual = ihh_scan(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
def test_ihh_scan_b():
    # One haplotype pair, homozygous over all variants, with a large gap
    # (encoded as -1) between the last two variants.
    gaps = np.array([10, -1], dtype='f8')
    h = np.array([[0, 0],
                  [0, 0],
                  [0, 0]])

    # (include_edges, expected result) pairs, checked in order.
    cases = [
        # do not include edges
        (False, [np.nan, np.nan, np.nan]),
        # include edges
        (True, [0, 10, np.nan]),
    ]
    for edges, expect in cases:
        actual = ihh_scan(h, gaps, min_ehh=0, include_edges=edges)
        assert_array_nanclose(expect, actual)
def test_ihh_scan_e():
    # Edge case: start from haplotype homozygosity below min_ehh.
    gaps = np.array([10], dtype='f8')
    h = np.array([[0, 0, 1],
                  [0, 1, 0]])

    # (min_ehh, include_edges, expected result) triples, checked in order.
    cases = [
        (0, False, [np.nan, 10 / 6]),
        (0, True, [0, 10 / 6]),
        (0.5, False, [0, 0]),
        (0.5, True, [0, 0]),
    ]
    for threshold, edges, expect in cases:
        actual = ihh_scan(h, gaps, min_ehh=threshold, include_edges=edges)
        assert_array_almost_equal(expect, actual)
def xpehh(h1, h2, pos, map_pos=None, min_ehh=0.05, include_edges=False,
          gap_scale=20000, max_gap=200000, is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,), optional
        Variant positions (genetic map distance).
    min_ehh : float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----
    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs shared by every scan
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    n_cpu = multiprocessing.cpu_count()
    if use_threads and n_cpu > 1:
        # use multiple threads: one task per (population, direction) scan

        # setup threadpool
        pool = ThreadPool(min(4, n_cpu))
        try:
            # scan forward
            res1_fwd = pool.apply_async(ihh_scan, (h1, gaps), kwargs)
            res2_fwd = pool.apply_async(ihh_scan, (h2, gaps), kwargs)

            # scan backward
            res1_rev = pool.apply_async(ihh_scan, (h1[::-1], gaps[::-1]),
                                        kwargs)
            res2_rev = pool.apply_async(ihh_scan, (h2[::-1], gaps[::-1]),
                                        kwargs)

            # wait for all four scans to finish
            pool.close()
            pool.join()

            # obtain results; get() re-raises any exception from a scan
            ihh1_fwd = res1_fwd.get()
            ihh2_fwd = res2_fwd.get()
            ihh1_rev = res1_rev.get()
            ihh2_rev = res2_rev.get()
        finally:
            # cleanup, including when submitting or collecting a scan raised
            pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan(h1[::-1], gaps[::-1], **kwargs)
        ihh2_rev = ihh_scan(h2[::-1], gaps[::-1], **kwargs)

    # reverse scans were run on flipped input; flip results back so they
    # line up with the forward scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score as the log ratio of total IHH
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score