def test_ihh01_scan_int8_d():
    """Scan a 4-variant, 6-haplotype array with balanced alleles.

    Every variant carries three copies of allele 0 and three of allele 1,
    and under each setting exercised here the same values are expected for
    both alleles.
    """
    nan = np.nan
    spacing = np.array([10, 10, 10], dtype='f8')
    haplotypes = np.array(
        [
            [0, 0, 1, 1, 1, 0],
            [0, 1, 0, 1, 0, 1],
            [1, 0, 0, 0, 1, 1],
            [0, 0, 0, 1, 1, 1],
        ],
        dtype='i1'
    )

    # value expected at the last two variants under every setting below
    x = (10 * (1 + 1 / 3) / 2) + (10 * (1 / 3 + 0) / 2)

    # (keyword options for the scan, values expected for both alleles)
    scenarios = [
        ({'min_ehh': 0.05}, [nan, nan, x, x]),
        ({'min_ehh': 0}, [nan, nan, x, x]),
        ({'min_ehh': 0, 'include_edges': True}, [0, 10 * 2 / 3, x, x]),
    ]
    for options, expected in scenarios:
        ihh0, ihh1 = ihh01_scan_int8(haplotypes, spacing, **options)
        assert_array_nanclose(list(expected), ihh0)
        assert_array_nanclose(list(expected), ihh1)
def ihs(h, pos, min_ehh=0.05):
    """Return the unstandardized integrated haplotype score (IHS) per variant.

    The score is the natural log of the ratio between the integrated
    haplotype homozygosity (IHH) of the alternate allele and that of the
    reference allele, where each IHH is the sum of a forward and a backward
    scan over the variants.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    min_ehh : float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----
    IHS is calculated for all variants. To exclude variants below a given
    minor allele frequency, filter the input haplotype array before calling
    this function.

    The comparison is between the reference and alternate alleles. Scores
    can be polarised by switching the sign for any variant where the
    reference allele is derived.

    NaN is returned for any variant where haplotype homozygosity does not
    decay below `min_ehh` before reaching the first or last variant. To
    disable this behaviour, set `min_ehh` to None.

    Nothing is currently done to account for large gaps between variants,
    so there will be edge effects near any large gaps.

    The score returned is unstandardized. Usually these scores are then
    normalised in different allele frequency bins.

    """
    from allel.opt.stats import ihh01_scan_int8

    # one pass in the given variant order, one pass over the reversed inputs
    fwd0, fwd1 = ihh01_scan_int8(h, pos, min_ehh=min_ehh)
    rev0, rev1 = ihh01_scan_int8(h[::-1], pos[::-1], min_ehh=min_ehh)

    # flip the backward results into the original variant order, then
    # combine both directions per allele
    total0 = fwd0 + rev0[::-1]
    total1 = fwd1 + rev1[::-1]

    return np.log(total1 / total0)
def test_ihh01_scan_int8_a():
    """Scan a 4-variant, 3-haplotype array, first with edges excluded and
    then with edges included.
    """
    nan = np.nan
    haplotypes = np.array([[0, 0, 1],
                           [0, 1, 1],
                           [1, 1, 0],
                           [1, 0, 0]], dtype='i1')
    spacing = np.array([10, 10, 10], dtype='f8')

    # edges excluded, EHH truncated at 0.05
    got0, got1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0.05,
                                 include_edges=False)
    assert_array_nanclose([nan, nan, nan, 5], got0)
    assert_array_nanclose([nan, 5, 5, nan], got1)

    # edges included, no EHH truncation: only the first ihh0 value changes
    got0, got1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0,
                                 include_edges=True)
    assert_array_nanclose([0, nan, nan, 5], got0)
    assert_array_nanclose([nan, 5, 5, nan], got1)
def test_ihh01_scan_int8_c():
    """Scan four identical variants (three haplotypes per allele).

    With the default edge handling every value is expected to be NaN; with
    edges included both alleles are expected to give 0, 10, 20, 30.
    """
    spacing = np.array([10, 10, 10], dtype='f8')
    # the same row repeated for each of the four variants
    haplotypes = np.array([[0, 0, 0, 1, 1, 1]] * 4, dtype='i1')

    ihh0, ihh1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0.05)
    assert_array_nanclose([np.nan] * 4, ihh0)
    assert_array_nanclose([np.nan] * 4, ihh1)

    ihh0, ihh1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0,
                                 include_edges=True)
    assert_array_nanclose([0, 10, 20, 30], ihh0)
    assert_array_nanclose([0, 10, 20, 30], ihh1)
def test_ihh01_scan_int8_e():
    """Check the effect of `min_maf` on a 3-variant, 3-haplotype array.

    Each variant has a single haplotype carrying allele 1. With
    ``min_maf=0`` values are expected for allele 0 only; with
    ``min_maf=0.4`` every value is expected to be NaN.
    """
    spacing = np.array([10, 10], dtype='f8')
    haplotypes = np.array([[0, 0, 1]] * 3, dtype='i1')

    # (min_maf, expected ihh0); ihh1 is expected to be all-NaN in both runs
    scenarios = (
        (0, [0, 10, 20]),
        (0.4, [np.nan, np.nan, np.nan]),
    )
    for maf_threshold, want_ihh0 in scenarios:
        want_ihh1 = [np.nan, np.nan, np.nan]
        ihh0, ihh1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0,
                                     min_maf=maf_threshold,
                                     include_edges=True)
        assert_array_nanclose(want_ihh0, ihh0)
        assert_array_nanclose(want_ihh1, ihh1)
def test_ihh01_scan_int8_b():
    """Scan a 4-variant, 4-haplotype array where each variant has exactly
    one haplotype carrying allele 1.

    ihh1 is expected to be NaN everywhere under every setting; only ihh0
    is expected to carry values.
    """
    nan = np.nan
    spacing = np.array([10, 10, 10], dtype='f8')
    haplotypes = np.array([[0, 0, 0, 1],
                           [0, 0, 1, 0],
                           [0, 1, 0, 0],
                           [1, 0, 0, 0]], dtype='i1')

    # value expected for ihh0 at the last two variants
    x = (10 * (1 + 1 / 3) / 2) + (10 * (1 / 3 + 0) / 2)

    def check(want_ihh0, **options):
        # run one scan and compare both outputs against expectation
        got_ihh0, got_ihh1 = ihh01_scan_int8(haplotypes, spacing, **options)
        assert_array_nanclose(want_ihh0, got_ihh0)
        assert_array_nanclose([nan, nan, nan, nan], got_ihh1)

    check([nan, nan, x, x], min_ehh=0.05, include_edges=False)
    check([nan, nan, x, x], min_ehh=0, include_edges=False)
    check([0, 10 * (1 + 1 / 3) / 2, x, x], min_ehh=0, include_edges=True)
# NOTE(review): another definition named test_ihh01_scan_int8_e appears
# earlier in this file; Python keeps only the later binding, so one of the
# two should probably be renamed or removed.
def test_ihh01_scan_int8_e():
    """Exercise the `min_maf` threshold with three identical variants, each
    having one of three haplotypes carrying allele 1.
    """
    nan = np.nan
    spacing = np.array([10, 10], dtype='f8')
    haplotypes = np.array([[0, 0, 1],
                           [0, 0, 1],
                           [0, 0, 1]], dtype='i1')

    # min_maf=0: values expected for allele 0, NaN for allele 1
    got0, got1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0, min_maf=0,
                                 include_edges=True)
    assert_array_nanclose([0, 10, 20], got0)
    assert_array_nanclose([nan, nan, nan], got1)

    # min_maf=0.4: NaN expected for both alleles
    got0, got1 = ihh01_scan_int8(haplotypes, spacing, min_ehh=0, min_maf=0.4,
                                 include_edges=True)
    assert_array_nanclose([nan, nan, nan], got0)
    assert_array_nanclose([nan, nan, nan], got1)
def test_ihh01_scan_int8(self):
    """Exercise ihh01_scan_int8 with variant positions on four haplotype
    arrays, each scanned once with ``min_ehh=0.05`` and once with
    ``min_ehh=None``.
    """
    from allel.opt.stats import ihh01_scan_int8

    pos = [10, 20, 30, 40]
    nan = np.nan
    y = 10*2/3

    def check(haplotypes, min_ehh, want_ihh0, want_ihh1):
        # run one scan and compare both outputs against expectation
        got_ihh0, got_ihh1 = ihh01_scan_int8(haplotypes, pos, min_ehh=min_ehh)
        assert_array_nanclose(want_ihh0, got_ihh0)
        assert_array_nanclose(want_ihh1, got_ihh1)

    # case 1
    h = np.array([[0, 0, 1],
                  [0, 1, 1],
                  [1, 1, 0],
                  [1, 0, 0]], dtype='i1')
    check(h, 0.05, [nan, nan, nan, 0], [nan, nan, 0, nan])
    check(h, None, [0, nan, nan, 0], [nan, 0, 0, nan])

    # case 2
    h = np.array([[0, 0, 0, 1],
                  [0, 0, 1, 0],
                  [0, 1, 0, 0],
                  [1, 0, 0, 0]], dtype='i1')
    check(h, 0.05, [nan, nan, nan, y], [nan, nan, nan, nan])
    check(h, None, [0, y, y, y], [nan, nan, nan, nan])

    # case 3
    h = np.array([[0, 0, 0, 1, 1, 1]] * 4, dtype='i1')
    check(h, 0.05, [nan, nan, nan, nan], [nan, nan, nan, nan])
    check(h, None, [0, 10, 20, 30], [0, 10, 20, 30])

    # case 4
    h = np.array([[0, 0, 1, 1, 1, 0],
                  [0, 1, 0, 1, 0, 1],
                  [1, 0, 0, 0, 1, 1],
                  [0, 0, 0, 1, 1, 1]], dtype='i1')
    check(h, 0.05, [nan, nan, nan, y], [nan, nan, nan, y])
    check(h, None, [0, y, y, y], [0, y, y, y])
def ihs(
    h,
    pos,
    map_pos=None,
    min_ehh=0.05,
    min_maf=0.05,
    include_edges=False,
    gap_scale=20000,
    max_gap=200000,
    is_accessible=None,
    use_threads=True,
):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,), optional
        Variant positions (genetic map distance).
    min_ehh : float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozygosity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----
    Integrated haplotype homozygosity is not computed for variants whose
    minor allele frequency is below `min_maf`, so no usable score is
    produced for them. Pass ``min_maf=0`` to attempt a score for every
    variant in the input.

    This function computes IHS comparing the reference and alternate
    alleles. These can be polarised by switching the sign for any variant
    where the reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first
    or last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores
    are then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """
    from allel.opt.stats import ihh01_scan_int8

    # check inputs
    h = HaplotypeArray(np.asarray(h, dtype="i1"))
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # keyword arguments shared by the forward and backward scans
    kwargs = dict(min_ehh=min_ehh, min_maf=min_maf, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # run the two scans concurrently, one thread per direction
        pool = ThreadPool(2)
        try:
            # scan forward
            result_fwd = pool.apply_async(ihh01_scan_int8, (h, gaps), kwargs)

            # scan backward
            result_rev = pool.apply_async(
                ihh01_scan_int8, (h[::-1], gaps[::-1]), kwargs
            )

            # wait for both to finish
            pool.close()
            pool.join()

            # obtain results; get() re-raises any exception from a worker
            ihh0_fwd, ihh1_fwd = result_fwd.get()
            ihh0_rev, ihh1_rev = result_rev.get()
        finally:
            # always release the pool, including when a scan failed
            pool.terminate()

    else:
        # run without threads

        # scan forward
        ihh0_fwd, ihh1_fwd = ihh01_scan_int8(h, gaps, **kwargs)

        # scan backward
        ihh0_rev, ihh1_rev = ihh01_scan_int8(h[::-1], gaps[::-1], **kwargs)

    # put the backward scan results back into the original variant order
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # compute unstandardized score
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    score = np.log(ihh1 / ihh0)

    return score