def locate_fixed_differences(ac1, ac2):
    """Locate variants with no shared alleles between two populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants,)

    See Also
    --------
    allel.stats.diversity.windowed_df

    Examples
    --------
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
    ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
    ...                          [[0, 1], [0, 1], [1, 1], [1, 1]],
    ...                          [[0, 0], [0, 0], [1, 1], [2, 2]],
    ...                          [[0, 0], [-1, -1], [1, 1], [-1, -1]]])
    >>> ac1 = g.count_alleles(subpop=[0, 1])
    >>> ac2 = g.count_alleles(subpop=[2, 3])
    >>> loc_df = allel.locate_fixed_differences(ac1, ac2)
    >>> loc_df
    array([ True, False, False,  True,  True])

    """

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # stack allele counts for convenience
    pac = np.dstack([ac1, ac2])

    # count numbers of alleles called in each population
    pan = np.sum(pac, axis=1)

    # count the numbers of populations with each allele
    npa = np.sum(pac > 0, axis=2)

    # locate variants with allele calls in both populations
    non_missing = np.all(pan > 0, axis=1)

    # locate variants where all alleles are only found in a single population
    no_shared_alleles = np.all(npa <= 1, axis=1)

    return non_missing & no_shared_alleles
def patterson_f3(acc, aca, acb):
    """Unbiased estimator for F3(C; A, B), the three-population test for
    admixture in population C.

    Parameters
    ----------
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for the test population (C).
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for the first source population (A).
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for the second source population (B).

    Returns
    -------
    T : ndarray, float, shape (n_variants,)
        Un-normalized f3 estimates per variant.
    B : ndarray, float, shape (n_variants,)
        Estimates for heterozygosity in population C.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f3 statistics, ignore the `B` return value.

    To compute the f3* statistic, which is normalized by heterozygosity in
    population C to remove numerical dependence on the allele frequency
    spectrum, compute ``np.sum(T) / np.sum(B)``.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc)

    # compute allele number and heterozygosity in test population
    sc = acc.sum(axis=1)
    hc = h_hat(acc)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]

    # compute estimator
    T = ((c - a) * (c - b)) - (hc / sc)
    B = 2 * hc

    return T, B
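# A minimal usage sketch for ``patterson_f3``, added for illustration and not
# part of the original module: it shows how the per-variant T and B values are
# combined into the normalized f3* statistic described in the Notes above. The
# helper name and the allele counts are invented for demonstration only.
def _example_patterson_f3_star():
    import numpy as np
    # invented biallelic allele counts for test population C and sources A, B
    acc = np.array([[2, 2], [3, 1], [1, 3], [4, 0]])
    aca = np.array([[4, 0], [2, 2], [0, 4], [4, 0]])
    acb = np.array([[0, 4], [3, 1], [1, 3], [2, 2]])
    T, B = patterson_f3(acc, aca, acb)
    # normalize by heterozygosity in C; a significantly negative value is
    # evidence for admixture in population C
    return np.nansum(T) / np.nansum(B)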
def locate_private_alleles(*acs):
    """Locate alleles that are found only in a single population.

    Parameters
    ----------
    *acs : array_like, int, shape (n_variants, n_alleles)
        Allele counts arrays from each population.

    Returns
    -------
    loc : ndarray, bool, shape (n_variants, n_alleles)
        Boolean array where elements are True if allele is private to a
        single population.

    Examples
    --------
    >>> import numpy as np
    >>> import allel
    >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
    ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
    ...                          [[0, 1], [0, 1], [1, 1], [1, 1]],
    ...                          [[0, 0], [0, 0], [1, 1], [2, 2]],
    ...                          [[0, 0], [-1, -1], [1, 1], [-1, -1]]])
    >>> ac1 = g.count_alleles(subpop=[0, 1])
    >>> ac2 = g.count_alleles(subpop=[2])
    >>> ac3 = g.count_alleles(subpop=[3])
    >>> loc_private_alleles = allel.locate_private_alleles(ac1, ac2, ac3)
    >>> loc_private_alleles
    array([[ True, False, False],
           [False, False, False],
           [ True, False, False],
           [ True,  True,  True],
           [ True,  True, False]])
    >>> loc_private_variants = np.any(loc_private_alleles, axis=1)
    >>> loc_private_variants
    array([ True, False,  True,  True,  True])

    """

    # check inputs
    acs = [asarray_ndim(ac, 2) for ac in acs]
    check_dim0_aligned(*acs)
    acs = ensure_dim1_aligned(*acs)

    # stack allele counts for convenience
    pac = np.dstack(acs)

    # count the numbers of populations with each allele
    npa = np.sum(pac > 0, axis=2)

    # locate alleles found only in a single population
    loc_pa = npa == 1

    return loc_pa
def patterson_d(aca, acb, acc, acd):
    """Unbiased estimator for D(A, B; C, D), the normalised four-population
    test for admixture between (A or B) and (C or D), also known as the
    "ABBA BABA" test.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.
    acc : array_like, int, shape (n_variants, 2)
        Allele counts for population C.
    acd : array_like, int, shape (n_variants, 2)
        Allele counts for population D.

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Numerator (un-normalised f4 estimates).
    den : ndarray, float, shape (n_variants,)
        Denominator.

    Notes
    -----
    See Patterson (2012), main text and Appendix A.

    For un-normalized f4 statistics, ignore the `den` return value.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    acc = AlleleCountsArray(acc, copy=False)
    assert acc.shape[1] == 2, 'only biallelic variants supported'
    acd = AlleleCountsArray(acd, copy=False)
    assert acd.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb, acc, acd)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]
    c = acc.to_frequencies()[:, 1]
    d = acd.to_frequencies()[:, 1]

    # compute estimator
    num = (a - b) * (c - d)
    den = (a + b - (2 * a * b)) * (c + d - (2 * c * d))

    return num, den
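# Illustrative sketch (not part of the original module): combining the
# per-variant numerator and denominator from ``patterson_d`` into a single
# genome-wide D statistic. The allele counts are invented; in practice a
# standard error would typically be estimated via a block jackknife.
def _example_patterson_d():
    import numpy as np
    aca = np.array([[4, 0], [3, 1], [0, 4], [2, 2]])
    acb = np.array([[4, 0], [1, 3], [2, 2], [0, 4]])
    acc = np.array([[2, 2], [0, 4], [4, 0], [1, 3]])
    acd = np.array([[4, 0], [4, 0], [4, 0], [4, 0]])
    num, den = patterson_d(aca, acb, acc, acd)
    # D close to zero is consistent with no admixture between
    # (A or B) and (C or D)
    return np.nansum(num) / np.nansum(den)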
def patterson_f2(aca, acb):
    """Unbiased estimator for F2(A, B), the branch length between populations
    A and B.

    Parameters
    ----------
    aca : array_like, int, shape (n_variants, 2)
        Allele counts for population A.
    acb : array_like, int, shape (n_variants, 2)
        Allele counts for population B.

    Returns
    -------
    f2 : ndarray, float, shape (n_variants,)

    Notes
    -----
    See Patterson (2012), Appendix A.

    """

    # check inputs
    aca = AlleleCountsArray(aca, copy=False)
    assert aca.shape[1] == 2, 'only biallelic variants supported'
    acb = AlleleCountsArray(acb, copy=False)
    assert acb.shape[1] == 2, 'only biallelic variants supported'
    check_dim0_aligned(aca, acb)

    # compute allele numbers
    sa = aca.sum(axis=1)
    sb = acb.sum(axis=1)

    # compute heterozygosities
    ha = h_hat(aca)
    hb = h_hat(acb)

    # compute sample frequencies for the alternate allele
    a = aca.to_frequencies()[:, 1]
    b = acb.to_frequencies()[:, 1]

    # compute estimator
    x = ((a - b) ** 2) - (ha / sa) - (hb / sb)

    return x
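# Hedged usage sketch for ``patterson_f2`` (helper name and data invented for
# illustration): per-variant f2 values are typically averaged over many
# variants to estimate the branch length separating two populations.
def _example_patterson_f2():
    import numpy as np
    aca = np.array([[4, 0], [2, 2], [1, 3], [0, 4]])
    acb = np.array([[4, 0], [3, 1], [2, 2], [2, 2]])
    f2 = patterson_f2(aca, acb)
    return np.nanmean(f2)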
def compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible):
    """Compute spacing between variants for integrating haplotype
    homozygosity.

    Parameters
    ----------
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.

    Returns
    -------
    gaps : ndarray, float, shape (n_variants - 1,)

    """

    # check inputs
    if map_pos is None:
        # integrate over physical distance
        map_pos = pos
    else:
        map_pos = asarray_ndim(map_pos, 1)
        check_dim0_aligned(pos, map_pos)

    # compute physical gaps
    physical_gaps = np.diff(pos)

    # compute genetic gaps
    gaps = np.diff(map_pos).astype('f8')

    if is_accessible is not None:

        # compute accessible gaps
        is_accessible = asarray_ndim(is_accessible, 1)
        assert is_accessible.shape[0] > pos[-1], \
            'accessibility array too short'
        accessible_gaps = np.zeros_like(physical_gaps)
        for i in range(1, len(pos)):
            # N.B., expect pos is 1-based
            n_access = np.count_nonzero(
                is_accessible[pos[i - 1] - 1:pos[i] - 1])
            accessible_gaps[i - 1] = n_access

        # adjust using accessibility
        scaling = accessible_gaps / physical_gaps
        gaps = gaps * scaling

    elif gap_scale is not None and gap_scale > 0:

        scaling = np.ones(gaps.shape, dtype='f8')
        loc_scale = physical_gaps > gap_scale
        scaling[loc_scale] = gap_scale / physical_gaps[loc_scale]
        gaps = gaps * scaling

    if max_gap is not None and max_gap > 0:

        # deal with very large gaps
        gaps[physical_gaps > max_gap] = -1

    return gaps
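# Small worked example for ``compute_ihh_gaps``, added for illustration with
# invented positions: with no genetic map and no accessibility mask, gaps are
# simply physical distances, except that gaps larger than ``gap_scale`` are
# rescaled down to ``gap_scale`` and gaps larger than ``max_gap`` are flagged
# with -1 so that downstream scores can be withheld.
def _example_compute_ihh_gaps():
    import numpy as np
    pos = np.array([100, 200, 30000, 300000])
    gaps = compute_ihh_gaps(pos, map_pos=None, gap_scale=20000,
                            max_gap=200000, is_accessible=None)
    # gap 1: 100 bp, unchanged; gap 2: 29800 bp > gap_scale, rescaled to
    # 20000; gap 3: 270000 bp > max_gap, flagged as -1
    # expect array([100., 20000., -1.])
    return gaps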
def standardize_by_allele_count(score, aac, bins=None, n_bins=None,
                                diagnostics=True):
    """Standardize `score` within allele frequency bins.

    Parameters
    ----------
    score : array_like, float
        The score to be standardized, e.g., IHS or NSL.
    aac : array_like, int
        An array of alternate allele counts.
    bins : array_like, int, optional
        Allele count bins, overrides `n_bins`.
    n_bins : int, optional
        Number of allele count bins to use.
    diagnostics : bool, optional
        If True, plot some diagnostic information about the standardization.

    Returns
    -------
    score_standardized : ndarray, float
        Standardized scores.
    bins : ndarray, int
        Allele count bins used for standardization.

    """

    from scipy.stats import binned_statistic

    # check inputs
    score = asarray_ndim(score, 1)
    aac = asarray_ndim(aac, 1)
    check_dim0_aligned(score, aac)

    # remove nans
    nonan = ~np.isnan(score)
    score_nonan = score[nonan]
    aac_nonan = aac[nonan]

    if bins is None:
        # make our own similar sized bins

        # how many bins to make?
        if n_bins is None:
            # something vaguely reasonable
            n_bins = np.max(aac) // 2

        # make bins
        bins = make_similar_sized_bins(aac_nonan, n_bins)

    else:
        # user-provided bins
        bins = asarray_ndim(bins, 1)

    mean_score, _, _ = binned_statistic(aac_nonan, score_nonan,
                                        statistic=np.mean,
                                        bins=bins)
    std_score, _, _ = binned_statistic(aac_nonan, score_nonan,
                                       statistic=np.std,
                                       bins=bins)

    if diagnostics:
        import matplotlib.pyplot as plt
        x = (bins[:-1] + bins[1:]) / 2
        plt.figure()
        plt.fill_between(x,
                         mean_score - std_score,
                         mean_score + std_score,
                         alpha=.5,
                         label='std')
        plt.plot(x, mean_score, marker='o', label='mean')
        plt.grid(axis='y')
        plt.xlabel('Alternate allele count')
        plt.ylabel('Unstandardized score')
        plt.title('Standardization diagnostics')
        plt.legend()

    # apply standardization
    score_standardized = np.empty_like(score)
    for i in range(len(bins) - 1):
        x1 = bins[i]
        x2 = bins[i + 1]
        if i == 0:
            # first bin
            loc = (aac < x2)
        elif i == len(bins) - 2:
            # last bin
            loc = (aac >= x1)
        else:
            # middle bins
            loc = (aac >= x1) & (aac < x2)
        m = mean_score[i]
        s = std_score[i]
        score_standardized[loc] = (score[loc] - m) / s

    return score_standardized, bins
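# Hedged usage sketch (helper name invented): the typical workflow for
# standardizing unstandardized selection scores within allele count bins.
# Real scores would come from ``ihs`` or ``nsl`` and real counts from an
# allele counts array; here both are taken as parameters.
def _example_standardize_scores(score, aac):
    # pass diagnostics=False to skip the matplotlib plot
    score_std, bins = standardize_by_allele_count(
        score, aac, n_bins=20, diagnostics=False)
    return score_std, bins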
def hudson_fst(ac1, ac2, fill=np.nan):
    """Calculate the numerator and denominator for Fst estimation using the
    method of Hudson (1992) elaborated by Bhatia et al. (2013).

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    num : ndarray, float, shape (n_variants,)
        Divergence between the two populations minus average
        of diversity within each population.
    den : ndarray, float, shape (n_variants,)
        Divergence between the two populations.

    Examples
    --------
    Calculate numerator and denominator for Fst estimation::

        >>> import numpy as np
        >>> import allel
        >>> g = allel.GenotypeArray([[[0, 0], [0, 0], [1, 1], [1, 1]],
        ...                          [[0, 1], [0, 1], [0, 1], [0, 1]],
        ...                          [[0, 0], [0, 0], [0, 0], [0, 0]],
        ...                          [[0, 1], [1, 2], [1, 1], [2, 2]],
        ...                          [[0, 0], [1, 1], [0, 1], [-1, -1]]])
        >>> subpops = [[0, 1], [2, 3]]
        >>> ac1 = g.count_alleles(subpop=subpops[0])
        >>> ac2 = g.count_alleles(subpop=subpops[1])
        >>> num, den = allel.hudson_fst(ac1, ac2)
        >>> num
        array([ 1.        , -0.16666667,  0.        , -0.125     , -0.33333333])
        >>> den
        array([1.   , 0.5  , 0.   , 0.625, 0.5  ])

    Estimate Fst for each variant individually::

        >>> fst = num / den
        >>> fst
        array([ 1.        , -0.33333333,         nan, -0.2       , -0.66666667])

    Estimate Fst averaging over variants::

        >>> fst = np.sum(num) / np.sum(den)
        >>> fst
        0.1428571428571429

    """
    # flake8: noqa

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # calculate these once only
    an1 = np.sum(ac1, axis=1)
    an2 = np.sum(ac2, axis=1)

    # calculate average diversity (a.k.a. heterozygosity) within each
    # population
    within = (mean_pairwise_difference(ac1, an1, fill=fill) +
              mean_pairwise_difference(ac2, an2, fill=fill)) / 2

    # calculate divergence (a.k.a. heterozygosity) between each population
    between = mean_pairwise_difference_between(ac1, ac2, an1, an2, fill=fill)

    # define numerator and denominator for Fst calculations
    num = between - within
    den = between

    return num, den
def tabulate_state_blocks(x, states, pos=None):
    """Construct a dataframe where each row provides information about
    continuous state blocks.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be
        ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2})
    >>> df
       state  support  start_lidx  ...  size_min  size_max  is_marginal
    0      1        4          -1  ...         5        -1         True
    1      2        3           4  ...         4         4        False
    2      1        2           8  ...         2        -1         True
    [3 rows x 9 columns]
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_blocks(x, states={1, 2}, pos=pos)
    >>> df
       state  support  start_lidx  ...  stop_rpos  length_min  length_max
    0      1        4          -1  ...         14           9          -1
    1      2        3           4  ...         30          15          19
    2      1        2           8  ...         -1           2          -1
    [3 rows x 15 columns]

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # find state transitions
    switch_points, transitions, observations = state_transitions(x, states)

    # setup some helpers
    t = transitions[1:, 0]
    o = observations[1:]
    s1 = switch_points[:-1]
    s2 = switch_points[1:]
    is_marginal = (s1[:, 0] < 0) | (s2[:, 1] < 0)
    size_min = s2[:, 0] - s1[:, 1] + 1
    size_max = s2[:, 1] - s1[:, 0] - 1
    size_max[is_marginal] = -1

    # start to build a dataframe
    items = [
        ('state', t),
        ('support', o),
        ('start_lidx', s1[:, 0]),
        ('start_ridx', s1[:, 1]),
        ('stop_lidx', s2[:, 0]),
        ('stop_ridx', s2[:, 1]),
        ('size_min', size_min),
        ('size_max', size_max),
        ('is_marginal', is_marginal)
    ]

    # deal with optional positions
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # obtain switch positions
        switch_positions = np.take(pos, switch_points)
        # deal with boundary transitions
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        # setup helpers
        p1 = switch_positions[:-1]
        p2 = switch_positions[1:]
        length_min = p2[:, 0] - p1[:, 1] + 1
        length_max = p2[:, 1] - p1[:, 0] - 1
        length_max[is_marginal] = -1

        items += [
            ('start_lpos', p1[:, 0]),
            ('start_rpos', p1[:, 1]),
            ('stop_lpos', p2[:, 0]),
            ('stop_rpos', p2[:, 1]),
            ('length_min', length_min),
            ('length_max', length_max),
        ]

    import pandas
    return pandas.DataFrame.from_dict(OrderedDict(items))
def tabulate_state_transitions(x, states, pos=None):
    """Construct a dataframe where each row provides information about a
    state transition.

    Parameters
    ----------
    x : array_like, int
        1-dimensional array of state values.
    states : set
        Set of states of interest. Any state value not in this set will be
        ignored.
    pos : array_like, int, optional
        Array of positions corresponding to values in `x`.

    Returns
    -------
    df : DataFrame

    Notes
    -----
    The resulting dataframe includes one row at the start representing the
    first state observation and one row at the end representing the last
    state observation.

    Examples
    --------
    >>> import allel
    >>> x = [1, 1, 0, 1, 1, 2, 2, 0, 2, 1, 1]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2})
    >>> df
       lstate  rstate  lidx  ridx
    0      -1       1    -1     0
    1       1       2     4     5
    2       2       1     8     9
    3       1      -1    10    -1
    >>> pos = [2, 4, 7, 8, 10, 14, 19, 23, 28, 30, 31]
    >>> df = allel.tabulate_state_transitions(x, states={1, 2}, pos=pos)
    >>> df
       lstate  rstate  lidx  ridx  lpos  rpos
    0      -1       1    -1     0    -1     2
    1       1       2     4     5    10    14
    2       2       1     8     9    28    30
    3       1      -1    10    -1    31    -1

    """

    # check inputs
    x = asarray_ndim(x, 1)
    check_integer_dtype(x)
    x = memoryview_safe(x)

    # find state transitions
    switch_points, transitions, _ = state_transitions(x, states)

    # start to build a dataframe
    items = [('lstate', transitions[:, 0]),
             ('rstate', transitions[:, 1]),
             ('lidx', switch_points[:, 0]),
             ('ridx', switch_points[:, 1])]

    # deal with optional positions
    if pos is not None:
        pos = asarray_ndim(pos, 1)
        check_dim0_aligned(x, pos)
        check_integer_dtype(pos)

        # find switch positions
        switch_positions = np.take(pos, switch_points)
        # deal with boundary transitions
        switch_positions[0, 0] = -1
        switch_positions[-1, 1] = -1

        # add columns into dataframe
        items += [('lpos', switch_positions[:, 0]),
                  ('rpos', switch_positions[:, 1])]

    import pandas
    return pandas.DataFrame.from_dict(OrderedDict(items))
def create_allele_mapping(ref, alt, alleles, dtype='i1'):
    """Create an array mapping variant alleles into a different allele index
    system.

    Parameters
    ----------
    ref : array_like, S1, shape (n_variants,)
        Reference alleles.
    alt : array_like, S1, shape (n_variants, n_alt_alleles)
        Alternate alleles.
    alleles : array_like, S1, shape (n_variants, n_alleles)
        Alleles defining the new allele indexing.
    dtype : dtype, optional
        Output dtype.

    Returns
    -------
    mapping : ndarray, int8, shape (n_variants, n_alt_alleles + 1)

    Examples
    --------
    Example with biallelic variants::

        >>> import allel
        >>> ref = [b'A', b'C', b'T', b'G']
        >>> alt = [b'T', b'G', b'C', b'A']
        >>> alleles = [[b'A', b'T'],  # no transformation
        ...            [b'G', b'C'],  # swap
        ...            [b'T', b'A'],  # 1 missing
        ...            [b'A', b'C']]  # 1 missing
        >>> mapping = allel.create_allele_mapping(ref, alt, alleles)
        >>> mapping
        array([[ 0,  1],
               [ 1,  0],
               [ 0, -1],
               [-1,  0]], dtype=int8)

    Example with multiallelic variants::

        >>> ref = [b'A', b'C', b'T']
        >>> alt = [[b'T', b'G'],
        ...        [b'A', b'T'],
        ...        [b'G', b'.']]
        >>> alleles = [[b'A', b'T'],
        ...            [b'C', b'T'],
        ...            [b'G', b'A']]
        >>> mapping = allel.create_allele_mapping(ref, alt, alleles)
        >>> mapping
        array([[ 0,  1, -1],
               [ 0, -1,  1],
               [-1,  0, -1]], dtype=int8)

    See Also
    --------
    GenotypeArray.map_alleles, HaplotypeArray.map_alleles,
    AlleleCountsArray.map_alleles

    """

    ref = asarray_ndim(ref, 1)
    alt = asarray_ndim(alt, 1, 2)
    alleles = asarray_ndim(alleles, 1, 2)
    check_dim0_aligned(ref, alt, alleles)

    # reshape for convenience
    ref = ref[:, None]
    if alt.ndim == 1:
        alt = alt[:, None]
    if alleles.ndim == 1:
        alleles = alleles[:, None]
    source_alleles = np.append(ref, alt, axis=1)

    # setup output array
    out = np.empty(source_alleles.shape, dtype=dtype)
    out.fill(-1)

    # find matches
    for ai in range(source_alleles.shape[1]):
        match = source_alleles[:, ai, None] == alleles
        match_i, match_j = match.nonzero()
        out[match_i, ai] = match_j

    return out
def xpnsl(h1, h2, use_threads=True):
    """Cross-population version of the NSL statistic.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPNSL scores.

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    check_dim0_aligned(h1, h2)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(nsl_scan, args=(h1,))
        res2_fwd = pool.apply_async(nsl_scan, args=(h2,))

        # scan backward
        res1_rev = pool.apply_async(nsl_scan, args=(h1[::-1],))
        res2_rev = pool.apply_async(nsl_scan, args=(h2[::-1],))

        # wait for all scans to finish
        pool.close()
        pool.join()

        # obtain results
        nsl1_fwd = res1_fwd.get()
        nsl2_fwd = res2_fwd.get()
        nsl1_rev = res1_rev.get()
        nsl2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        nsl1_fwd = nsl_scan(h1)
        nsl2_fwd = nsl_scan(h2)

        # scan backward
        nsl1_rev = nsl_scan(h1[::-1])
        nsl2_rev = nsl_scan(h2[::-1])

    # handle reverse scans
    nsl1_rev = nsl1_rev[::-1]
    nsl2_rev = nsl2_rev[::-1]

    # compute unstandardized score
    nsl1 = nsl1_fwd + nsl1_rev
    nsl2 = nsl2_fwd + nsl2_rev
    score = np.log(nsl1 / nsl2)

    return score
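# Minimal usage sketch for ``xpnsl``, added for illustration with invented
# haplotype data: positive scores suggest selection in the first population,
# negative scores in the second. Scores are normally standardized genome-wide
# before interpretation.
def _example_xpnsl():
    import numpy as np
    h1 = np.array([[0, 0, 1], [0, 1, 1], [1, 1, 1], [0, 0, 0]], dtype='i1')
    h2 = np.array([[0, 1, 0], [1, 0, 1], [0, 0, 1], [1, 1, 0]], dtype='i1')
    return xpnsl(h1, h2, use_threads=False)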
def mean_pairwise_difference(ac, an=None, fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from within a single population.

    Parameters
    ----------
    ac : array_like, int, shape (n_variants, n_alleles)
        Allele counts array.
    an : array_like, int, shape (n_variants,), optional
        Allele numbers. If not provided, will be calculated from `ac`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----
    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide diversity, a.k.a. *pi*.

    Examples
    --------
    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac = h.count_alleles()
    >>> allel.mean_pairwise_difference(ac)
    array([0.        , 0.5       , 0.66666667, 0.5       , 0.        ,
           0.83333333, 0.83333333, 1.        ])

    See Also
    --------
    sequence_diversity, windowed_diversity

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes within a single population, generalising to any
    # number of alleles.

    # check inputs
    ac = asarray_ndim(ac, 2)

    # total number of haplotypes
    if an is None:
        an = np.sum(ac, axis=1)
    else:
        an = asarray_ndim(an, 1)
        check_dim0_aligned(ac, an)

    # total number of pairwise comparisons for each variant:
    # (an choose 2)
    n_pairs = an * (an - 1) / 2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac choose 2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac * (ac - 1) / 2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd
def mean_pairwise_difference_between(ac1, ac2, an1=None, an2=None,
                                     fill=np.nan):
    """Calculate for each variant the mean number of pairwise differences
    between chromosomes sampled from two different populations.

    Parameters
    ----------
    ac1 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the first population.
    ac2 : array_like, int, shape (n_variants, n_alleles)
        Allele counts array from the second population.
    an1 : array_like, int, shape (n_variants,), optional
        Allele numbers for the first population. If not provided, will be
        calculated from `ac1`.
    an2 : array_like, int, shape (n_variants,), optional
        Allele numbers for the second population. If not provided, will be
        calculated from `ac2`.
    fill : float
        Use this value where there are no pairs to compare (e.g.,
        all allele calls are missing).

    Returns
    -------
    mpd : ndarray, float, shape (n_variants,)

    Notes
    -----
    The values returned by this function can be summed over a genome
    region and divided by the number of accessible bases to estimate
    nucleotide divergence between two populations, a.k.a. *Dxy*.

    Examples
    --------
    >>> import allel
    >>> h = allel.HaplotypeArray([[0, 0, 0, 0],
    ...                           [0, 0, 0, 1],
    ...                           [0, 0, 1, 1],
    ...                           [0, 1, 1, 1],
    ...                           [1, 1, 1, 1],
    ...                           [0, 0, 1, 2],
    ...                           [0, 1, 1, 2],
    ...                           [0, 1, -1, -1]])
    >>> ac1 = h.count_alleles(subpop=[0, 1])
    >>> ac2 = h.count_alleles(subpop=[2, 3])
    >>> allel.mean_pairwise_difference_between(ac1, ac2)
    array([0.  , 0.5 , 1.  , 0.5 , 0.  , 1.  , 0.75,  nan])

    See Also
    --------
    sequence_divergence, windowed_divergence

    """

    # This function calculates the mean number of pairwise differences
    # between haplotypes from two different populations, generalising to any
    # number of alleles.

    # check inputs
    ac1 = asarray_ndim(ac1, 2)
    ac2 = asarray_ndim(ac2, 2)
    check_dim0_aligned(ac1, ac2)
    ac1, ac2 = ensure_dim1_aligned(ac1, ac2)

    # total number of haplotypes sampled from each population
    if an1 is None:
        an1 = np.sum(ac1, axis=1)
    else:
        an1 = asarray_ndim(an1, 1)
        check_dim0_aligned(ac1, an1)
    if an2 is None:
        an2 = np.sum(ac2, axis=1)
    else:
        an2 = asarray_ndim(an2, 1)
        check_dim0_aligned(ac2, an2)

    # total number of pairwise comparisons for each variant
    n_pairs = an1 * an2

    # number of pairwise comparisons where there is no difference:
    # sum of (ac1 * ac2) for each allele (i.e., number of ways to
    # choose the same allele twice)
    n_same = np.sum(ac1 * ac2, axis=1)

    # number of pairwise differences
    n_diff = n_pairs - n_same

    # mean number of pairwise differences, accounting for cases where
    # there are no pairs
    with ignore_invalid():
        mpd = np.where(n_pairs > 0, n_diff / n_pairs, fill)

    return mpd
def ihs(h, pos, map_pos=None, min_ehh=0.05, min_maf=0.05, include_edges=False,
        gap_scale=20000, max_gap=200000, is_accessible=None, use_threads=True):
    """Compute the unstandardized integrated haplotype score (IHS) for each
    variant, comparing integrated haplotype homozygosity between the
    reference (0) and alternate (1) alleles.

    Parameters
    ----------
    h : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array.
    pos : array_like, int, shape (n_variants,)
        Variant positions (physical distance).
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh : float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    min_maf : float, optional
        Do not compute integrated haplotype homozygosity for variants with
        minor allele frequency below this value.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized IHS scores.

    Notes
    -----
    This function will calculate IHS for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype array
    before passing to this function.

    This function computes IHS comparing the reference and alternate
    alleles. These can be polarised by switching the sign for any variant
    where the reference allele is derived.

    This function returns NaN for any IHS calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized in different allele frequency bins.

    See Also
    --------
    standardize_by_allele_count

    """

    # check inputs
    h = asarray_ndim(h, 2)
    check_integer_dtype(h)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, min_maf=min_maf,
                  include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # run with threads

        # create pool
        pool = ThreadPool(2)

        # scan forward
        result_fwd = pool.apply_async(ihh01_scan, (h, gaps), kwargs)

        # scan backward
        result_rev = pool.apply_async(ihh01_scan, (h[::-1], gaps[::-1]),
                                      kwargs)

        # wait for both to finish
        pool.close()
        pool.join()

        # obtain results
        ihh0_fwd, ihh1_fwd = result_fwd.get()
        ihh0_rev, ihh1_rev = result_rev.get()

        # cleanup
        pool.terminate()

    else:
        # run without threads

        # scan forward
        ihh0_fwd, ihh1_fwd = ihh01_scan(h, gaps, **kwargs)

        # scan backward
        ihh0_rev, ihh1_rev = ihh01_scan(h[::-1], gaps[::-1], **kwargs)

    # handle reverse scan
    ihh0_rev = ihh0_rev[::-1]
    ihh1_rev = ihh1_rev[::-1]

    # compute unstandardized score
    ihh0 = ihh0_fwd + ihh0_rev
    ihh1 = ihh1_fwd + ihh1_rev
    score = np.log(ihh1 / ihh0)

    return score
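# Hedged end-to-end sketch (helper name invented): compute unstandardized IHS
# and then standardize within allele count bins, following the See Also
# section above. Assumes 0/1 haplotype coding with no missing (-1) calls, so
# that summing each row gives the alternate allele count.
def _example_ihs_workflow(h, pos):
    import numpy as np
    score = ihs(h, pos, include_edges=True, use_threads=False)
    # alternate allele counts per variant, for frequency binning
    aac = np.asarray(h).sum(axis=1)
    score_std, bins = standardize_by_allele_count(
        score, aac, n_bins=20, diagnostics=False)
    return score_std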
def xpehh(h1, h2, pos, map_pos=None, min_ehh=0.05, include_edges=False,
          gap_scale=20000, max_gap=200000, is_accessible=None,
          use_threads=True):
    """Compute the unstandardized cross-population extended haplotype
    homozygosity score (XPEHH) for each variant.

    Parameters
    ----------
    h1 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the first population.
    h2 : array_like, int, shape (n_variants, n_haplotypes)
        Haplotype array for the second population.
    pos : array_like, int, shape (n_variants,)
        Variant positions on physical or genetic map.
    map_pos : array_like, float, shape (n_variants,)
        Variant positions (genetic map distance).
    min_ehh : float, optional
        Minimum EHH beyond which to truncate integrated haplotype
        homozygosity calculation.
    include_edges : bool, optional
        If True, report scores even if EHH does not decay below `min_ehh`
        before reaching the edge of the data.
    gap_scale : int, optional
        Rescale distance between variants if gap is larger than this value.
    max_gap : int, optional
        Do not report scores if EHH spans a gap larger than this number of
        base pairs.
    is_accessible : array_like, bool, optional
        Genome accessibility array. If provided, distance between variants
        will be computed as the number of accessible bases between them.
    use_threads : bool, optional
        If True use multiple threads to compute.

    Returns
    -------
    score : ndarray, float, shape (n_variants,)
        Unstandardized XPEHH scores.

    Notes
    -----
    This function will calculate XPEHH for all variants. To exclude variants
    below a given minor allele frequency, filter the input haplotype arrays
    before passing to this function.

    This function returns NaN for any EHH calculations where haplotype
    homozygosity does not decay below `min_ehh` before reaching the first or
    last variant. To disable this behaviour, set `include_edges` to True.

    Note that the unstandardized score is returned. Usually these scores are
    then standardized genome-wide.

    Haplotype arrays from the two populations may have different numbers of
    haplotypes.

    See Also
    --------
    standardize

    """

    # check inputs
    h1 = asarray_ndim(h1, 2)
    check_integer_dtype(h1)
    h2 = asarray_ndim(h2, 2)
    check_integer_dtype(h2)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(h1, h2, pos)

    # compute gaps between variants for integration
    gaps = compute_ihh_gaps(pos, map_pos, gap_scale, max_gap, is_accessible)

    # setup kwargs
    kwargs = dict(min_ehh=min_ehh, include_edges=include_edges)

    if use_threads and multiprocessing.cpu_count() > 1:
        # use multiple threads

        # setup threadpool
        pool = ThreadPool(min(4, multiprocessing.cpu_count()))

        # scan forward
        res1_fwd = pool.apply_async(ihh_scan, (h1, gaps), kwargs)
        res2_fwd = pool.apply_async(ihh_scan, (h2, gaps), kwargs)

        # scan backward
        res1_rev = pool.apply_async(ihh_scan, (h1[::-1], gaps[::-1]), kwargs)
        res2_rev = pool.apply_async(ihh_scan, (h2[::-1], gaps[::-1]), kwargs)

        # wait for all scans to finish
        pool.close()
        pool.join()

        # obtain results
        ihh1_fwd = res1_fwd.get()
        ihh2_fwd = res2_fwd.get()
        ihh1_rev = res1_rev.get()
        ihh2_rev = res2_rev.get()

        # cleanup
        pool.terminate()

    else:
        # compute without threads

        # scan forward
        ihh1_fwd = ihh_scan(h1, gaps, **kwargs)
        ihh2_fwd = ihh_scan(h2, gaps, **kwargs)

        # scan backward
        ihh1_rev = ihh_scan(h1[::-1], gaps[::-1], **kwargs)
        ihh2_rev = ihh_scan(h2[::-1], gaps[::-1], **kwargs)

    # handle reverse scans
    ihh1_rev = ihh1_rev[::-1]
    ihh2_rev = ihh2_rev[::-1]

    # compute unstandardized score
    ihh1 = ihh1_fwd + ihh1_rev
    ihh2 = ihh2_fwd + ihh2_rev
    score = np.log(ihh1 / ihh2)

    return score
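# Hedged usage sketch for ``xpehh`` (helper name invented; haplotype data
# taken as parameters): the unstandardized scores are typically standardized
# genome-wide before identifying outliers. Here ``standardize`` refers to the
# genome-wide standardization named in the See Also section above, assumed to
# be available in this module's namespace.
def _example_xpehh(h1, h2, pos):
    score = xpehh(h1, h2, pos, use_threads=False)
    return standardize(score)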
def pbs(ac1, ac2, ac3, window_size, window_start=0, window_stop=None,
        window_step=None, normed=True):
    """Compute the population branching statistic (PBS) which performs a
    comparison of allele frequencies between three populations to detect
    genome regions that are unusually differentiated in one population
    relative to the other two populations.

    Parameters
    ----------
    ac1 : array_like, int
        Allele counts from the first population.
    ac2 : array_like, int
        Allele counts from the second population.
    ac3 : array_like, int
        Allele counts from the third population.
    window_size : int
        The window size (number of variants) within which to compute PBS
        values.
    window_start : int, optional
        The variant index at which to start windowed calculations.
    window_stop : int, optional
        The variant index at which to stop windowed calculations.
    window_step : int, optional
        The number of variants between start positions of windows. If not
        given, defaults to the window size, i.e., non-overlapping windows.
    normed : bool, optional
        If True (default), use the normalised version of PBS, also known as
        PBSn1 [2]_. Otherwise, use the PBS statistic as originally defined
        in [1]_.

    Returns
    -------
    pbs : ndarray, float
        Windowed PBS values.

    Notes
    -----
    The F\ :sub:`ST` calculations use Hudson's estimator.

    References
    ----------
    .. [1] Yi et al., "Sequencing of Fifty Human Exomes Reveals Adaptation to
        High Altitude", Science, 329(5987): 75–78, 2 July 2010.
    .. [2] Malaspinas et al., "A genomic history of Aboriginal Australia",
        Nature. volume 538, pages 207–214, 13 October 2016.

    """

    # normalise and check inputs
    ac1 = AlleleCountsArray(ac1)
    ac2 = AlleleCountsArray(ac2)
    ac3 = AlleleCountsArray(ac3)
    check_dim0_aligned(ac1, ac2, ac3)

    # compute fst
    fst12 = moving_hudson_fst(ac1, ac2, size=window_size, start=window_start,
                              stop=window_stop, step=window_step)
    fst13 = moving_hudson_fst(ac1, ac3, size=window_size, start=window_start,
                              stop=window_stop, step=window_step)
    fst23 = moving_hudson_fst(ac2, ac3, size=window_size, start=window_start,
                              stop=window_stop, step=window_step)

    # clip fst values to avoid infinite if fst is 1
    for x in fst12, fst13, fst23:
        np.clip(x, a_min=0, a_max=0.99999, out=x)

    # compute fst transform
    t12 = -np.log(1 - fst12)
    t13 = -np.log(1 - fst13)
    t23 = -np.log(1 - fst23)

    # compute pbs
    ret = (t12 + t13 - t23) / 2

    if normed:
        # compute pbs normalising constant
        norm = 1 + (t12 + t13 + t23) / 2
        ret = ret / norm

    return ret
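# Minimal usage sketch for ``pbs``, added for illustration with invented
# allele counts: non-overlapping windows of 2 variants. Elevated values in a
# window suggest the first population has differentiated there relative to
# the other two.
def _example_pbs():
    import numpy as np
    ac1 = np.array([[4, 0], [0, 4], [1, 3], [4, 0]])
    ac2 = np.array([[2, 2], [2, 2], [3, 1], [2, 2]])
    ac3 = np.array([[2, 2], [1, 3], [3, 1], [2, 2]])
    return pbs(ac1, ac2, ac3, window_size=2)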
def roh_mhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01),
             transition=1e-6, min_roh=0, is_accessible=None,
             contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a
    genotype vector.

    This function computes the likely ROH using a Multinomial HMM model.
    There are 3 observable states at each position in a chromosome/contig:
    0 = Hom, 1 = Het, 2 = inaccessible (i.e., unobserved).

    The model is provided with a probability of observing a het in a ROH
    (`phet_roh`) and one or more probabilities of observing a het in a
    non-ROH, as this probability may not be constant across the genome
    (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos : array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh : float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh : tuple of floats, optional
        One or more probabilities of observing a heterozygote outside of
        ROH. Appropriate values will depend primarily on nucleotide
        diversity within the population, but also on mutation rate and
        genotype error rate.
    transition : float, optional
        Probability of moving between states.
    min_roh : integer, optional
        Minimum size (bp) to consider as a ROH. Will depend on contig size
        and recombination rate.
    is_accessible : array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether
        accessible or not.
    contig_size : int, optional
        If is_accessible is not known/not provided, allows specification of
        total length of contig.

    Returns
    -------
    df_roh : DataFrame
        Data frame where each row describes a run of homozygosity. Columns
        are 'start', 'stop', 'length' and 'is_marginal'. Start and stop are
        1-based, stop-inclusive.
    froh : float
        Proportion of genome in a ROH.

    Notes
    -----
    This function requires `hmmlearn
    <http://hmmlearn.readthedocs.io/en/latest/>`_ to be installed.

    This function currently requires around 4GB memory for a contig size of
    ~50Mbp.

    """

    from hmmlearn import hmm

    # setup inputs
    if isinstance(phet_nonroh, float):
        phet_nonroh = phet_nonroh,
    gv = GenotypeVector(gv)
    pos = asarray_ndim(pos, 1)
    check_dim0_aligned(gv, pos)
    # N.B., allow_none so the is-None check below still works
    is_accessible = asarray_ndim(is_accessible, 1, dtype=bool,
                                 allow_none=True)

    # heterozygote probabilities
    het_px = np.concatenate([(phet_roh,), phet_nonroh])

    # start probabilities (all equal)
    start_prob = np.repeat(1 / het_px.size, het_px.size)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, het_px.size)

    # probability of inaccessible
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessible argument is not provided, you must "
                "provide contig_size")
        p_accessible = 1.0
    else:
        p_accessible = is_accessible.mean()
        contig_size = is_accessible.size

    emission_mx = _mhmm_derive_emission_matrix(het_px, p_accessible)

    # initialize HMM
    roh_hmm = hmm.MultinomialHMM(n_components=het_px.size)
    roh_hmm.n_symbols_ = 3
    roh_hmm.startprob_ = start_prob
    roh_hmm.transmat_ = transition_mx
    roh_hmm.emissionprob_ = emission_mx

    # locate heterozygous calls
    is_het = gv.is_het()

    # predict ROH state
    pred, obs = _mhmm_predict_roh_state(roh_hmm, is_het, pos, is_accessible,
                                        contig_size)

    # find ROH windows
    df_blocks = tabulate_state_blocks(pred, states=list(range(len(het_px))))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

    # adapt the dataframe for ROH
    for col in 'state', 'support', 'start_lidx', 'stop_ridx', 'size_max':
        del df_roh[col]
    df_roh.rename(columns={'start_ridx': 'start', 'stop_lidx': 'stop',
                           'size_min': 'length'}, inplace=True)

    # make coordinates 1-based
    df_roh['start'] = df_roh['start'] + 1
    df_roh['stop'] = df_roh['stop'] + 1

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH
    froh = df_roh.length.sum() / contig_size

    return df_roh, froh