def Garud_H( ds: Dataset, *, call_genotype: Hashable = variables.call_genotype, cohorts: Optional[Sequence[Union[int, str]]] = None, merge: bool = True, ) -> Dataset: """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures of soft sweeps, as defined in Garud et al. (2015). By default, values of this statistic are calculated across all variants. To compute values in windows, call :func:`window` before calling this function. Parameters ---------- ds Genotype call dataset. call_genotype Input variable name holding call_genotype as defined by :data:`sgkit.variables.call_genotype_spec`. Must be present in ``ds``. cohorts The cohorts to compute statistics for, specified as a sequence of cohort indexes or IDs. None (the default) means compute statistics for all cohorts. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the following variables: - `stat_Garud_h1` (windows, cohorts): Garud H1 statistic. Defined by :data:`sgkit.variables.stat_Garud_h1_spec`. - `stat_Garud_h12` (windows, cohorts): Garud H12 statistic. Defined by :data:`sgkit.variables.stat_Garud_h12_spec`. - `stat_Garud_h123` (windows, cohorts): Garud H123 statistic. Defined by :data:`sgkit.variables.stat_Garud_h123_spec`. - `stat_Garud_h2_h1` (windows, cohorts): Garud H2/H1 statistic. Defined by :data:`sgkit.variables.stat_Garud_h2_h1_spec`. Raises ------ NotImplementedError If the dataset is not diploid. Warnings -------- This function is currently only implemented for diploid datasets. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> # Divide into windows of size three (variants) >>> ds = sg.window(ds, size=3, step=3) >>> gh = sg.Garud_H(ds) >>> gh["stat_Garud_h1"].values # doctest: +NORMALIZE_WHITESPACE array([[0.25 , 0.375], [0.375, 0.375]]) >>> gh["stat_Garud_h12"].values # doctest: +NORMALIZE_WHITESPACE array([[0.375, 0.625], [0.625, 0.625]]) >>> gh["stat_Garud_h123"].values # doctest: +NORMALIZE_WHITESPACE array([[0.625, 1. ], [1. , 1. ]]) >>> gh["stat_Garud_h2_h1"].values # doctest: +NORMALIZE_WHITESPACE array([[0.75 , 0.33333333], [0.33333333, 0.33333333]]) """ if ds.dims["ploidy"] != 2: raise NotImplementedError( "Garud H only implemented for diploid genotypes") if not has_windows(ds): raise ValueError("Dataset must be windowed for Garud_H") variables.validate(ds, {call_genotype: variables.call_genotype_spec}) gt = ds[call_genotype] # convert sample cohorts to haplotype layout sc = ds.sample_cohort.values hsc = np.stack((sc, sc), axis=1).ravel() # TODO: assumes diploid n_cohorts = sc.max() + 1 # 0-based indexing cohorts = cohorts or range(n_cohorts) ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts", None)) gh = window_statistic( gt, lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts, ct), ds.window_start.values, ds.window_stop.values, dtype=np.float64, # first chunks dimension is windows, computed in window_statistic chunks=(-1, n_cohorts, N_GARUD_H_STATS), ) n_windows = ds.window_start.shape[0] assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS) new_ds = create_dataset({ variables.stat_Garud_h1: ( ("windows", "cohorts"), gh[:, :, 0], ), variables.stat_Garud_h12: ( ("windows", "cohorts"), gh[:, :, 1], ), variables.stat_Garud_h123: ( ("windows", "cohorts"), gh[:, :, 2], ), variables.stat_Garud_h2_h1: ( ("windows", "cohorts"), gh[:, :, 3], ), }) return conditional_merge_datasets(ds, new_ds, merge)
def diversity( ds: Dataset, *, cohort_allele_count: Hashable = variables.cohort_allele_count, merge: bool = True, ) -> Dataset: """Compute diversity from cohort allele counts. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window` before calling this function. Parameters ---------- ds Genotype call dataset. cohort_allele_count Cohort allele count variable to use or calculate. Defined by :data:`sgkit.variables.cohort_allele_count_spec`. If the variable is not present in ``ds``, it will be computed using :func:`count_cohort_alleles`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the diversity values, as defined by :data:`sgkit.variables.stat_diversity_spec`. Shape (variants, cohorts), or (windows, cohorts) if windowing information is available. Warnings -------- This method does not currently support datasets that are chunked along the samples dimension. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE array([[0.5 , 0.66666667], [0.66666667, 0.5 ], [0.66666667, 0.66666667], [0.5 , 0.5 ], [0.5 , 0.5 ]]) >>> # Divide into windows of size three (variants) >>> ds = sg.window(ds, size=3) >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE array([[1.83333333, 1.83333333], [1. , 1. ]]) """ ds = define_variable_if_absent(ds, variables.cohort_allele_count, cohort_allele_count, count_cohort_alleles) variables.validate( ds, {cohort_allele_count: variables.cohort_allele_count_spec}) ac = ds[cohort_allele_count] an = ac.sum(axis=2) n_pairs = an * (an - 1) / 2 n_same = (ac * (ac - 1) / 2).sum(axis=2) n_diff = n_pairs - n_same # replace zeros to avoid divide by zero error n_pairs_na = n_pairs.where(n_pairs != 0) pi = n_diff / n_pairs_na if has_windows(ds): div = window_statistic( pi, np.sum, ds.window_start.values, ds.window_stop.values, dtype=pi.dtype, axis=0, ) new_ds = create_dataset( {variables.stat_diversity: ( ("windows", "cohorts"), div, )}) else: new_ds = create_dataset( {variables.stat_diversity: ( ("variants", "cohorts"), pi, )}) return conditional_merge_datasets(ds, new_ds, merge)
def observed_heterozygosity( ds: Dataset, *, call_heterozygosity: Hashable = variables.call_heterozygosity, sample_cohort: Hashable = variables.sample_cohort, merge: bool = True, ) -> Dataset: """Compute per cohort observed heterozygosity. The observed heterozygosity of a cohort is the mean of individual heterozygosity values among all samples of that cohort as described in :func:`individual_heterozygosity`. Calls with a nan value for individual heterozygosity are ignored when calculating the cohort mean. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling this function. Parameters ---------- ds Dataset containing genotype calls. call_heterozygosity Input variable name holding call_heterozygosity as defined by :data:`sgkit.variables.call_heterozygosity_spec`. If the variable is not present in ``ds``, it will be computed using :func:`individual_heterozygosity`. sample_cohort Input variable name holding sample_cohort as defined by :data:`sgkit.variables.sample_cohort_spec`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing :data:`sgkit.variables.stat_observed_heterozygosity_spec` of per cohort observed heterozygosity with shape (variants, cohorts) containing values within the inteval [0, 1] or nan. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE array([[0.5, 1. ], [1. , 0.5], [0. , 1. ], [0.5, 0.5], [0.5, 0.5]]) >>> # Divide into windows of size three (variants) >>> ds = sg.window_by_variant(ds, size=3) >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE array([[1.5, 2.5], [1. , 1. ]]) """ ds = define_variable_if_absent( ds, variables.call_heterozygosity, call_heterozygosity, individual_heterozygosity, ) variables.validate( ds, {call_heterozygosity: variables.call_heterozygosity_spec}) hi = da.asarray(ds[call_heterozygosity]) sc = da.asarray(ds[sample_cohort]) n_cohorts = sc.max().compute() + 1 shape = (hi.chunks[0], n_cohorts) n = da.zeros(n_cohorts, dtype=np.uint8) ho = da.map_blocks( _cohort_observed_heterozygosity, hi, sc, n, chunks=shape, drop_axis=1, new_axis=1, dtype=np.float64, ) if has_windows(ds): ho_sum = window_statistic( ho, np.sum, ds.window_start.values, ds.window_stop.values, dtype=ho.dtype, axis=0, ) new_ds = create_dataset({ variables.stat_observed_heterozygosity: ( ("windows", "cohorts"), ho_sum, ) }) else: new_ds = create_dataset({ variables.stat_observed_heterozygosity: ( ("variants", "cohorts"), ho, ) }) return conditional_merge_datasets(ds, new_ds, merge)
def divergence( ds: Dataset, *, cohort_allele_count: Hashable = variables.cohort_allele_count, merge: bool = True, ) -> Dataset: """Compute divergence between pairs of cohorts. The entry at (i, j) is the divergence between for cohort i and cohort j, except for the case where i and j are the same, in which case the entry is the diversity for cohort i. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window` before calling this function. Parameters ---------- ds Genotype call dataset. cohort_allele_count Cohort allele count variable to use or calculate. Defined by :data:`sgkit.variables.cohort_allele_count_spec`. If the variable is not present in ``ds``, it will be computed using :func:`count_cohort_alleles`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the divergence value between pairs of cohorts, as defined by :data:`sgkit.variables.stat_divergence_spec`. Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing information is available. Warnings -------- This method does not currently support datasets that are chunked along the samples dimension. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE array([[[0.5 , 0.5 ], [0.5 , 0.66666667]], <BLANKLINE> [[0.66666667, 0.5 ], [0.5 , 0.5 ]], <BLANKLINE> [[0.66666667, 0.5 ], [0.5 , 0.66666667]], <BLANKLINE> [[0.5 , 0.375 ], [0.375 , 0.5 ]], <BLANKLINE> [[0.5 , 0.625 ], [0.625 , 0.5 ]]]) >>> # Divide into windows of size three (variants) >>> ds = sg.window(ds, size=3) >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE array([[[1.83333333, 1.5 ], [1.5 , 1.83333333]], <BLANKLINE> [[1. , 1. ], [1. , 1. ]]]) """ ds = define_variable_if_absent(ds, variables.cohort_allele_count, cohort_allele_count, count_cohort_alleles) variables.validate( ds, {cohort_allele_count: variables.cohort_allele_count_spec}) ac = ds[cohort_allele_count] n_variants = ds.dims["variants"] n_cohorts = ds.dims["cohorts"] ac = da.asarray(ac) shape = (ac.chunks[0], n_cohorts, n_cohorts) d = da.map_blocks(_divergence, ac, chunks=shape, dtype=np.float64) assert_array_shape(d, n_variants, n_cohorts, n_cohorts) if has_windows(ds): div = window_statistic( d, np.sum, ds.window_start.values, ds.window_stop.values, dtype=d.dtype, axis=0, ) new_ds = create_dataset({ variables.stat_divergence: ( ("windows", "cohorts_0", "cohorts_1"), div, ) }) else: new_ds = create_dataset({ variables.stat_divergence: ( ("variants", "cohorts_0", "cohorts_1"), d, ) }) return conditional_merge_datasets(ds, new_ds, merge)
def Tajimas_D( ds: Dataset, *, variant_allele_count: Hashable = variables.variant_allele_count, stat_diversity: Hashable = variables.stat_diversity, merge: bool = True, ) -> Dataset: """Compute Tajimas' D for a genotype call dataset. By default, values of this statistic are calculated per variant. To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling this function. Parameters ---------- ds Genotype call dataset. variant_allele_count Variant allele count variable to use or calculate. Defined by :data:`sgkit.variables.variant_allele_count_spec`. If the variable is not present in ``ds``, it will be computed using :func:`count_variant_alleles`. stat_diversity Diversity variable to use or calculate. Defined by :data:`sgkit.variables.stat_diversity_spec`. If the variable is not present in ``ds``, it will be computed using :func:`diversity`. merge If True (the default), merge the input dataset and the computed output variables into a single dataset, otherwise return only the computed output variables. See :ref:`dataset_merge` for more details. Returns ------- A dataset containing the Tajimas' D value, as defined by :data:`sgkit.variables.stat_Tajimas_D_spec`. Shape (variants, cohorts), or (windows, cohorts) if windowing information is available. Warnings -------- This method does not currently support datasets that are chunked along the samples dimension. Examples -------- >>> import numpy as np >>> import sgkit as sg >>> import xarray as xr >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4) >>> # Divide samples into two cohorts >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2) >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples") >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE array([[0.88883234, 2.18459998], [2.18459998, 0.88883234], [2.18459998, 2.18459998], [0.88883234, 0.88883234], [0.88883234, 0.88883234]]) >>> # Divide into windows of size three (variants) >>> ds = sg.window_by_variant(ds, size=3) >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE array([[2.40517586, 2.40517586], [1.10393559, 1.10393559]]) """ ds = define_variable_if_absent(ds, variables.variant_allele_count, variant_allele_count, count_variant_alleles) ds = define_variable_if_absent(ds, variables.stat_diversity, stat_diversity, diversity) variables.validate( ds, { variant_allele_count: variables.variant_allele_count_spec, stat_diversity: variables.stat_diversity_spec, }, ) ac = ds[variant_allele_count] ac = da.asarray(ac) # count segregating. Note that this uses the definition in tskit, # which is the number of alleles - 1. In the biallelic case this # gives us the number of non-monomorphic sites. S = (ac > 0).sum(axis=1) - 1 if has_windows(ds): S = window_statistic( S, np.sum, ds.window_start.values, ds.window_stop.values, dtype=S.dtype, axis=0, ) # assume number of chromosomes sampled is constant for all variants # NOTE: even tho ac has dtype uint, we promote the sum to float # because the computation below requires floats n = ac.sum(axis=1, dtype="float").max() # (n-1)th harmonic number a1 = (1 / da.arange(1, n)).sum() # calculate Watterson's theta (absolute value) theta = S / a1 # get diversity div = ds[stat_diversity] # N.B., both theta estimates are usually divided by the number of # (accessible) bases but here we want the absolute difference d = div - theta[:, np.newaxis] # calculate the denominator (standard deviation) a2 = (1 / (da.arange(1, n)**2)).sum() b1 = (n + 1) / (3 * (n - 1)) b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1)) c1 = b1 - (1 / a1) c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2)) e1 = c1 / a1 e2 = c2 / (a1**2 + a2) d_stdev = da.sqrt((e1 * S) + (e2 * S * (S - 1))) # Let IEEE decide the semantics of division by zero here. The return value # will be -inf, nan or +inf, depending on the value of the numerator. # Currently this will raise a RuntimeWarning, if we divide by zero. D = d / d_stdev[:, np.newaxis] if has_windows(ds): new_ds = create_dataset( {variables.stat_Tajimas_D: (["windows", "cohorts"], D.data)}) else: new_ds = create_dataset( {variables.stat_Tajimas_D: (["variants", "cohorts"], D.data)}) return conditional_merge_datasets(ds, new_ds, merge)