コード例 #1
0
def test_window__default_step():
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
    assert not has_windows(ds)
    ds = window(ds, 2)
    assert has_windows(ds)
    np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
    np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
    np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])
コード例 #2
0
def test_window():
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
    assert not has_windows(ds)
    ds = window(ds, 2, 2)
    assert has_windows(ds)
    np.testing.assert_equal(ds[window_contig].values, [0, 0, 0, 0, 0])
    np.testing.assert_equal(ds[window_start].values, [0, 2, 4, 6, 8])
    np.testing.assert_equal(ds[window_stop].values, [2, 4, 6, 8, 10])

    with pytest.raises(MergeWarning):
        window(ds, 2, 2)
コード例 #3
0
ファイル: test_window.py プロジェクト: hammer/sgkit
def test_window_by_position__equal_spaced_windows():
    ds = simulate_genotype_call_dataset(n_variant=5, n_sample=3, seed=0)
    assert not has_windows(ds)
    ds["variant_position"] = (
        ["variants"],
        np.array([1, 4, 6, 8, 12]),
    )
    ds = window_by_position(ds, size=5, offset=1)
    assert has_windows(ds)
    np.testing.assert_equal(ds[window_contig].values, [0, 0, 0])
    np.testing.assert_equal(ds[window_start].values, [0, 2, 4])
    np.testing.assert_equal(ds[window_stop].values, [2, 4, 5])
コード例 #4
0
ファイル: test_window.py プロジェクト: hammer/sgkit
def test_window_by_position__multiple_contigs():
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=3, n_contig=2)
    ds["variant_position"] = (
        ["variants"],
        np.array([1, 4, 6, 8, 12, 1, 21, 25, 40, 55]),
    )
    ds = window_by_position(ds,
                            size=10,
                            window_start_position="variant_position")
    assert has_windows(ds)
    np.testing.assert_equal(ds[window_contig].values,
                            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
    np.testing.assert_equal(ds[window_start].values,
                            [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    np.testing.assert_equal(ds[window_stop].values,
                            [4, 5, 5, 5, 5, 6, 8, 8, 9, 10])
コード例 #5
0
def Garud_H(
    ds: Dataset,
    *,
    call_genotype: Hashable = variables.call_genotype,
    cohorts: Optional[Sequence[Union[int, str]]] = None,
    merge: bool = True,
) -> Dataset:
    """Compute the H1, H12, H123 and H2/H1 statistics for detecting signatures
    of soft sweeps, as defined in Garud et al. (2015).

    By default, values of this statistic are calculated across all variants.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    call_genotype
        Input variable name holding call_genotype as defined by
        :data:`sgkit.variables.call_genotype_spec`.
        Must be present in ``ds``.
    cohorts
        The cohorts to compute statistics for, specified as a sequence of
        cohort indexes or IDs. None (the default) means compute statistics
        for all cohorts.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the following variables:

    - `stat_Garud_h1` (windows, cohorts): Garud H1 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h1_spec`.

    - `stat_Garud_h12` (windows, cohorts): Garud H12 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h12_spec`.

    - `stat_Garud_h123` (windows, cohorts): Garud H123 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h123_spec`.

    - `stat_Garud_h2_h1` (windows, cohorts): Garud H2/H1 statistic.
        Defined by :data:`sgkit.variables.stat_Garud_h2_h1_spec`.

    Raises
    ------
    NotImplementedError
        If the dataset is not diploid.

    Warnings
    --------
    This function is currently only implemented for diploid datasets.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3, step=3)

    >>> gh = sg.Garud_H(ds)
    >>> gh["stat_Garud_h1"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.25 , 0.375],
        [0.375, 0.375]])
    >>> gh["stat_Garud_h12"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.375, 0.625],
        [0.625, 0.625]])
    >>> gh["stat_Garud_h123"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.625, 1.   ],
        [1.   , 1.   ]])
    >>> gh["stat_Garud_h2_h1"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.75      , 0.33333333],
        [0.33333333, 0.33333333]])
    """

    if ds.dims["ploidy"] != 2:
        raise NotImplementedError(
            "Garud H only implemented for diploid genotypes")

    if not has_windows(ds):
        raise ValueError("Dataset must be windowed for Garud_H")

    variables.validate(ds, {call_genotype: variables.call_genotype_spec})

    gt = ds[call_genotype]

    # convert sample cohorts to haplotype layout
    sc = ds.sample_cohort.values
    hsc = np.stack((sc, sc), axis=1).ravel()  # TODO: assumes diploid
    n_cohorts = sc.max() + 1  # 0-based indexing
    cohorts = cohorts or range(n_cohorts)
    ct = _cohorts_to_array(cohorts, ds.indexes.get("cohorts", None))

    gh = window_statistic(
        gt,
        lambda gt: _Garud_h_cohorts(gt, hsc, n_cohorts, ct),
        ds.window_start.values,
        ds.window_stop.values,
        dtype=np.float64,
        # first chunks dimension is windows, computed in window_statistic
        chunks=(-1, n_cohorts, N_GARUD_H_STATS),
    )
    n_windows = ds.window_start.shape[0]
    assert_array_shape(gh, n_windows, n_cohorts, N_GARUD_H_STATS)
    new_ds = create_dataset({
        variables.stat_Garud_h1: (
            ("windows", "cohorts"),
            gh[:, :, 0],
        ),
        variables.stat_Garud_h12: (
            ("windows", "cohorts"),
            gh[:, :, 1],
        ),
        variables.stat_Garud_h123: (
            ("windows", "cohorts"),
            gh[:, :, 2],
        ),
        variables.stat_Garud_h2_h1: (
            ("windows", "cohorts"),
            gh[:, :, 3],
        ),
    })

    return conditional_merge_datasets(ds, new_ds, merge)
コード例 #6
0
def diversity(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute diversity from cohort allele counts.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the diversity values, as defined by :data:`sgkit.variables.stat_diversity_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5       , 0.66666667],
        [0.66666667, 0.5       ],
        [0.66666667, 0.66666667],
        [0.5       , 0.5       ],
        [0.5       , 0.5       ]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.diversity(ds)["stat_diversity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.83333333, 1.83333333],
        [1.        , 1.        ]])
    """
    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})

    ac = ds[cohort_allele_count]
    an = ac.sum(axis=2)
    n_pairs = an * (an - 1) / 2
    n_same = (ac * (ac - 1) / 2).sum(axis=2)
    n_diff = n_pairs - n_same
    # replace zeros to avoid divide by zero error
    n_pairs_na = n_pairs.where(n_pairs != 0)
    pi = n_diff / n_pairs_na

    if has_windows(ds):
        div = window_statistic(
            pi,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=pi.dtype,
            axis=0,
        )
        new_ds = create_dataset(
            {variables.stat_diversity: (
                ("windows", "cohorts"),
                div,
            )})
    else:
        new_ds = create_dataset(
            {variables.stat_diversity: (
                ("variants", "cohorts"),
                pi,
            )})
    return conditional_merge_datasets(ds, new_ds, merge)
コード例 #7
0
def divergence(
    ds: Dataset,
    *,
    cohort_allele_count: Hashable = variables.cohort_allele_count,
    merge: bool = True,
) -> Dataset:
    """Compute divergence between pairs of cohorts.

    The entry at (i, j) is the divergence between for cohort i and cohort j,
    except for the case where i and j are the same, in which case the entry
    is the diversity for cohort i.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    cohort_allele_count
        Cohort allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.cohort_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_cohort_alleles`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the divergence value between pairs of cohorts, as defined by
    :data:`sgkit.variables.stat_divergence_spec`.
    Shape (variants, cohorts, cohorts), or (windows, cohorts, cohorts) if windowing
    information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[0.5       , 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
        [[0.66666667, 0.5       ],
            [0.5       , 0.5       ]],
    <BLANKLINE>
        [[0.66666667, 0.5       ],
            [0.5       , 0.66666667]],
    <BLANKLINE>
        [[0.5       , 0.375     ],
            [0.375     , 0.5       ]],
    <BLANKLINE>
        [[0.5       , 0.625     ],
            [0.625     , 0.5       ]]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window(ds, size=3)
    >>> sg.divergence(ds)["stat_divergence"].values # doctest: +NORMALIZE_WHITESPACE
    array([[[1.83333333, 1.5       ],
            [1.5       , 1.83333333]],
    <BLANKLINE>
        [[1.        , 1.        ],
            [1.        , 1.        ]]])
    """

    ds = define_variable_if_absent(ds, variables.cohort_allele_count,
                                   cohort_allele_count, count_cohort_alleles)
    variables.validate(
        ds, {cohort_allele_count: variables.cohort_allele_count_spec})
    ac = ds[cohort_allele_count]

    n_variants = ds.dims["variants"]
    n_cohorts = ds.dims["cohorts"]
    ac = da.asarray(ac)
    shape = (ac.chunks[0], n_cohorts, n_cohorts)
    d = da.map_blocks(_divergence, ac, chunks=shape, dtype=np.float64)
    assert_array_shape(d, n_variants, n_cohorts, n_cohorts)

    if has_windows(ds):
        div = window_statistic(
            d,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=d.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("windows", "cohorts_0", "cohorts_1"),
                div,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_divergence: (
                ("variants", "cohorts_0", "cohorts_1"),
                d,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)
コード例 #8
0
ファイル: ld.py プロジェクト: aktech/sgkit
def ld_matrix(
    ds: Dataset,
    *,
    dosage: Hashable = variables.dosage,
    threshold: Optional[float] = None,
    variant_score: Optional[Hashable] = None,
) -> DataFrame:
    """Compute a sparse linkage disequilibrium (LD) matrix.

    This method computes the Rogers Huff R2 value for each pair of variants in
    a window, and returns those that exceed the provided threshold, as a sparse
    matrix dataframe.

    Parameters
    ----------
    ds
        Dataset containing genotype dosages. Must already be windowed with :func:`window`.
    dosage
        Name of genetic dosage variable.
        Defined by :data:`sgkit.variables.dosage_spec`.
    threshold
        R2 threshold below which no variant pairs will be returned. This should almost
        always be something at least slightly above 0 to avoid the large density very
        near zero LD present in most datasets.
    variant_score
        Optional name of variable to use to prioritize variant selection
        (e.g. minor allele frequency). Defaults to None.
        Defined by :data:`sgkit.variables.variant_score_spec`.

    Returns
    -------
    Upper triangle (including diagonal) of LD matrix as COO in dataframe.  Fields:

    - ``i``: Row (variant) index 1
    - ``j``: Row (variant) index 2
    - ``value``: R2 value
    - ``cmp``: If ``variant_score`` is provided, this is 1, 0, or -1 indicating whether or not ``i > j`` (1), ``i < j`` (-1), or ``i == j`` (0)

    Raises
    ------
    ValueError
        If the dataset is not windowed.
    """

    if not has_windows(ds):
        raise ValueError(
            "Dataset must be windowed for ld_matrix. See the sgkit.window function."
        )

    variables.validate(ds, {dosage: variables.dosage_spec})

    x = da.asarray(ds[dosage])

    threshold = threshold or np.nan

    if variant_score is not None:
        variables.validate(ds, {variant_score: variables.variant_score_spec})
        scores = da.asarray(ds[variant_score])
    else:
        scores = None

    # Find windows in each chunk
    window_starts = ds.window_start.values
    window_stops = ds.window_stop.values
    window_lengths = window_stops - window_starts

    chunks = x.chunks[0]
    chunk_starts = _sizes_to_start_offsets(chunks)
    rel_window_starts, windows_per_chunk = _get_chunked_windows(
        chunks, window_starts, window_stops)
    rel_window_stops = rel_window_starts + window_lengths
    chunk_offset_indexes = _sizes_to_start_offsets(windows_per_chunk)

    def to_ld_df(x: ArrayLike, chunk_index: int) -> DataFrame:
        chunk_offset_index_start = chunk_offset_indexes[chunk_index]
        chunk_offset_index_stop = chunk_offset_indexes[chunk_index + 1]
        chunk_window_starts = rel_window_starts[
            chunk_offset_index_start:chunk_offset_index_stop]
        chunk_window_stops = rel_window_stops[
            chunk_offset_index_start:chunk_offset_index_stop]
        max_stop = np.max(
            chunk_window_stops) if len(chunk_window_stops) > 0 else 0
        abs_chunk_start = chunk_starts[chunk_index]
        abs_chunk_end = abs_chunk_start + max_stop  # this may extend into later chunks
        block_x = x[abs_chunk_start:abs_chunk_end]
        block_scores = (scores[abs_chunk_start:abs_chunk_end]
                        if scores is not None else None)

        # Look at the next window (not in this chunk) to find out where to stop processing
        # windows in this chunk (see _ld_matrix_jit)
        if len(window_starts) == chunk_offset_index_stop:
            # if there are no more windows, then need to process the all windows in this chunk entirely
            chunk_max_window_start = max_stop
        else:
            # otherwise only process up the start of the next window
            chunk_max_window_start = (window_starts[chunk_offset_index_stop] -
                                      chunk_starts[chunk_index])

        index_dtype = np.int32
        value_dtype = np.float32

        f = dask.delayed(_ld_matrix)(
            block_x,
            chunk_window_starts,
            chunk_window_stops,
            abs_chunk_start,
            chunk_max_window_start,
            index_dtype,
            value_dtype,
            threshold=threshold,
            scores=block_scores,
        )
        meta = [("i", index_dtype), ("j", index_dtype), ("value", value_dtype)]
        if scores is not None:
            meta.append(("cmp", np.int8))
        return dd.from_delayed([f], meta=meta)

    return dd.concat([
        to_ld_df(x, chunk_index)
        for chunk_index in range(len(windows_per_chunk))
    ])
コード例 #9
0
def observed_heterozygosity(
    ds: Dataset,
    *,
    call_heterozygosity: Hashable = variables.call_heterozygosity,
    sample_cohort: Hashable = variables.sample_cohort,
    merge: bool = True,
) -> Dataset:
    """Compute per cohort observed heterozygosity.

    The observed heterozygosity of a cohort is the mean of individual
    heterozygosity values among all samples of that cohort as described
    in :func:`individual_heterozygosity`. Calls with a nan value for
    individual heterozygosity are ignored when calculating the cohort
    mean.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Dataset containing genotype calls.
    call_heterozygosity
        Input variable name holding call_heterozygosity as defined by
        :data:`sgkit.variables.call_heterozygosity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`individual_heterozygosity`.
    sample_cohort
        Input variable name holding sample_cohort as defined by
        :data:`sgkit.variables.sample_cohort_spec`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.
    Returns
    -------
    A dataset containing :data:`sgkit.variables.stat_observed_heterozygosity_spec`
    of per cohort observed heterozygosity with shape (variants, cohorts)
    containing values within the inteval [0, 1] or nan.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.5, 1. ],
        [1. , 0.5],
        [0. , 1. ],
        [0.5, 0.5],
        [0.5, 0.5]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.observed_heterozygosity(ds)["stat_observed_heterozygosity"].values # doctest: +NORMALIZE_WHITESPACE
    array([[1.5, 2.5],
        [1. , 1. ]])
    """
    ds = define_variable_if_absent(
        ds,
        variables.call_heterozygosity,
        call_heterozygosity,
        individual_heterozygosity,
    )
    variables.validate(
        ds, {call_heterozygosity: variables.call_heterozygosity_spec})
    hi = da.asarray(ds[call_heterozygosity])
    sc = da.asarray(ds[sample_cohort])
    n_cohorts = sc.max().compute() + 1
    shape = (hi.chunks[0], n_cohorts)
    n = da.zeros(n_cohorts, dtype=np.uint8)
    ho = da.map_blocks(
        _cohort_observed_heterozygosity,
        hi,
        sc,
        n,
        chunks=shape,
        drop_axis=1,
        new_axis=1,
        dtype=np.float64,
    )
    if has_windows(ds):
        ho_sum = window_statistic(
            ho,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=ho.dtype,
            axis=0,
        )
        new_ds = create_dataset({
            variables.stat_observed_heterozygosity: (
                ("windows", "cohorts"),
                ho_sum,
            )
        })
    else:
        new_ds = create_dataset({
            variables.stat_observed_heterozygosity: (
                ("variants", "cohorts"),
                ho,
            )
        })
    return conditional_merge_datasets(ds, new_ds, merge)
コード例 #10
0
def Tajimas_D(
    ds: Dataset,
    *,
    variant_allele_count: Hashable = variables.variant_allele_count,
    stat_diversity: Hashable = variables.stat_diversity,
    merge: bool = True,
) -> Dataset:
    """Compute Tajimas' D for a genotype call dataset.

    By default, values of this statistic are calculated per variant.
    To compute values in windows, call :func:`window_by_position` or :func:`window_by_variant` before calling
    this function.

    Parameters
    ----------
    ds
        Genotype call dataset.
    variant_allele_count
        Variant allele count variable to use or calculate. Defined by
        :data:`sgkit.variables.variant_allele_count_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`count_variant_alleles`.
    stat_diversity
        Diversity variable to use or calculate. Defined by
        :data:`sgkit.variables.stat_diversity_spec`.
        If the variable is not present in ``ds``, it will be computed
        using :func:`diversity`.
    merge
        If True (the default), merge the input dataset and the computed
        output variables into a single dataset, otherwise return only
        the computed output variables.
        See :ref:`dataset_merge` for more details.

    Returns
    -------
    A dataset containing the Tajimas' D value, as defined by :data:`sgkit.variables.stat_Tajimas_D_spec`.
    Shape (variants, cohorts), or (windows, cohorts) if windowing information is available.

    Warnings
    --------
    This method does not currently support datasets that are chunked along the
    samples dimension.

    Examples
    --------

    >>> import numpy as np
    >>> import sgkit as sg
    >>> import xarray as xr
    >>> ds = sg.simulate_genotype_call_dataset(n_variant=5, n_sample=4)

    >>> # Divide samples into two cohorts
    >>> sample_cohort = np.repeat([0, 1], ds.dims["samples"] // 2)
    >>> ds["sample_cohort"] = xr.DataArray(sample_cohort, dims="samples")

    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[0.88883234, 2.18459998],
           [2.18459998, 0.88883234],
           [2.18459998, 2.18459998],
           [0.88883234, 0.88883234],
           [0.88883234, 0.88883234]])

    >>> # Divide into windows of size three (variants)
    >>> ds = sg.window_by_variant(ds, size=3)
    >>> sg.Tajimas_D(ds)["stat_Tajimas_D"].values # doctest: +NORMALIZE_WHITESPACE
    array([[2.40517586, 2.40517586],
           [1.10393559, 1.10393559]])
    """
    ds = define_variable_if_absent(ds, variables.variant_allele_count,
                                   variant_allele_count, count_variant_alleles)
    ds = define_variable_if_absent(ds, variables.stat_diversity,
                                   stat_diversity, diversity)
    variables.validate(
        ds,
        {
            variant_allele_count: variables.variant_allele_count_spec,
            stat_diversity: variables.stat_diversity_spec,
        },
    )

    ac = ds[variant_allele_count]
    ac = da.asarray(ac)

    # count segregating. Note that this uses the definition in tskit,
    # which is the number of alleles - 1. In the biallelic case this
    # gives us the number of non-monomorphic sites.
    S = (ac > 0).sum(axis=1) - 1

    if has_windows(ds):
        S = window_statistic(
            S,
            np.sum,
            ds.window_start.values,
            ds.window_stop.values,
            dtype=S.dtype,
            axis=0,
        )

    # assume number of chromosomes sampled is constant for all variants
    # NOTE: even tho ac has dtype uint, we promote the sum to float
    #       because the computation below requires floats
    n = ac.sum(axis=1, dtype="float").max()

    # (n-1)th harmonic number
    a1 = (1 / da.arange(1, n)).sum()

    # calculate Watterson's theta (absolute value)
    theta = S / a1

    # get diversity
    div = ds[stat_diversity]

    # N.B., both theta estimates are usually divided by the number of
    # (accessible) bases but here we want the absolute difference
    d = div - theta[:, np.newaxis]

    # calculate the denominator (standard deviation)
    a2 = (1 / (da.arange(1, n)**2)).sum()
    b1 = (n + 1) / (3 * (n - 1))
    b2 = 2 * (n**2 + n + 3) / (9 * n * (n - 1))
    c1 = b1 - (1 / a1)
    c2 = b2 - ((n + 2) / (a1 * n)) + (a2 / (a1**2))
    e1 = c1 / a1
    e2 = c2 / (a1**2 + a2)
    d_stdev = da.sqrt((e1 * S) + (e2 * S * (S - 1)))

    # Let IEEE decide the semantics of division by zero here. The return value
    # will be -inf, nan or +inf, depending on the value of the numerator.
    # Currently this will raise a RuntimeWarning, if we divide by zero.
    D = d / d_stdev[:, np.newaxis]

    if has_windows(ds):
        new_ds = create_dataset(
            {variables.stat_Tajimas_D: (["windows", "cohorts"], D.data)})
    else:
        new_ds = create_dataset(
            {variables.stat_Tajimas_D: (["variants", "cohorts"], D.data)})
    return conditional_merge_datasets(ds, new_ds, merge)