コード例 #1
0
 def setup(self) -> None:
     """Create the simulated datasets stored on this instance.

     Two datasets are simulated with ``n_variant=100_000`` and
     ``n_sample=1000``.  The second one additionally receives a
     ``sample_cohort`` variable along the ``samples`` dimension: a run of
     zeros followed by an equally long run of ones, each half the length
     of the ``samples`` dimension.
     """
     self.count_call_alleles_ds = simulate_genotype_call_dataset(
         n_variant=100_000, n_sample=1000)
     self.count_cohort_alleles_ds = simulate_genotype_call_dataset(
         n_variant=100_000, n_sample=1000)
     # ``Dataset.sizes`` is the name -> length mapping; indexing
     # ``Dataset.dims`` by name is deprecated in recent xarray releases.
     n_samples = self.count_cohort_alleles_ds.sizes["samples"]
     sample_cohort = np.repeat([0, 1], n_samples // 2)
     self.count_cohort_alleles_ds["sample_cohort"] = xr.DataArray(
         sample_cohort, dims="samples")
コード例 #2
0
ファイル: test_popgen.py プロジェクト: aktech/sgkit
def test_observed_heterozygosity__scikit_allel_comparison(
        n_variant, n_sample, missing_pct, window_size, seed):
    """Check windowed observed heterozygosity against scikit-allel.

    A diploid dataset is simulated, every sample is placed in cohort 0,
    and the dataset is windowed with ``window_size``.  The resulting
    ``stat_observed_heterozygosity`` values are compared with the
    per-window sum of ``allel.heterozygosity_observed`` computed by
    ``allel.moving_statistic``.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    # single cohort containing every sample
    ds["sample_cohort"] = (
        ["samples"],
        np.zeros(n_sample, int),
    )
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values
    # Windows are taken along the variants dimension, so a ragged final
    # window only exists when n_variant is not a multiple of window_size.
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[0:-1]
    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )
    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
コード例 #3
0
def test_window__default_step():
    """Windowing with only a size given produces back-to-back windows.

    Ten variants windowed with size 2 are expected to give five windows
    on contig 0 covering index ranges [0, 2), [2, 4), ... [8, 10).
    """
    unwindowed = simulate_genotype_call_dataset(
        n_variant=10, n_sample=3, seed=0)
    assert not has_windows(unwindowed)

    windowed = window(unwindowed, 2)
    assert has_windows(windowed)

    expected_by_variable = (
        (window_contig, [0, 0, 0, 0, 0]),
        (window_start, [0, 2, 4, 6, 8]),
        (window_stop, [2, 4, 6, 8, 10]),
    )
    for variable, expected in expected_by_variable:
        np.testing.assert_equal(windowed[variable].values, expected)
コード例 #4
0
ファイル: test_popgen.py プロジェクト: aktech/sgkit
def test_observed_heterozygosity(chunks):
    """Check per-variant, per-cohort observed heterozygosity on fixed calls.

    A 4-variant, 6-sample, ploidy-4 dataset gets hand-written genotype
    calls (negative values mark missing alleles), the samples are split
    into three cohorts of two, and the statistic is compared with the
    expected (variants, cohorts) table.  ``chunks`` controls how the dask
    arrays are rechunked; ``chunks[1]`` is applied to the sample axis.
    """
    ds = simulate_genotype_call_dataset(n_variant=4, n_sample=6, n_ploidy=4)

    # one inner table per variant, one row per sample, one column per allele
    genotype_calls = [
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [1, 1, 1, 1],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 1],
            [0, 0, 1, 1],
            [1, 0, 1, 0],
            [0, 1, 0, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [4, 5, 6, 7],
        ],
        [
            [0, 0, -1, -1],
            [0, 1, -1, -1],
            [0, 0, 1, 1],
            [-1, -1, -1, -1],
            [0, -1, -1, -1],
            [-1, -1, -1, -1],
        ],
    ]
    call_dims = ["variants", "samples", "ploidy"]
    ds["call_genotype"] = (
        call_dims,
        da.asarray(genotype_calls).rechunk(chunks),
    )
    # keep the mask in sync with the replaced calls
    ds.call_genotype_mask.values = ds.call_genotype < 0

    cohort_of_sample = da.asarray([0, 0, 1, 1, 2, 2]).rechunk(chunks[1])
    ds["sample_cohort"] = (["samples"], cohort_of_sample)

    expected = np.array([
        [0, 0, 0],
        [1 / 4, 2 / 3, 2 / 3],
        [0, 1, 1],
        [1 / 2, 4 / 6, np.nan],
    ])
    result = observed_heterozygosity(ds)
    np.testing.assert_almost_equal(
        result["stat_observed_heterozygosity"], expected)
コード例 #5
0
def test_window__multiple_contigs(n_variant, n_contig, window_contigs_exp,
                                  window_starts_exp, window_stops_exp):
    """Windows of size 2 / step 2 match the expected per-contig layout.

    The expected contig, start and stop values are supplied by the test
    parameters.
    """
    simulated = simulate_genotype_call_dataset(
        n_variant=n_variant, n_sample=1, n_contig=n_contig)
    windowed = window(simulated, 2, 2)

    checks = (
        (window_contig, window_contigs_exp),
        (window_start, window_starts_exp),
        (window_stop, window_stops_exp),
    )
    for variable, expected in checks:
        np.testing.assert_equal(windowed[variable].values, expected)
コード例 #6
0
ファイル: test_popgen.py プロジェクト: aktech/sgkit
def test_observed_heterozygosity__windowed(chunks, cohorts, expectation):
    """Check observed heterozygosity on fixed calls after windowing.

    A 4-variant, 6-sample, ploidy-4 dataset gets hand-written genotype
    calls (negative values mark missing alleles).  Samples are assigned to
    the parametrized ``cohorts``, the dataset is windowed with size 2, and
    the statistic is compared with ``expectation``.  ``chunks`` controls
    how the dask arrays are rechunked; ``chunks[1]`` is applied to the
    sample axis.
    """
    ds = simulate_genotype_call_dataset(n_variant=4, n_sample=6, n_ploidy=4)

    # one inner table per variant, one row per sample, one column per allele
    genotype_calls = [
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [1, 1, 1, 1],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 1],
            [0, 0, 1, 1],
            [1, 0, 1, 0],
            [0, 1, 0, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [4, 5, 6, 7],
        ],
        [
            [0, 0, -1, -1],
            [0, 1, -1, -1],
            [0, 0, 1, 1],
            [-1, -1, -1, -1],
            [0, -1, -1, -1],
            [-1, -1, -1, -1],
        ],
    ]
    call_dims = ["variants", "samples", "ploidy"]
    ds["call_genotype"] = (
        call_dims,
        da.asarray(genotype_calls).rechunk(chunks),
    )
    # keep the mask in sync with the replaced calls
    ds.call_genotype_mask.values = ds.call_genotype < 0

    cohort_of_sample = da.asarray(cohorts).rechunk(chunks[1])
    ds["sample_cohort"] = (["samples"], cohort_of_sample)

    ds = window(ds, size=2)
    result = observed_heterozygosity(ds)
    np.testing.assert_almost_equal(
        result["stat_observed_heterozygosity"], np.array(expectation))
コード例 #7
0
def test_window():
    """Windowing with size 2 and step 2 adds the expected window variables.

    Ten variants are expected to give five windows on contig 0 covering
    index ranges [0, 2), [2, 4), ... [8, 10).  Windowing the already
    windowed dataset again is expected to raise ``MergeWarning``.
    """
    unwindowed = simulate_genotype_call_dataset(
        n_variant=10, n_sample=3, seed=0)
    assert not has_windows(unwindowed)

    windowed = window(unwindowed, 2, 2)
    assert has_windows(windowed)

    contigs = windowed[window_contig].values
    starts = windowed[window_start].values
    stops = windowed[window_stop].values
    np.testing.assert_equal(contigs, [0, 0, 0, 0, 0])
    np.testing.assert_equal(starts, [0, 2, 4, 6, 8])
    np.testing.assert_equal(stops, [2, 4, 6, 8, 10])

    # NOTE(review): presumably relies on warnings being raised as errors
    # by the test configuration -- confirm.
    with pytest.raises(MergeWarning):
        window(windowed, 2, 2)
コード例 #8
0
ファイル: test_window.py プロジェクト: hammer/sgkit
def test_window_by_position__equal_spaced_windows():
    """Position-based windows of size 5 starting at offset 1.

    With variant positions 1, 4, 6, 8, 12 the expected result is three
    windows on contig 0 covering variant index ranges [0, 2), [2, 4)
    and [4, 5).
    """
    simulated = simulate_genotype_call_dataset(
        n_variant=5, n_sample=3, seed=0)
    assert not has_windows(simulated)

    positions = np.array([1, 4, 6, 8, 12])
    simulated["variant_position"] = (["variants"], positions)

    windowed = window_by_position(simulated, size=5, offset=1)
    assert has_windows(windowed)

    expected_by_variable = (
        (window_contig, [0, 0, 0]),
        (window_start, [0, 2, 4]),
        (window_stop, [2, 4, 5]),
    )
    for variable, expected in expected_by_variable:
        np.testing.assert_equal(windowed[variable].values, expected)
コード例 #9
0
ファイル: test_window.py プロジェクト: hammer/sgkit
def test_window_by_position__multiple_contigs():
    """Position-based windows anchored at each variant, across two contigs.

    With ``window_start_position="variant_position"`` and size 10, the
    expected result is one window per variant (start indices 0..9), with
    the first five windows on contig 0 and the last five on contig 1.
    """
    simulated = simulate_genotype_call_dataset(
        n_variant=10, n_sample=3, n_contig=2)
    positions = np.array([1, 4, 6, 8, 12, 1, 21, 25, 40, 55])
    simulated["variant_position"] = (["variants"], positions)

    windowed = window_by_position(
        simulated,
        size=10,
        window_start_position="variant_position",
    )
    assert has_windows(windowed)

    expected_contigs = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    expected_starts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    expected_stops = [4, 5, 5, 5, 5, 6, 8, 8, 9, 10]
    np.testing.assert_equal(windowed[window_contig].values, expected_contigs)
    np.testing.assert_equal(windowed[window_start].values, expected_starts)
    np.testing.assert_equal(windowed[window_stop].values, expected_stops)
コード例 #10
0
ファイル: test_popgen.py プロジェクト: aktech/sgkit
def test_Garud_h(n_variants, n_samples, n_contigs, n_cohorts, cohorts,
                 cohort_indexes, chunks):
    """Compare the Garud H statistics with scikit-allel, cohort by cohort.

    Samples are split into ``n_cohorts`` groups, the dataset is windowed
    with size 3 and ``Garud_H`` is evaluated for ``cohorts``.  Cohorts
    whose index is not in ``cohort_indexes`` must be all-NaN; the others
    must match ``allel.moving_garud_h`` on that cohort's haplotypes.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs)
    ds = ds.chunk(dict(zip(["variants", "samples"], chunks)))

    # label every sample with the index of the split it falls into
    sample_groups = np.array_split(ds.samples.values, n_cohorts)
    sample_cohorts = np.concatenate([
        np.full_like(group, cohort_id)
        for cohort_id, group in enumerate(sample_groups)
    ])
    ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")

    coords = {"cohorts": [f"co_{i}" for i in range(n_cohorts)]}
    ds = ds.assign_coords(coords)  # type: ignore[no-untyped-call]
    ds = window(ds, size=3)

    gh = Garud_H(ds, cohorts=cohorts)
    # same order as the tuple returned by allel.moving_garud_h
    sg_stats = (
        gh.stat_Garud_h1.values,
        gh.stat_Garud_h12.values,
        gh.stat_Garud_h123.values,
        gh.stat_Garud_h2_h1.values,
    )

    # scikit-allel
    for c in range(n_cohorts):
        was_computed = cohort_indexes is None or c in cohort_indexes
        if not was_computed:
            # cohorts that were not computed should be nan
            for stat in sg_stats:
                column = stat[:, c]
                np.testing.assert_array_equal(
                    column, np.full_like(column, np.nan))
            continue

        cohort_gt = ds.call_genotype.values[:, sample_cohorts == c, :]
        haplotypes = allel.GenotypeArray(cohort_gt).to_haplotypes()
        ska_h = allel.moving_garud_h(haplotypes, size=3)
        for position, stat in enumerate(sg_stats):
            np.testing.assert_allclose(stat[:, c], ska_h[position])
コード例 #11
0
def test_filter_partial_calls():
    """Partially missing diploid calls become fully missing after filtering.

    Calls without any negative allele are expected to pass through
    unchanged, while calls with at least one ``-1`` are expected to come
    back as all ``-1`` with the mask set for every allele.
    """
    calls = np.array([[[0, 0], [0, 1], [1, 0]], [[-1, 0], [0, -1], [-1, -1]]])
    expected_calls = np.array(
        [[[0, 0], [0, 1], [1, 0]], [[-1, -1], [-1, -1], [-1, -1]]])
    expected_mask = np.array(
        [[[0, 0], [0, 0], [0, 0]], [[1, 1], [1, 1], [1, 1]]],
        dtype=bool,
    )

    ds = simulate_genotype_call_dataset(*calls.shape)
    call_dims = ds["call_genotype"].dims
    ds["call_genotype"] = xr.DataArray(calls, dims=call_dims)
    ds["call_genotype_mask"] = xr.DataArray(calls < 0, dims=call_dims)

    filtered = sgkit.stats.preprocessing.filter_partial_calls(ds)

    np.testing.assert_array_equal(
        filtered["call_genotype_complete"], expected_calls)
    np.testing.assert_array_equal(
        filtered["call_genotype_complete_mask"], expected_mask)
コード例 #12
0
def test_filter_partial_calls__mixed_ploidy():
    """Partial-call filtering with ``mixed_ploidy`` set on the calls.

    In the expected output ``-2`` entries are kept as they are, calls
    containing a ``-1`` have their remaining alleles replaced by ``-1``,
    and the mask is set wherever the expected call value is negative.
    The input ``call_genotype`` must still carry ``mixed_ploidy=True``
    afterwards.
    """
    calls = np.array([
        [[0, 0, 0, 0], [0, 1, -2, -2], [1, 0, 0, 0]],
        [[-1, 0, -2, -2], [0, -1, -1, -1], [-1, -1, -1, -1]],
    ])
    expected_calls = np.array([
        [[0, 0, 0, 0], [0, 1, -2, -2], [1, 0, 0, 0]],
        [[-1, -1, -2, -2], [-1, -1, -1, -1], [-1, -1, -1, -1]],
    ])
    expected_mask = np.array(
        [
            [[0, 0, 0, 0], [0, 0, 1, 1], [0, 0, 0, 0]],
            [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
        ],
        dtype=bool,
    )

    ds = simulate_genotype_call_dataset(*calls.shape)
    call_dims = ds["call_genotype"].dims
    ds["call_genotype"] = xr.DataArray(
        calls, dims=call_dims, attrs={"mixed_ploidy": True})
    ds["call_genotype_mask"] = xr.DataArray(calls < 0, dims=call_dims)

    filtered = sgkit.stats.preprocessing.filter_partial_calls(ds)

    np.testing.assert_array_equal(
        filtered["call_genotype_complete"], expected_calls)
    np.testing.assert_array_equal(
        filtered["call_genotype_complete_mask"], expected_mask)

    assert ds["call_genotype"].attrs["mixed_ploidy"] is True
コード例 #13
0
ファイル: test_sgkit.py プロジェクト: timothymillar/sgkit
import sgkit as sg

if __name__ == "__main__":
    # Simulate a genotype-call dataset with the settings below and print it.
    sim_kwargs = {"n_variant": 100, "n_sample": 50, "n_contig": 23}
    ds = sg.simulate_genotype_call_dataset(**sim_kwargs)
    print(ds)
コード例 #14
0
ファイル: test_popgen.py プロジェクト: aktech/sgkit
def test_Garud_h__raise_on_no_windows():
    """``Garud_H`` raises ``ValueError`` for a dataset without windows."""
    unwindowed = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    expected_message = "Dataset must be windowed for Garud_H"

    with pytest.raises(ValueError, match=expected_message):
        Garud_H(unwindowed)
コード例 #15
0
ファイル: test_popgen.py プロジェクト: aktech/sgkit
def test_Garud_h__raise_on_non_diploid():
    """``Garud_H`` raises ``NotImplementedError`` for a ploidy-3 dataset."""
    triploid = simulate_genotype_call_dataset(
        n_variant=10, n_sample=10, n_ploidy=3)
    expected_message = "Garud H only implemented for diploid genotypes"

    with pytest.raises(NotImplementedError, match=expected_message):
        Garud_H(triploid)