def setup(self) -> None:
    """Create the simulated datasets stored on this instance.

    Two datasets of 100,000 variants by 1,000 samples are simulated. The
    second one additionally receives a ``sample_cohort`` variable that
    assigns the first half of the samples to cohort 0 and the second half
    to cohort 1.
    """
    call_ds = simulate_genotype_call_dataset(n_variant=100_000, n_sample=1000)
    self.count_call_alleles_ds = call_ds

    cohort_ds = simulate_genotype_call_dataset(n_variant=100_000, n_sample=1000)
    self.count_cohort_alleles_ds = cohort_ds

    # np.repeat([0, 1], k) yields k zeros followed by k ones.
    samples_per_cohort = cohort_ds.dims["samples"] // 2
    cohort_labels = np.repeat([0, 1], samples_per_cohort)
    cohort_ds["sample_cohort"] = xr.DataArray(cohort_labels, dims="samples")
def test_observed_heterozygosity__scikit_allel_comparison(
    n_variant, n_sample, missing_pct, window_size, seed
):
    """Compare windowed observed heterozygosity with scikit-allel.

    A diploid dataset is simulated, all samples are placed in a single
    cohort, and the dataset is windowed with ``window_size``. The windowed
    ``stat_observed_heterozygosity`` values are then compared with the
    per-window sum of ``allel.heterozygosity_observed`` computed through
    ``allel.moving_statistic``.

    Parameters
    ----------
    n_variant
        Number of variants to simulate.
    n_sample
        Number of samples to simulate.
    missing_pct
        Fraction of missing calls passed to the simulator.
    window_size
        Window size used for both implementations.
    seed
        Seed passed to the simulator.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    # Every sample belongs to cohort 0.
    ds["sample_cohort"] = (
        ["samples"],
        np.zeros(n_sample, int),
    )
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values
    # Windows run along the variants axis, so a ragged final window exists
    # exactly when the number of *variants* is not a multiple of the window
    # size (the sample count is irrelevant here).
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[0:-1]
    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )
    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
def test_window__default_step():
    """Window a 10-variant dataset passing only a size of 2.

    The result is expected to hold five windows on contig 0 covering the
    variant index ranges [0, 2), [2, 4), [4, 6), [6, 8) and [8, 10).
    """
    dataset = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
    assert not has_windows(dataset)

    windowed = window(dataset, 2)
    assert has_windows(windowed)

    expectations = (
        (window_contig, [0, 0, 0, 0, 0]),
        (window_start, [0, 2, 4, 6, 8]),
        (window_stop, [2, 4, 6, 8, 10]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)
def test_observed_heterozygosity(chunks):
    """Check observed heterozygosity on a hand-written tetraploid dataset.

    The genotype calls of a 4-variant, 6-sample, ploidy-4 dataset are
    replaced with fixed values (negative alleles mark missing calls), the
    samples are split pairwise into cohorts 0, 1 and 2, and the resulting
    statistic is compared with expected values laid out as
    (variants, cohorts).

    ``chunks`` is the chunking applied to the genotype array; its second
    entry is reused to chunk the cohort assignment.
    """
    genotypes = [
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [1, 1, 1, 1],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 1],
            [0, 0, 1, 1],
            [1, 0, 1, 0],
            [0, 1, 0, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [4, 5, 6, 7],
        ],
        [
            [0, 0, -1, -1],
            [0, 1, -1, -1],
            [0, 0, 1, 1],
            [-1, -1, -1, -1],
            [0, -1, -1, -1],
            [-1, -1, -1, -1],
        ],
    ]
    cohort_of_sample = [0, 0, 1, 1, 2, 2]
    expected = np.array(
        [
            [0, 0, 0],
            [1 / 4, 2 / 3, 2 / 3],
            [0, 1, 1],
            [1 / 2, 4 / 6, np.nan],
        ]
    )

    ds = simulate_genotype_call_dataset(
        n_variant=4,
        n_sample=6,
        n_ploidy=4,
    )
    ds["call_genotype"] = (
        ["variants", "samples", "ploidy"],
        da.asarray(genotypes).rechunk(chunks),
    )
    # Keep the mask in sync with the replaced calls: negative means missing.
    ds.call_genotype_mask.values = ds.call_genotype < 0
    ds["sample_cohort"] = (
        ["samples"],
        da.asarray(cohort_of_sample).rechunk(chunks[1]),
    )

    ho = observed_heterozygosity(ds)["stat_observed_heterozygosity"]
    np.testing.assert_almost_equal(ho, expected)
def test_window__multiple_contigs(
    n_variant, n_contig, window_contigs_exp, window_starts_exp, window_stops_exp
):
    """Window a single-sample, multi-contig dataset with size 2 and step 2.

    The resulting window contig, start and stop variables are compared
    with the supplied ``*_exp`` expectations.
    """
    dataset = simulate_genotype_call_dataset(
        n_variant=n_variant, n_sample=1, n_contig=n_contig
    )
    windowed = window(dataset, 2, 2)

    checks = (
        (window_contig, window_contigs_exp),
        (window_start, window_starts_exp),
        (window_stop, window_stops_exp),
    )
    for variable, expected in checks:
        np.testing.assert_equal(windowed[variable].values, expected)
def test_observed_heterozygosity__windowed(chunks, cohorts, expectation):
    """Check windowed observed heterozygosity on a hand-written dataset.

    The genotype calls of a 4-variant, 6-sample, ploidy-4 dataset are
    replaced with fixed values (negative alleles mark missing calls), the
    samples are assigned to ``cohorts``, the dataset is windowed with a
    size of 2, and the statistic is compared with ``expectation``.

    ``chunks`` is the chunking applied to the genotype array; its second
    entry is reused to chunk the cohort assignment.
    """
    genotypes = [
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
            [1, 1, 1, 1],
            [0, 0, 0, 0],
            [1, 1, 1, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 0, 1, 1],
            [0, 0, 1, 1],
            [1, 0, 1, 0],
            [0, 1, 0, 1],
        ],
        [
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [0, 1, 2, 3],
            [4, 5, 6, 7],
        ],
        [
            [0, 0, -1, -1],
            [0, 1, -1, -1],
            [0, 0, 1, 1],
            [-1, -1, -1, -1],
            [0, -1, -1, -1],
            [-1, -1, -1, -1],
        ],
    ]

    ds = simulate_genotype_call_dataset(
        n_variant=4,
        n_sample=6,
        n_ploidy=4,
    )
    ds["call_genotype"] = (
        ["variants", "samples", "ploidy"],
        da.asarray(genotypes).rechunk(chunks),
    )
    # Keep the mask in sync with the replaced calls: negative means missing.
    ds.call_genotype_mask.values = ds.call_genotype < 0
    ds["sample_cohort"] = (
        ["samples"],
        da.asarray(cohorts).rechunk(chunks[1]),
    )
    ds = window(ds, size=2)

    ho = observed_heterozygosity(ds)["stat_observed_heterozygosity"]
    np.testing.assert_almost_equal(ho, np.array(expectation))
def test_window():
    """Window a 10-variant dataset with size 2 and step 2.

    Five windows on contig 0 covering [0, 2), [2, 4), ..., [8, 10) are
    expected. Windowing the already-windowed dataset a second time is
    expected to surface a ``MergeWarning`` as an exception.
    """
    dataset = simulate_genotype_call_dataset(n_variant=10, n_sample=3, seed=0)
    assert not has_windows(dataset)

    windowed = window(dataset, 2, 2)
    assert has_windows(windowed)

    expectations = (
        (window_contig, [0, 0, 0, 0, 0]),
        (window_start, [0, 2, 4, 6, 8]),
        (window_stop, [2, 4, 6, 8, 10]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)

    # Re-windowing would overwrite the existing window variables.
    with pytest.raises(MergeWarning):
        window(windowed, 2, 2)
def test_window_by_position__equal_spaced_windows():
    """Window five variants by position with ``size=5`` and ``offset=1``.

    With variant positions 1, 4, 6, 8 and 12, three windows on contig 0
    are expected, covering the variant index ranges [0, 2), [2, 4) and
    [4, 5).
    """
    dataset = simulate_genotype_call_dataset(n_variant=5, n_sample=3, seed=0)
    assert not has_windows(dataset)

    positions = np.array([1, 4, 6, 8, 12])
    dataset["variant_position"] = (["variants"], positions)

    windowed = window_by_position(dataset, size=5, offset=1)
    assert has_windows(windowed)

    expectations = (
        (window_contig, [0, 0, 0]),
        (window_start, [0, 2, 4]),
        (window_stop, [2, 4, 5]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)
def test_window_by_position__multiple_contigs():
    """Window ten variants on two contigs by position, one window per variant.

    Windows of size 10 are started at each ``variant_position``. Ten
    windows are expected (five per contig), each starting at its own
    variant index and stopping at the index asserted below.
    """
    dataset = simulate_genotype_call_dataset(n_variant=10, n_sample=3, n_contig=2)
    positions = np.array([1, 4, 6, 8, 12, 1, 21, 25, 40, 55])
    dataset["variant_position"] = (["variants"], positions)

    windowed = window_by_position(
        dataset, size=10, window_start_position="variant_position"
    )
    assert has_windows(windowed)

    expectations = (
        (window_contig, [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]),
        (window_start, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
        (window_stop, [4, 5, 5, 5, 5, 6, 8, 8, 9, 10]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)
def test_Garud_h(
    n_variants, n_samples, n_contigs, n_cohorts, cohorts, cohort_indexes, chunks
):
    """Compare the Garud H statistics with scikit-allel, cohort by cohort.

    Samples are split into ``n_cohorts`` groups, the dataset is windowed
    with a size of 3 and ``Garud_H`` is computed for ``cohorts``. For each
    cohort the h1, h12, h123 and h2/h1 columns must either be all NaN
    (when ``cohort_indexes`` is given and does not contain the cohort) or
    match ``allel.moving_garud_h`` on that cohort's haplotypes.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variants, n_sample=n_samples, n_contig=n_contigs
    )
    chunk_spec = dict(zip(["variants", "samples"], chunks))
    ds = ds.chunk(chunk_spec)

    # Label each sample with the index of the split it falls into.
    sample_groups = np.array_split(ds.samples.values, n_cohorts)
    sample_cohorts = np.concatenate(
        [np.full_like(group, index) for index, group in enumerate(sample_groups)]
    )
    ds["sample_cohort"] = xr.DataArray(sample_cohorts, dims="samples")

    cohort_names = [f"co_{i}" for i in range(n_cohorts)]
    ds = ds.assign_coords({"cohorts": cohort_names})  # type: ignore[no-untyped-call]
    ds = window(ds, size=3)

    result = Garud_H(ds, cohorts=cohorts)
    # Same order as the tuple returned by allel.moving_garud_h.
    stats = (
        result.stat_Garud_h1.values,
        result.stat_Garud_h12.values,
        result.stat_Garud_h123.values,
        result.stat_Garud_h2_h1.values,
    )

    for cohort in range(n_cohorts):
        not_computed = cohort_indexes is not None and cohort not in cohort_indexes
        if not_computed:
            # cohorts that were not computed should be nan
            for stat in stats:
                column = stat[:, cohort]
                np.testing.assert_array_equal(column, np.full_like(column, np.nan))
            continue

        # scikit-allel
        gt = ds.call_genotype.values[:, sample_cohorts == cohort, :]
        haplotypes = allel.GenotypeArray(gt).to_haplotypes()
        ska_h = allel.moving_garud_h(haplotypes, size=3)
        for position, stat in enumerate(stats):
            np.testing.assert_allclose(stat[:, cohort], ska_h[position])
def test_filter_partial_calls():
    """Partially missing diploid calls should come back fully missing.

    The first variant holds only complete calls and must be unchanged; the
    second holds calls with at least one missing allele, all of which are
    expected to become ``[-1, -1]`` and be fully masked.
    """
    calls = np.array(
        [
            [[0, 0], [0, 1], [1, 0]],
            [[-1, 0], [0, -1], [-1, -1]],
        ]
    )
    expected_calls = np.array(
        [
            [[0, 0], [0, 1], [1, 0]],
            [[-1, -1], [-1, -1], [-1, -1]],
        ]
    )
    expected_mask = np.array(
        [
            [[0, 0], [0, 0], [0, 0]],
            [[1, 1], [1, 1], [1, 1]],
        ],
        dtype=bool,
    )

    # The simulated dataset only provides a container of the right shape.
    ds = simulate_genotype_call_dataset(*calls.shape)
    dims = ds["call_genotype"].dims
    ds["call_genotype"] = xr.DataArray(calls, dims=dims)
    ds["call_genotype_mask"] = xr.DataArray(calls < 0, dims=dims)

    filtered = sgkit.stats.preprocessing.filter_partial_calls(ds)

    np.testing.assert_array_equal(filtered["call_genotype_complete"], expected_calls)
    np.testing.assert_array_equal(
        filtered["call_genotype_complete_mask"], expected_mask
    )
def test_filter_partial_calls__mixed_ploidy():
    """Filter partial calls when ``call_genotype`` is flagged as mixed ploidy.

    In the expected output, ``-2`` alleles are left as ``-2`` while every
    other allele of a partially missing call becomes ``-1``; all negative
    alleles are masked. The ``mixed_ploidy`` attribute of the input
    variable must still be ``True`` afterwards.
    """
    calls = np.array(
        [
            [[0, 0, 0, 0], [0, 1, -2, -2], [1, 0, 0, 0]],
            [[-1, 0, -2, -2], [0, -1, -1, -1], [-1, -1, -1, -1]],
        ]
    )
    expected_calls = np.array(
        [
            [[0, 0, 0, 0], [0, 1, -2, -2], [1, 0, 0, 0]],
            [[-1, -1, -2, -2], [-1, -1, -1, -1], [-1, -1, -1, -1]],
        ]
    )
    expected_mask = np.array(
        [
            [[0, 0, 0, 0], [0, 0, 1, 1], [0, 0, 0, 0]],
            [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
        ],
        dtype=bool,
    )

    # The simulated dataset only provides a container of the right shape.
    ds = simulate_genotype_call_dataset(*calls.shape)
    dims = ds["call_genotype"].dims
    ds["call_genotype"] = xr.DataArray(
        calls, dims=dims, attrs={"mixed_ploidy": True}
    )
    ds["call_genotype_mask"] = xr.DataArray(calls < 0, dims=dims)

    filtered = sgkit.stats.preprocessing.filter_partial_calls(ds)

    np.testing.assert_array_equal(filtered["call_genotype_complete"], expected_calls)
    np.testing.assert_array_equal(
        filtered["call_genotype_complete_mask"], expected_mask
    )
    assert ds["call_genotype"].attrs["mixed_ploidy"] is True
import sgkit as sg

# When run as a script, simulate a small genotype call dataset
# (100 variants, 50 samples, 23 contigs) and print its summary.
if __name__ == "__main__":
    ds = sg.simulate_genotype_call_dataset(
        n_variant=100,
        n_sample=50,
        n_contig=23,
    )
    print(ds)
def test_Garud_h__raise_on_no_windows():
    """``Garud_H`` must raise ``ValueError`` for a dataset without windows."""
    unwindowed = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    expected_message = "Dataset must be windowed for Garud_H"

    with pytest.raises(ValueError, match=expected_message):
        Garud_H(unwindowed)
def test_Garud_h__raise_on_non_diploid():
    """``Garud_H`` must raise ``NotImplementedError`` for a ploidy-3 dataset."""
    triploid = simulate_genotype_call_dataset(n_variant=10, n_sample=10, n_ploidy=3)
    expected_message = "Garud H only implemented for diploid genotypes"

    with pytest.raises(NotImplementedError, match=expected_message):
        Garud_H(triploid)