def test_identity_by_state__tetraploid_multiallelic(chunks):
    """Check ``identity_by_state`` on a tiny tetraploid, tri-allelic dataset.

    ``chunks`` is the chunking applied to ``call_allele_count`` before the
    statistic is computed; ``None`` leaves the array exactly as returned by
    ``count_call_alleles``.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=2,
        n_sample=3,
        n_ploidy=4,
        n_allele=3,
        seed=0,
    )
    ds = count_call_alleles(ds)
    # Turn variant 0 / sample 2 into a null call.
    # NOTE(review): this write happens after count_call_alleles, yet the
    # expected values below contain NaN for sample 2, so the test relies on
    # the in-place change still being picked up (presumably because the
    # counts are evaluated lazily) -- confirm before reordering.
    ds.call_genotype.data[0, 2] = -1
    if chunks is not None:
        count_dims = ds.call_allele_count.dims
        rechunked_counts = ds.call_allele_count.data.rechunk(chunks)
        ds["call_allele_count"] = (count_dims, rechunked_counts)
    ds = identity_by_state(ds)
    observed = ds.stat_identity_by_state.values

    # One 3x3 sample-by-sample matrix per variant; the first one carries NaN
    # in every entry involving sample 2 (the null call).
    first_variant = [
        [0.5, 0.375, np.nan],
        [0.375, 0.375, np.nan],
        [np.nan, np.nan, np.nan],
    ]
    second_variant = [
        [1.0, 0.25, 0.0],
        [0.25, 0.625, 0.1875],
        [0.0, 0.1875, 0.625],
    ]
    expected = np.nanmean(np.array([first_variant, second_variant]), axis=0)
    np.testing.assert_array_equal(expected, observed)
def test_Weir_Goudet_beta__multiallelic_trio(n_allele, decimal):
    """Check the rescaled Weir & Goudet beta of a simulated trio.

    This tests for the correct relatedness of a trio using the corrected
    beta from Weir Goudet 2017. The accuracy of the estimate increases with
    the number of unique alleles because IBS increasingly reflects IBD, which
    is why the comparison precision (``decimal``) is supplied alongside
    ``n_allele``.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=10_000,
        n_sample=3,
        n_ploidy=2,
        n_allele=n_allele,
        seed=0,
    )
    # The third sample inherits one allele from each of the first two samples.
    calls = ds.call_genotype.values
    calls[:, 2, 0] = calls[:, 0, 0]
    calls[:, 2, 1] = calls[:, 1, 0]
    ds.call_genotype.values[:] = calls

    beta = Weir_Goudet_beta(ds).stat_Weir_Goudet_beta.compute()
    # Rescale relative to the smallest observed beta.
    smallest = beta.min()
    observed = (beta - smallest) / (1 - smallest)
    expected = np.array(
        [
            [0.5, 0.0, 0.25],
            [0.0, 0.5, 0.25],
            [0.25, 0.25, 0.5],
        ]
    )
    np.testing.assert_array_almost_equal(observed, expected, decimal=decimal)
def test_collapse_ploidy() -> None:
    """Check shapes and three hand-set cells of ``_collapse_ploidy`` output."""
    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
    full_shape = (1000, 10, 2)
    assert g.call_genotype.shape == full_shape
    assert g.call_genotype_mask.shape == full_shape

    # Individual cases, each written as
    # (variant, sample, alleles per ploidy, mask per ploidy).
    cases = [
        (1, 1, (1, 1), (0, 0)),
        (2, 2, (0, 1), (0, 0)),
        (3, 3, (-1, 1), (1, 0)),
    ]
    for variant, sample, alleles, masks in cases:
        for ploidy, allele in enumerate(alleles):
            g.call_genotype.loc[
                dict(variants=variant, samples=sample, ploidy=ploidy)
            ] = allele
        for ploidy, masked in enumerate(masks):
            g.call_genotype_mask.loc[
                dict(variants=variant, samples=sample, ploidy=ploidy)
            ] = masked

    call_g, call_g_mask = _collapse_ploidy(g)
    collapsed_shape = (1000, 10)
    assert call_g.shape == collapsed_shape
    assert call_g_mask.shape == collapsed_shape

    # Collapsed calls: 1 + 1 -> 2, 0 + 1 -> 1, and -1 when one call is masked.
    assert call_g.isel(variants=1, samples=1) == 2
    assert call_g.isel(variants=2, samples=2) == 1
    assert call_g.isel(variants=3, samples=3) == -1
    assert call_g_mask.isel(variants=1, samples=1) == 0
    assert call_g_mask.isel(variants=3, samples=3) == 1
def test_pc_relate__maf_inputs_checks() -> None:
    """``pc_relate`` must reject ``maf`` values of -1, 1.0 and 0.0."""
    g = simulate_genotype_call_dataset(100, 10)
    expected_message = r"MAF must be between \(0.0, 1.0\)"
    for invalid_maf in (-1, 1.0, 0.0):
        with pytest.raises(ValueError, match=expected_message):
            pc_relate(g, maf=invalid_maf)
def ds_neq():
    """Dataset with all variants well out of HWE

    Simulates a 50-variant, 1000-sample dataset and replaces its
    ``call_genotype`` variable with calls drawn by
    ``simulate_genotype_calls`` using the distribution ``(0.9, 0.05, 0.05)``.

    Returns
    -------
    The dataset with the replaced ``call_genotype`` variable.
    """
    ds = simulate_genotype_call_dataset(n_variant=50, n_sample=1000)
    gt_dist = (0.9, 0.05, 0.05)
    # ``Dataset.sizes`` is the name -> length mapping; indexing
    # ``Dataset.dims`` for lengths is deprecated in recent xarray releases.
    ds["call_genotype"] = simulate_genotype_calls(
        ds.sizes["variants"], ds.sizes["samples"], p=gt_dist
    )
    return ds
def test_pc_relate__values_within_range() -> None:
    """Off-diagonal ``pc_relate_phi`` values must lie strictly in (-0.5, 0.5).

    NOTE(review): another function with this exact name appears elsewhere in
    the reviewed source; if both live in the same module the later definition
    shadows the earlier one -- confirm.
    """
    n_samples = 100
    ds = simulate_genotype_call_dataset(1000, n_samples)
    ds = pca(ds, n_components=2)
    ds = pc_relate(ds)
    assert ds.pc_relate_phi.shape == (n_samples, n_samples)

    # Materialise the array to be able to use fancy indexing below.
    phi = ds.pc_relate_phi.data.compute()
    above_diagonal = phi[np.triu_indices_from(phi, 1)]
    assert (above_diagonal > -0.5).all() and (above_diagonal < 0.5).all()
def get_dataset(calls: ArrayLike, **kwargs: Any) -> Dataset:
    """Build a simulated dataset whose genotype calls are replaced by ``calls``.

    Parameters
    ----------
    calls
        Array-like of genotype calls; its first two axes give the number of
        variants and samples passed to ``simulate_genotype_call_dataset``.
    **kwargs
        Forwarded unchanged to ``simulate_genotype_call_dataset``.

    Returns
    -------
    Dataset in which ``call_genotype`` holds ``calls`` and
    ``call_genotype_mask`` is true wherever a call is negative.
    """
    call_array = np.asarray(calls)
    ds = simulate_genotype_call_dataset(
        n_variant=call_array.shape[0],
        n_sample=call_array.shape[1],
        **kwargs,
    )
    # Reuse the dimension names of the simulated variable for both arrays.
    genotype_dims = ds["call_genotype"].dims
    ds["call_genotype"] = xr.DataArray(call_array, dims=genotype_dims)
    ds["call_genotype_mask"] = xr.DataArray(call_array < 0, dims=genotype_dims)
    return ds
def test_ld_matrix__raise_on_no_windows():
    """``ld_matrix`` must raise ``ValueError`` for a dataset without windows."""
    dosage = np.zeros((5, 10))
    n_variant, n_sample = dosage.shape
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)
    # No window_by_variant call here on purpose.
    with pytest.raises(ValueError, match="Dataset must be windowed for ld_matrix"):
        ld_matrix(ds)
def test_pc_relate__parent_child_relationship() -> None:
    """Check that ``pc_relate`` separates self, parent/child and unrelated pairs.

    Three hundred samples are labelled as 100 mothers, 100 fathers and 100
    children; the children's genotypes are rebuilt from one randomly chosen
    haplotype of each parent, and the resulting ``pc_relate_phi`` values are
    binned around 0, 0.25 and 0.5 (+/- 0.1).
    """
    # Eric's source: https://github.com/pystatgen/sgkit/pull/228#discussion_r487436876
    # Create a dataset that is 2/3 founders and 1/3 progeny
    seed = 1
    rs = np.random.RandomState(seed)
    ds = simulate_genotype_call_dataset(1000, 300, seed=seed)
    ds["sample_type"] = xr.DataArray(
        np.repeat(["mother", "father", "child"], 100), dims="samples"
    )
    sample_groups = ds.groupby("sample_type").groups

    def simulate_new_generation(ds: xr.Dataset) -> xr.Dataset:
        # Generate progeny genotypes as a combination of randomly
        # selected haplotypes from each parents
        idx = sample_groups["mother"] + sample_groups["father"]
        gt = ds.call_genotype.isel(samples=idx).values
        # One random haplotype choice (0 or 1) per leading-two-axes cell of gt
        idx = rs.randint(0, 2, size=gt.shape[:2])
        # Collapse to haplotype across ploidy dim using indexer
        # * shape = (samples, variants)
        ht = gt[np.ix_(*map(range, gt.shape[:2])) + (idx,)].T
        gt_child = np.stack([ht[sample_groups[t]] for t in ["mother", "father"]]).T
        # Parents keep their genotypes; children are appended along axis 1
        ds["call_genotype"].values = np.concatenate((gt, gt_child), axis=1)
        return ds

    # Redefine the progeny genotypes
    ds = simulate_new_generation(ds)

    # Infer kinship
    call_g, _ = _collapse_ploidy(ds)
    pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
    ds["sample_pcs"] = (("components", "samples"), pcs.T)
    ds["pc_relate_phi"] = pc_relate(ds)["pc_relate_phi"].compute()

    # Check that all coefficients are in expected ranges
    cts = (
        ds["pc_relate_phi"]
        .to_series()
        .reset_index()
        # Keep one triangle of the pair matrix, diagonal included
        .pipe(lambda df: df.loc[df.sample_x >= df.sample_y]["pc_relate_phi"])
        .pipe(
            pd.cut,
            # Bin edges: -0.1, 0.1, 0.15, 0.35, 0.4, 0.6
            bins=[p for phi in [0, 0.25, 0.5] for p in [phi - 0.1, phi + 0.1]],
            labels=[
                "unrelated",
                "unclassified",
                "parent/child",
                "unclassified",
                "self",
            ],
            # The "unclassified" label is repeated, hence ordered=False
            ordered=False,
        )
        .value_counts()
    )
    # Every child is paired with both of its parents
    assert cts["parent/child"] == len(sample_groups["child"]) * 2
    # ``Dataset.sizes`` is the name -> length mapping; indexing
    # ``Dataset.dims`` for lengths is deprecated in recent xarray releases.
    assert cts["self"] == ds.sizes["samples"]
    assert cts["unclassified"] == 0
def test_pc_relate__identical_sample_should_be_05() -> None:
    """A sample copied onto another must get ``pc_relate_phi`` close to 0.5."""
    n_samples = 100
    g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)

    # Principal components are derived before the duplicate is introduced.
    collapsed_calls, _ = _collapse_ploidy(g)
    decomposition = PCA(n_components=2, svd_solver="full")
    components = decomposition.fit_transform(collapsed_calls.T)
    g["sample_pcs"] = (("components", "samples"), components.T)

    # Make sample 8 identical to sample 0.
    g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)

    result = pc_relate(g)
    assert result.pc_relate_phi.shape == (n_samples, n_samples)
    duplicate_phi = result.pc_relate_phi.isel(sample_x=8, sample_y=0)
    assert np.allclose(duplicate_phi, 0.5, atol=0.1)
def test_pc_relate__values_within_range() -> None:
    """Off-diagonal ``pc_relate_phi`` values must lie strictly in (-0.5, 0.5).

    NOTE(review): another function with this exact name appears elsewhere in
    the reviewed source; if both live in the same module the later definition
    shadows the earlier one -- confirm.
    """
    n_samples = 100
    g = simulate_genotype_call_dataset(1000, n_samples)

    collapsed_calls, _ = _collapse_ploidy(g)
    decomposition = PCA(n_components=2, svd_solver="full")
    components = decomposition.fit_transform(collapsed_calls.T)
    g["sample_pcs"] = (("components", "samples"), components.T)

    result = pc_relate(g)
    assert result.pc_relate_phi.shape == (n_samples, n_samples)

    # Materialise the array to be able to use fancy indexing below.
    phi = result.pc_relate_phi.data.compute()
    above_diagonal = phi[np.triu_indices_from(phi, 1)]
    assert (above_diagonal > -0.5).all() and (above_diagonal < 0.5).all()
def test_pc_relate__genotype_inputs_checks() -> None:
    """``pc_relate`` must raise ``ValueError`` for unsupported or missing inputs."""
    triploid = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
    with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
        pc_relate(triploid)

    triallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
    with pytest.raises(
        ValueError, match="PC Relate only works for biallelic genotypes"
    ):
        pc_relate(triallelic)

    # A plain simulated dataset carries no PCA projection variable.
    plain = simulate_genotype_call_dataset(100, 10)
    with pytest.raises(ValueError, match="sample_pca_projection not present"):
        pc_relate(plain)

    # Dropping either genotype variable must be reported by name.
    missing_variable_cases = (
        ("call_genotype", "call_genotype not present"),
        ("call_genotype_mask", "call_genotype_mask not present"),
    )
    for variable, message in missing_variable_cases:
        with pytest.raises(ValueError, match=message):
            pc_relate(plain.drop_vars(variable))
def test_save_and_load_dataset__mutable_mapping():
    """Round-trip a dataset twice through in-memory mapping stores."""
    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_store: MutableMapping[str, bytes] = {}
    save_dataset(original, first_store)
    reloaded = load_dataset(first_store)
    assert_identical(original, reloaded)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    second_store: MutableMapping[str, bytes] = {}
    save_dataset(reloaded, second_store)
    assert_identical(original, load_dataset(second_store))
def test_impute_genotype_call_with_variant_mean() -> None:
    """Masked calls of a variant must be replaced by that variant's mean."""
    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
    calls, mask = _collapse_ploidy(g)

    # Hand-craft variant 2: every call is 1 except sample 1 (which is 2);
    # only samples 0 and 9 are masked.
    calls.loc[dict(variants=2)] = 1
    calls.loc[dict(variants=2, samples=1)] = 2
    mask.loc[dict(variants=2)] = False
    mask.loc[dict(variants=2, samples=[0, 9])] = True

    imputed = _impute_genotype_call_with_variant_mean(calls, mask)

    # Unmasked calls are checked to be unchanged.
    assert imputed.isel(variants=2, samples=1) == 2
    assert (imputed.isel(variants=2, samples=slice(2, 9)) == 1).all()
    # Masked calls get the mean of the eight unmasked ones: seven 1s and one 2.
    assert (imputed.isel(variants=2, samples=[0, 9]) == (7 + 2) / 8).all()
def test_identity_by_state__chunked_sample_dimension():
    """``identity_by_state`` must refuse input chunked along ``samples``."""
    ds = simulate_genotype_call_dataset(n_variant=20, n_sample=10, n_ploidy=2)
    # Split the ten samples into two chunks of five; the other axes stay whole.
    genotype_dims = ds.call_genotype.dims
    chunked_calls = da.asarray(
        ds.call_genotype.data,
        chunks=((20,), (5, 5), (2,)),
    )
    ds["call_genotype"] = genotype_dims, chunked_calls
    expected_message = (
        "identity_by_state does not support chunking in the samples dimension"
    )
    with pytest.raises(NotImplementedError, match=expected_message):
        identity_by_state(ds)
def test_display_genotypes__duplicate_variant_ids():
    """Check the text rendering when ``variant_id`` contains duplicates.

    The expected table is indexed by 0, 1, 2 even though ``variant_id`` is
    set to ``V0``, ``V1``, ``V1``.
    """
    ds = simulate_genotype_call_dataset(n_variant=3, n_sample=3, seed=0)
    # set some variant IDs
    ds["variant_id"] = (["variants"], np.array(["V0", "V1", "V1"]))
    ds["variant_id_mask"] = (["variants"], np.array([False, False, False]))
    disp = display_genotypes(ds)
    # The expected table is spelled out line by line so that its significant
    # padding (including the trailing blanks on the "variants" row) is
    # explicit and cannot be lost to whitespace stripping.
    # NOTE(review): line breaks and column padding were reconstructed from
    # the usual pandas fixed-width frame layout -- confirm against a real run.
    expected = "\n".join(
        [
            "samples    S0   S1   S2",
            "variants" + " " * 15,
            "0         0/0  1/0  1/0",
            "1         0/1  1/0  0/1",
            "2         0/0  1/0  1/1",
        ]
    )
    assert str(disp) == expected
def simulate_dataset(gp: Any, chunks: int = -1) -> Dataset:
    """Wrap genotype probabilities ``gp`` in a simulated dataset.

    Parameters
    ----------
    gp
        Array-like of genotype probabilities with dimensions
        (variants, samples, genotypes).
    chunks
        Chunk size applied to the last axis only; the first two axes keep
        their existing chunking.

    Returns
    -------
    Simulated dataset without the genotype call variables and with the
    genotype probability variable set to the rechunked ``gp``.

    NOTE(review): another ``simulate_dataset`` with a different signature
    appears elsewhere in the reviewed source; confirm they live in different
    modules.
    """
    probabilities = da.asarray(gp).rechunk((None, None, chunks))
    n_variant, n_sample = probabilities.shape[0], probabilities.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    # The hard calls are removed; only the probabilities are kept.
    ds = ds.drop_vars([variables.call_genotype, variables.call_genotype_mask])
    probability_dims = ("variants", "samples", "genotypes")
    ds = ds.assign(
        {variables.call_genotype_probability: (probability_dims, probabilities)}
    )
    return ds
def test_display_genotypes__truncated_rows():
    """Check the text rendering when rows are truncated to ``max_variants=4``."""
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, seed=0)
    disp = display_genotypes(ds, max_variants=4, max_samples=10)
    # The expected table is spelled out line by line so that its significant
    # padding (including the trailing blanks on the "variants" row) is
    # explicit and cannot be lost to whitespace stripping.
    # NOTE(review): line breaks and column padding were reconstructed from
    # the usual pandas fixed-width frame layout -- confirm against a real run.
    expected = "\n".join(
        [
            "samples    S0   S1   S2   S3   S4   S5   S6   S7   S8   S9",
            "variants" + " " * 50,
            "0         0/0  1/0  1/0  0/1  1/0  0/1  0/0  1/0  1/1  0/0",
            "1         1/0  0/1  1/0  1/1  1/1  1/0  1/0  0/0  1/0  1/1",
            "...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...",
            "8         0/1  0/0  1/0  0/1  0/1  1/0  1/0  0/1  1/0  1/0",
            "9         1/1  0/1  1/0  0/1  1/0  1/1  0/1  1/0  1/1  1/0",
            "",
            "[10 rows x 10 columns]",
        ]
    )
    assert str(disp) == expected
def test_save_and_load_dataset(tmp_path, is_path):
    """Round-trip a dataset twice through stores under ``tmp_path``.

    ``is_path`` selects whether the store location is handed over as a path
    object or as its string form.
    """

    def store_location(name):
        # Path object or plain string, depending on the test parameter.
        location = tmp_path / name
        return location if is_path else str(location)

    first_location = store_location("ds.zarr")
    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    save_dataset(original, first_location)
    reloaded = load_dataset(first_location)
    assert_identical(original, reloaded)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    second_location = store_location("ds2.zarr")
    save_dataset(reloaded, second_location)
    assert_identical(original, load_dataset(second_location))
def test_display_genotypes():
    """Check both the text and the HTML rendering of a 3x3 genotype display."""
    ds = simulate_genotype_call_dataset(n_variant=3, n_sample=3, seed=0)
    disp = display_genotypes(ds)
    # The expected table is spelled out line by line so that its significant
    # padding (including the trailing blanks on the "variants" row) is
    # explicit and cannot be lost to whitespace stripping.
    # NOTE(review): line breaks, padding and the HTML indentation below were
    # reconstructed from the usual pandas output layout -- confirm against a
    # real run.
    expected = "\n".join(
        [
            "samples    S0   S1   S2",
            "variants" + " " * 15,
            "0         0/0  1/0  1/0",
            "1         0/1  1/0  0/1",
            "2         0/0  1/0  1/1",
        ]
    )
    assert str(disp) == expected

    expected_html = """<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>samples</th>
      <th>S0</th>
      <th>S1</th>
      <th>S2</th>
    </tr>
    <tr>
      <th>variants</th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0/0</td>
      <td>1/0</td>
      <td>1/0</td>
    </tr>
    <tr>
      <th>1</th>
      <td>0/1</td>
      <td>1/0</td>
      <td>0/1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>0/0</td>
      <td>1/0</td>
      <td>1/1</td>
    </tr>
  </tbody>
</table>""".strip()
    # Only containment is required: the HTML repr may wrap the table.
    assert expected_html in disp._repr_html_()
def test_display_genotypes__large():
    """Check the text rendering of a 100000 x 1000 dataset truncated to 4 x 4."""
    ds = simulate_genotype_call_dataset(n_variant=100_000, n_sample=1000, seed=0)
    disp = display_genotypes(ds, max_variants=4, max_samples=4)
    # The expected table is spelled out line by line so that its significant
    # padding (including the trailing blanks on the "variants" row) is
    # explicit and cannot be lost to whitespace stripping.
    # NOTE(review): line breaks and column padding were reconstructed from
    # the usual pandas fixed-width frame layout -- confirm against a real run.
    expected = "\n".join(
        [
            "samples    S0   S1  ...  S998  S999",
            "variants" + " " * 12 + "..." + " " * 12,
            "0         0/0  1/0  ...   0/1   1/1",
            "1         1/1  1/1  ...   0/1   1/1",
            "...       ...  ...  ...   ...   ...",
            "99998     0/1  1/1  ...   1/0   0/1",
            "99999     1/0  1/0  ...   1/0   1/0",
            "",
            "[100000 rows x 1000 columns]",
        ]
    )
    assert str(disp) == expected
def ldm_df(
    x: ArrayLike,
    size: int,
    step: Optional[int] = None,
    threshold: Optional[float] = None,
    diag: bool = False,
) -> DataFrame:
    """Compute the windowed LD matrix of dosage array ``x`` as a data frame.

    Parameters
    ----------
    x
        Dosage array with dimensions (variants, samples).
    size, step
        Forwarded to ``window_by_variant``.
    threshold
        Forwarded to ``ld_matrix``.
    diag
        When false (the default), rows with ``i == j`` are removed.

    Returns
    -------
    Computed LD matrix rows whose ``value`` is not null.
    """
    n_variant, n_sample = x.shape[0], x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], x)
    ds = window_by_variant(ds, size=size, step=step)

    pairs = ld_matrix(ds, threshold=threshold).compute()
    if not diag:
        off_diagonal = pairs["i"] != pairs["j"]
        pairs = pairs[off_diagonal]
    has_value = ~pairs["value"].isnull()
    return pairs[has_value]
def test_scores():
    """Check LD pruning with and without per-variant scores.

    Without scores, indexes 3 and 8 are dropped; with scores that favour
    index 3 over index 2, indexes 2 and 8 are dropped instead.
    """
    # Start from an all-zero 10x10 dosage matrix; only the rows set below
    # are non-constant.
    dosage = np.zeros((10, 10), dtype="uint8")
    # Rows 2 and 3 are identical, i.e. perfectly correlated.
    dosage[2, :-1] = 1
    dosage[3, :-1] = 1
    # Rows 7 and 8 are identical and partially correlated with rows 2 and 3.
    dosage[7, :-5] = 1
    dosage[8, :-5] = 1

    n_variant, n_sample = dosage.shape
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=10)

    unscored_ld = ld_matrix(ds, threshold=0.2)
    unscored_drop = maximal_independent_set(unscored_ld)
    npt.assert_equal(np.sort(unscored_drop.ld_prune_index_to_drop.data), [3, 8])

    # check ld_prune removes correct variants
    unscored_pruned = ld_prune(ds, threshold=0.2)
    npt.assert_equal(
        unscored_pruned.variant_position.values, [0, 1, 2, 4, 5, 6, 7, 9]
    )

    # break tie between 3rd and 4th so 4th wins
    scores = np.ones(10, dtype="float32")
    scores[2] = 0
    scores[3] = 2
    ds[variables.variant_score] = (["variants"], scores)

    scored_ld = ld_matrix(ds, threshold=0.2, variant_score=variables.variant_score)
    scored_drop = maximal_independent_set(scored_ld)
    npt.assert_equal(np.sort(scored_drop.ld_prune_index_to_drop.data), [2, 8])

    # check ld_prune removes correct variants
    scored_pruned = ld_prune(
        ds, threshold=0.2, variant_score=variables.variant_score
    )
    npt.assert_equal(
        scored_pruned.variant_position.values, [0, 1, 3, 4, 5, 6, 7, 9]
    )
def test_display_genotypes__truncated_columns():
    """Check the text rendering when columns are truncated to ``max_samples=4``."""
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, seed=0)
    disp = display_genotypes(ds, max_variants=10, max_samples=4)
    # The expected table is spelled out line by line so that its significant
    # padding (including the trailing blanks on the "variants" row) is
    # explicit and cannot be lost to whitespace stripping.
    # NOTE(review): line breaks and column padding were reconstructed from
    # the usual pandas fixed-width frame layout -- confirm against a real run.
    expected = "\n".join(
        [
            "samples    S0   S1  ...   S8   S9",
            "variants" + " " * 12 + "..." + " " * 10,
            "0         0/0  1/0  ...  1/1  0/0",
            "1         1/0  0/1  ...  1/0  1/1",
            "2         1/1  1/1  ...  0/0  1/0",
            "3         0/1  0/0  ...  1/0  0/0",
            "4         0/1  0/0  ...  0/0  1/1",
            "5         1/1  1/0  ...  0/0  1/0",
            "6         1/1  0/0  ...  1/0  0/1",
            "7         1/0  0/1  ...  0/1  0/0",
            "8         0/1  0/0  ...  1/0  1/0",
            "9         1/1  0/1  ...  1/1  1/0",
            "",
            "[10 rows x 10 columns]",
        ]
    )
    assert str(disp) == expected
def simulate_dataset(
    n_variant: int = 100,
    n_sample: int = 50,
    n_cohort: Optional[int] = None,
    chunks: Any = (None, None),
) -> Dataset:
    """Simulate dataset with optional population structure

    Parameters
    ----------
    n_variant, n_sample
        Size of the simulated dataset (always simulated with ``seed=0``).
    n_cohort
        When truthy, alternate allele counts come from
        ``simulate_cohort_genotypes``; otherwise they are counted from the
        simulated calls with ``count_call_alternate_alleles``.
    chunks
        Chunking applied to ``call_alternate_allele_count``.

    Returns
    -------
    Dataset carrying a chunked ``call_alternate_allele_count`` variable.

    NOTE(review): another ``simulate_dataset`` with a different signature
    appears elsewhere in the reviewed source; confirm they live in different
    modules.
    """
    ds = simulate_genotype_call_dataset(n_variant, n_sample, seed=0)
    if n_cohort:
        # ``Dataset.sizes`` is the name -> length mapping; indexing
        # ``Dataset.dims`` for lengths is deprecated in recent xarray releases.
        ac = simulate_cohort_genotypes(
            ds.sizes["variants"], ds.sizes["samples"], n_cohort
        )
        ds["call_alternate_allele_count"] = xr.DataArray(
            ac, dims=("variants", "samples")
        )
    else:
        ds = count_call_alternate_alleles(ds)
    ds["call_alternate_allele_count"] = ds["call_alternate_allele_count"].chunk(chunks)
    return ds
def test_vs_skallel(args):
    """Compare the variants dropped by LD pruning with ``allel.locate_unlinked``.

    ``args`` unpacks to ``(x, size, step, threshold, chunks)``: the dosage
    array, the windowing parameters, the LD threshold and the chunk size
    used along the first axis.
    """
    x, size, step, threshold, chunks = args
    n_variant, n_sample = x.shape[0], x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    chunked_dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], chunked_dosage)
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    # No (i, j) pair may be reported more than once.
    assert not ldm.compute().duplicated(subset=["i", "j"]).any()

    dropped = maximal_independent_set(ldm)
    dropped_indexes = np.sort(dropped.ld_prune_index_to_drop.data)

    # The reference mask is inverted to obtain the indexes it would drop.
    unlinked = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    reference_indexes = np.sort(np.argwhere(~unlinked).squeeze(axis=1))
    npt.assert_equal(reference_indexes, dropped_indexes)
def test_identity_by_state__reference_implementation(ploidy, chunks, seed):
    """Compare ``identity_by_state`` with an explicit broadcasted reference.

    ``chunks`` holds one tuple of chunk sizes per axis of
    ``call_allele_count``; their sums determine the number of variants,
    samples and alleles that are simulated.
    """
    variant_chunks, sample_chunks, allele_chunks = chunks[0], chunks[1], chunks[2]
    ds = simulate_genotype_call_dataset(
        n_variant=sum(variant_chunks),
        n_sample=sum(sample_chunks),
        n_ploidy=ploidy,
        n_allele=sum(allele_chunks),
        missing_pct=0.2,
        seed=seed,
    )
    ds = count_call_alleles(ds)
    count_dims = ds.call_allele_count.dims
    rechunked_counts = ds.call_allele_count.data.rechunk(chunks)
    ds["call_allele_count"] = (count_dims, rechunked_counts)
    ds = identity_by_state(ds)
    observed = ds.stat_identity_by_state.values

    # reference implementation: multiply the allele frequencies of every
    # sample pair, sum over the last axis, then NaN-aware mean over axis 0.
    frequencies = ds.call_allele_frequency.data
    pairwise = frequencies[..., None, :, :] * frequencies[..., :, None, :]
    expected = np.nanmean(pairwise.sum(axis=-1), axis=0).compute()
    np.testing.assert_array_almost_equal(expected, observed)
def simulate_regression_dataset(
    n_variant: int,
    n_sample: int,
    n_contig: int,
    n_covariate: int,
    n_trait: int,
    noise_scale: float = 0.01,
    seed: int = 0,
) -> Dataset:
    """Simulate a dataset with dosages, covariates and linearly derived traits.

    Parameters
    ----------
    n_variant, n_sample, n_contig
        Forwarded to ``simulate_genotype_call_dataset``.
    n_covariate, n_trait
        Number of covariate and trait columns to generate.
    noise_scale
        Scale of the normal noise added to every trait.
    seed
        Seed of the ``RandomState`` used for covariates, weights and noise.

    Returns
    -------
    Dataset with ``call_dosage``, ``sample_covariate`` and ``sample_trait``
    variables added.
    """
    rs = np.random.RandomState(seed)
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant, n_sample=n_sample, n_contig=n_contig
    )
    dosage = ds["call_genotype"].sum(dim="ploidy")

    # The draw order below fixes the random stream and must not be changed:
    # covariates, dosage weights, covariate weights, then noise.
    covariates = rs.normal(size=(n_sample, n_covariate))
    dosage_weights = rs.normal(size=(dosage.shape[0], n_trait))
    covariate_weights = rs.normal(size=(n_covariate, n_trait))
    # A single noise column is broadcast across all traits.
    noise = rs.normal(size=(n_sample, 1), scale=noise_scale)
    traits = dosage.T.data @ dosage_weights + covariates @ covariate_weights + noise

    ds["call_dosage"] = dosage
    ds["sample_covariate"] = (("samples", "covariates"), covariates)
    ds["sample_trait"] = (("samples", "traits"), traits)
    return ds
def test_identity_by_state__diploid_biallelic(chunks):
    """Check ``identity_by_state`` on a tiny diploid, biallelic dataset.

    ``chunks`` is the chunking applied to ``call_allele_count`` before the
    statistic is computed; ``None`` leaves the array exactly as returned by
    ``count_call_alleles``.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=2,
        n_sample=3,
        n_ploidy=2,
        n_allele=2,
        seed=2,
    )
    ds = count_call_alleles(ds)
    if chunks is not None:
        count_dims = ds.call_allele_count.dims
        rechunked_counts = ds.call_allele_count.data.rechunk(chunks)
        ds["call_allele_count"] = (count_dims, rechunked_counts)
    ds = identity_by_state(ds)
    observed = ds.stat_identity_by_state.values

    # Two 3x3 sample-by-sample matrices, averaged over the leading axis.
    first_matrix = [[1.0, 0.0, 0.5], [0.0, 1.0, 0.5], [0.5, 0.5, 0.5]]
    second_matrix = [[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [0.5, 0.5, 0.5]]
    expected = np.nanmean(np.array([first_matrix, second_matrix]), axis=0)
    np.testing.assert_array_equal(expected, observed)
def test_simulate_genotype_call_dataset__zarr(tmp_path):
    """A simulated dataset must survive a write to and read from Zarr."""
    store_path = str(tmp_path / "ds.zarr")
    original = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    original.to_zarr(store_path)
    reloaded = xr.open_zarr(store_path, concat_characters=False)  # type: ignore[no-untyped-call]
    xr.testing.assert_equal(original, reloaded)