def test_pca__raise_on_missing_data(sample_dataset, sentinel):
    """PCA must reject input whose allele counts contain missing values."""
    counts = sample_dataset["call_alternate_allele_count"]
    # Replace every count that is not exactly 1 with the missing-data sentinel
    masked = counts.where(counts == 1, sentinel)
    ds = sample_dataset.assign(call_alternate_allele_count=masked)
    with pytest.raises(ValueError, match="Input data cannot contain missing values"):
        pca.pca(ds, n_components=2)
def test_pca__default_allele_counts_with_index(sample_dataset):
    """PCA should succeed when allele counts are derived on the fly and a
    multi-index is set on the variants dimension."""
    ds = sample_dataset.drop_vars("call_alternate_allele_count")
    ds = ds.set_index({"variants": ("variant_contig", "variant_position")})
    pca.pca(ds, n_components=2, merge=False).compute()
def test_pca__lazy_evaluation(shape, chunks, algorithm):
    """Ensure that all new variables produced by PCA are backed by lazy dask arrays."""
    # The tsqr/fully-chunked combination is not applicable here; skip it
    # explicitly instead of returning early, which would silently report the
    # parameter combination as passed.
    if algorithm == "tsqr" and all(c > 0 for c in chunks):
        pytest.skip("tsqr not applicable when chunked in both dimensions")
    ds = simulate_dataset(*shape, chunks=chunks)  # type: ignore[misc]
    ds = pca.pca(ds, n_components=2, algorithm=algorithm, merge=False)
    for v in ds:
        assert isinstance(ds[v].data, da.Array)
def test_pca__array_backend(backend, algorithm):
    """PCA should succeed regardless of the array backend of the input data."""
    ds = simulate_dataset(25, 5)
    counts = ds["call_alternate_allele_count"]
    # Re-wrap the counts in the backend under test, preserving coords/attrs
    ds["call_alternate_allele_count"] = counts.copy(data=backend.asarray(counts))
    ds = pca.pca(ds, n_components=2, algorithm=algorithm, merge=False)
    for name in ds:
        ds[name].compute()
def test_pca__tsqr_allel_comparison(shape, chunks, n_components):
    """Validate the chunked, deterministic (tsqr) implementation against
    single-chunk scikit-allel results."""
    ds = simulate_dataset(*shape, chunks=chunks)  # type: ignore[misc]
    ds_sg = pca.pca(ds, n_components=n_components, algorithm="tsqr")
    counts = ds["call_alternate_allele_count"].values.astype("float32")
    ds_sk = allel_pca(
        counts,
        n_components=n_components,
        scaler="patterson",
        randomized=False,
    )
    for result in (ds_sg, ds_sk):
        assert result["sample_pca_projection"].values.dtype == np.float32
    validate_allel_comparison(ds_sg, ds_sk)
def test_pca__stability(stability_test_result, chunks, algorithm):
    """Results must be stable across algorithms and chunkings, with no sign flips."""
    # The tsqr/fully-chunked combination is not applicable here; skip it
    # explicitly instead of returning early, which would silently report the
    # parameter combination as passed.
    if algorithm == "tsqr" and all(c > 0 for c in chunks):
        pytest.skip("tsqr not applicable when chunked in both dimensions")
    shape, expected = stability_test_result
    ds = simulate_dataset(*shape, chunks=chunks, n_cohort=3)  # type: ignore[misc]
    actual = pca.pca(
        ds, n_components=2, algorithm=algorithm, n_iter=6, random_state=0, merge=False
    )
    # Results are expected to change slightly with chunking, but they
    # will change drastically (far more than 1e-5) if a sign flip occurs
    xr.testing.assert_allclose(expected, actual, atol=1e-5)
def test_pca__randomized_allel_comparison(shape, chunks, n_components):
    """Validate the chunked, randomized implementation against single-chunk
    scikit-allel results.

    Randomized validation requires more data, more structure, and fewer
    components for results to be equal within the same tolerance as
    deterministic SVD.
    """
    ds = simulate_dataset(*shape, chunks=chunks, n_cohort=3)  # type: ignore[misc]
    ds_sg = pca.pca(
        ds, n_components=n_components, algorithm="randomized", n_iter=5, random_state=0
    )
    counts = ds["call_alternate_allele_count"].values.astype("float32")
    ds_sk = allel_pca(
        counts,
        n_components=n_components,
        scaler="patterson",
        randomized=True,
        iterated_power=5,
        random_state=0,
    )
    for result in (ds_sg, ds_sk):
        assert result["sample_pca_projection"].values.dtype == np.float32
    validate_allel_comparison(ds_sg, ds_sk)
def stability_test_result(request):
    """Fixture producing a reference (single-chunk, tsqr) PCA result for
    stability comparisons, keyed by the parametrized dataset shape."""
    shape = request.param
    ds = simulate_dataset(*shape, chunks=(-1, -1), n_cohort=3)  # type: ignore[misc]
    reference = pca.pca(ds, n_components=2, algorithm="tsqr", merge=False)
    return shape, reference
def test_pca__default_allele_counts(sample_dataset):
    """PCA should derive allele counts itself when they are not provided."""
    ds = sample_dataset.drop_vars("call_alternate_allele_count")
    pca.pca(ds, n_components=2, merge=False).compute()