def test_ld_matrix__raise_on_no_windows():
    """`ld_matrix` raises `ValueError` when given a dataset that was never windowed."""
    n_variant, n_sample = 5, 10
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], np.zeros((n_variant, n_sample)))
    # window_by_variant is deliberately not called before ld_matrix.
    with pytest.raises(ValueError, match="Dataset must be windowed for ld_matrix"):
        ld_matrix(ds)
def test_scores():
    """Check that variant scores decide which member of a correlated pair is dropped.

    The same dataset is pruned twice: first without scores, then with a
    score column that favours variant 3 over variant 2.
    """

    def sorted_drop_index(ld_pairs):
        # Indexes flagged for removal by the maximal independent set, ascending.
        mis = maximal_independent_set(ld_pairs)
        return np.sort(mis.ld_prune_index_to_drop.data)

    n_variant = n_sample = 10
    # All rows start as zeros; only rows 2, 3, 7 and 8 get non-zero entries.
    dosage = np.zeros((n_variant, n_sample), dtype="uint8")
    # Rows 2 and 3 are identical: ones in every sample but the last.
    dosage[[2, 3], :-1] = 1
    # Rows 7 and 8 are identical to each other (ones in the first five
    # samples) and only partially overlap with rows 2 and 3.
    dosage[[7, 8], :-5] = 1

    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=10)

    # Without scores, the second variant of each identical pair is dropped.
    unscored_pairs = ld_matrix(ds, threshold=0.2)
    npt.assert_equal(sorted_drop_index(unscored_pairs), [3, 8])

    # ld_prune must remove those same variants.
    unscored_pruned = ld_prune(ds, threshold=0.2)
    npt.assert_equal(
        unscored_pruned.variant_position.values, [0, 1, 2, 4, 5, 6, 7, 9]
    )

    # Break the tie between variants 2 and 3 so that variant 3 wins.
    scores = np.ones(10, dtype="float32")
    scores[2] = 0
    scores[3] = 2
    ds[variables.variant_score] = (["variants"], scores)

    scored_pairs = ld_matrix(
        ds, threshold=0.2, variant_score=variables.variant_score
    )
    npt.assert_equal(sorted_drop_index(scored_pairs), [2, 8])

    # ld_prune must honour the scores as well.
    scored_pruned = ld_prune(
        ds, threshold=0.2, variant_score=variables.variant_score
    )
    npt.assert_equal(
        scored_pruned.variant_position.values, [0, 1, 3, 4, 5, 6, 7, 9]
    )
def ldm_df(
    x: ArrayLike,
    size: int,
    step: Optional[int] = None,
    threshold: Optional[float] = None,
    diag: bool = False,
) -> DataFrame:
    """Compute the LD matrix for a dosage array and return it as a data frame.

    Parameters
    ----------
    x
        Dosage values laid out as (variants, samples).
    size
        Window size forwarded to `window_by_variant`.
    step
        Window step forwarded to `window_by_variant`.
    threshold
        Threshold forwarded to `ld_matrix`.
    diag
        When False, rows whose "i" and "j" columns are equal are removed.

    Returns
    -------
    The computed `ld_matrix` result without rows whose "value" is null.
    """
    n_variant, n_sample = x.shape[0], x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], x)
    windowed = window_by_variant(ds, size=size, step=step)
    pairs = ld_matrix(windowed, threshold=threshold).compute()
    if not diag:
        # Keep only off-diagonal entries.
        pairs = pairs[pairs["i"] != pairs["j"]]
    # NOTE(review): the source formatting was collapsed; the null filter is
    # assumed to apply regardless of `diag` -- confirm.
    return pairs[pairs["value"].notnull()]
def test_vs_skallel(args):
    """Compare the dropped-variant index against `allel.locate_unlinked`.

    `args` is a 5-tuple `(x, size, step, threshold, chunks)`.
    NOTE(review): presumably supplied by a decorator or fixture that is not
    visible in this excerpt -- confirm.
    """
    x, size, step, threshold, chunks = args
    n_variant, n_sample = x.shape[0], x.shape[1]

    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    # Chunk the dosage array along the first (variants) axis.
    chunked_dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], chunked_dosage)
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    # Each (i, j) pair may appear at most once in the LD matrix.
    assert not ldm.compute().duplicated(subset=["i", "j"]).any()

    mis = maximal_independent_set(ldm)
    actual_drop = np.sort(mis.ld_prune_index_to_drop.data)

    # Reference result: positions where the returned mask is False are the
    # ones expected to be dropped.
    keep_mask = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    dropped_positions = np.argwhere(~keep_mask).squeeze(axis=1)
    expected_drop = np.sort(dropped_positions)

    npt.assert_equal(expected_drop, actual_drop)