Esempio n. 1
0
def test_ld_matrix__raise_on_no_windows():
    """Calling ld_matrix on a dataset that was never windowed raises ValueError."""
    n_variant, n_sample = 5, 10
    dosage = np.zeros((n_variant, n_sample))
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)

    # Deliberately skip window_by_variant so the windowing check has to fire.
    expected_message = "Dataset must be windowed for ld_matrix"
    with pytest.raises(ValueError, match=expected_message):
        ld_matrix(ds)
Esempio n. 2
0
def test_scores():
    """Check which variants LD pruning drops, with and without variant scores.

    The dosage matrix contains two pairs of identical rows (2/3 and 7/8).
    Without scores the test expects indexes 3 and 8 to be dropped; once a
    score favouring index 3 over index 2 is supplied, it expects 2 and 8.
    """
    # Start from all-zero rows (zero variance); only the rows set below
    # carry any signal.
    dosage = np.zeros((10, 10), dtype="uint8")
    # Rows 2 and 3 are identical, i.e. perfectly correlated
    dosage[2:4, :-1] = 1
    # Rows 7 and 8 are identical to each other and overlap partially with 2/3
    dosage[7:9, :-5] = 1

    n_variant, n_sample = dosage.shape
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=10)

    # --- No scores supplied ---
    unscored_ldm = ld_matrix(ds, threshold=0.2)
    unscored_drop = maximal_independent_set(unscored_ldm).ld_prune_index_to_drop.data
    npt.assert_equal(np.sort(unscored_drop), [3, 8])

    # ld_prune must remove the same variants
    # (assumes simulated variant_position equals the variant index -- TODO confirm)
    unscored_pruned = ld_prune(ds, threshold=0.2)
    npt.assert_equal(
        unscored_pruned.variant_position.values, [0, 1, 2, 4, 5, 6, 7, 9]
    )

    # --- Scores supplied: break the 2/3 tie so that index 3 is kept ---
    scores = np.ones(10, dtype="float32")
    scores[2], scores[3] = 0, 2
    ds[variables.variant_score] = (["variants"], scores)

    scored_ldm = ld_matrix(
        ds, threshold=0.2, variant_score=variables.variant_score
    )
    scored_drop = maximal_independent_set(scored_ldm).ld_prune_index_to_drop.data
    npt.assert_equal(np.sort(scored_drop), [2, 8])

    # ld_prune must again agree with the index-based result
    scored_pruned = ld_prune(
        ds, threshold=0.2, variant_score=variables.variant_score
    )
    npt.assert_equal(
        scored_pruned.variant_position.values, [0, 1, 3, 4, 5, 6, 7, 9]
    )
Esempio n. 3
0
def ldm_df(
    x: ArrayLike,
    size: int,
    step: Optional[int] = None,
    threshold: Optional[float] = None,
    diag: bool = False,
) -> DataFrame:
    """Build the computed LD matrix frame for a dosage array.

    Wraps ``x`` in a simulated genotype-call dataset, windows it by variant
    and evaluates ``ld_matrix`` eagerly.

    Parameters
    ----------
    x
        2D dosage array, laid out as (variants, samples).
    size
        Window size forwarded to ``window_by_variant``.
    step
        Window step forwarded to ``window_by_variant``.
    threshold
        Threshold forwarded to ``ld_matrix``.
    diag
        When False (the default), rows where ``i == j`` are removed.

    Returns
    -------
    The computed frame restricted to rows whose ``value`` is not null.
    """
    n_variant = x.shape[0]
    n_sample = x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], x)
    windowed = window_by_variant(ds, size=size, step=step)

    pairs = ld_matrix(windowed, threshold=threshold).compute()
    if not diag:
        # Drop self-comparisons
        off_diagonal = pairs["i"] != pairs["j"]
        pairs = pairs[off_diagonal]
    has_value = ~pairs["value"].isnull()
    return pairs[has_value]
Esempio n. 4
0
def test_vs_skallel(args):
    """Compare the dropped-variant indexes with scikit-allel's locate_unlinked.

    ``args`` is a 5-tuple ``(x, size, step, threshold, chunks)``: the dosage
    array, the windowing parameters, the LD threshold and the chunk size
    applied along the variants axis.
    """
    x, size, step, threshold, chunks = args

    n_variant = x.shape[0]
    n_sample = x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    # Rechunk along axis 0 only, so the comparison runs over chunked input
    chunked_dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], chunked_dosage)
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    # Every (i, j) pair may appear at most once in the LD matrix
    assert not ldm.compute().duplicated(subset=["i", "j"]).any()

    dropped = maximal_independent_set(ldm).ld_prune_index_to_drop.data
    idx_drop = np.sort(dropped)

    # locate_unlinked yields a mask; its False entries are the reference drops
    unlinked_mask = allel.locate_unlinked(
        x, size=size, step=step, threshold=threshold
    )
    idx_drop_ska = np.sort(np.argwhere(~unlinked_mask).squeeze(axis=1))

    npt.assert_equal(idx_drop_ska, idx_drop)