Ejemplo n.º 1
0
def test_window_by_variant():
    """Check windows of 2 variants (step 2) over a 10-variant dataset.

    The dataset starts without windows; after ``window_by_variant`` it
    must report windows and expose five of them, all on contig 0, with
    starts 0, 2, 4, 6, 8 and stops 2, 4, 6, 8, 10.  Windowing an already
    windowed dataset is expected to surface ``MergeWarning``.
    """
    unwindowed = simulate_genotype_call_dataset(n_variant=10,
                                                n_sample=3,
                                                seed=0)
    assert not has_windows(unwindowed)

    windowed = window_by_variant(unwindowed, size=2, step=2)
    assert has_windows(windowed)

    expectations = (
        (window_contig, [0, 0, 0, 0, 0]),
        (window_start, [0, 2, 4, 6, 8]),
        (window_stop, [2, 4, 6, 8, 10]),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)

    # NOTE(review): pytest.raises only observes a warning when it is
    # escalated to an exception (e.g. a "filterwarnings = error" pytest
    # setting) -- confirm the project configuration does this.
    with pytest.raises(MergeWarning):
        window_by_variant(windowed, size=2, step=2)
Ejemplo n.º 2
0
def test_window_by_variant__default_step():
    """Check ``window_by_variant`` when ``step`` is left at its default.

    With ``size=2`` and no explicit step, a 10-variant dataset is
    expected to yield five windows on contig 0 with starts
    0, 2, 4, 6, 8 and stops 2, 4, 6, 8, 10.
    """
    unwindowed = simulate_genotype_call_dataset(n_variant=10,
                                                n_sample=3,
                                                seed=0)
    assert not has_windows(unwindowed)

    windowed = window_by_variant(unwindowed, size=2)
    assert has_windows(windowed)

    variables_to_check = (window_contig, window_start, window_stop)
    expected_values = (
        [0, 0, 0, 0, 0],
        [0, 2, 4, 6, 8],
        [2, 4, 6, 8, 10],
    )
    for variable, expected in zip(variables_to_check, expected_values):
        np.testing.assert_equal(windowed[variable].values, expected)
Ejemplo n.º 3
0
def test_window_by_variant__multiple_contigs(n_variant, n_contig,
                                             window_contigs_exp,
                                             window_starts_exp,
                                             window_stops_exp):
    """Check window contigs, starts and stops on a multi-contig dataset.

    A single-sample dataset with ``n_variant`` variants over ``n_contig``
    contigs is windowed with ``size=2, step=2``; the resulting window
    variables must equal the supplied ``*_exp`` expectations.
    """
    simulated = simulate_genotype_call_dataset(n_variant=n_variant,
                                               n_sample=1,
                                               n_contig=n_contig)
    windowed = window_by_variant(simulated, size=2, step=2)

    expectations = (
        (window_contig, window_contigs_exp),
        (window_start, window_starts_exp),
        (window_stop, window_stops_exp),
    )
    for variable, expected in expectations:
        np.testing.assert_equal(windowed[variable].values, expected)
Ejemplo n.º 4
0
def ldm_df(
    x: ArrayLike,
    size: int,
    step: Optional[int] = None,
    threshold: Optional[float] = None,
    diag: bool = False,
) -> DataFrame:
    """Compute the ``ld_matrix`` result for a dosage array as a frame.

    Parameters
    ----------
    x
        Dosage values; ``x.shape[0]`` is used as the number of variants
        and ``x.shape[1]`` as the number of samples.
    size
        Window size forwarded to ``window_by_variant``.
    step
        Window step forwarded to ``window_by_variant``.
    threshold
        Threshold forwarded to ``ld_matrix``.
    diag
        When false (the default), rows whose ``"i"`` and ``"j"`` columns
        are equal are removed.

    Returns
    -------
    DataFrame
        The computed ``ld_matrix`` output with rows holding a null
        ``"value"`` removed.
    """
    dataset = simulate_genotype_call_dataset(n_variant=x.shape[0],
                                             n_sample=x.shape[1])
    dataset["dosage"] = (["variants", "samples"], x)
    windowed = window_by_variant(dataset, size=size, step=step)

    pairs = ld_matrix(windowed, threshold=threshold).compute()
    if not diag:
        # Keep only off-diagonal entries.
        off_diagonal = pairs["i"] != pairs["j"]
        pairs = pairs[off_diagonal]
    return pairs[pairs["value"].notnull()]
Ejemplo n.º 5
0
def test_scores():
    """Check LD pruning decisions with and without variant scores.

    Without scores, indexes 3 and 8 are expected to be dropped.  After
    giving variant 2 a score of 0 and variant 3 a score of 2 (all others
    1), indexes 2 and 8 are expected to be dropped instead.
    ``ld_prune`` must remove the same variants in both cases.
    """
    threshold = 0.2

    # 10 variants x 10 samples, all zero except the rows set below.
    x = np.zeros((10, 10), dtype="uint8")
    # Rows 2 and 3 are identical: 1 for every sample but the last.
    x[[2, 3], :-1] = 1
    # Rows 7 and 8 are identical: 1 for the first five samples only, so
    # they overlap rows 2 and 3 on part of the samples.
    x[[7, 8], :-5] = 1

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0],
                                        n_sample=x.shape[1])
    ds["dosage"] = (["variants", "samples"], x)
    ds = window_by_variant(ds, size=10)

    def drop_indexes(**ld_kwargs):
        # Sorted indexes chosen for removal by maximal_independent_set.
        ldm = ld_matrix(ds, threshold=threshold, **ld_kwargs)
        selection = maximal_independent_set(ldm)
        return np.sort(selection.ld_prune_index_to_drop.data)

    def kept_positions(**prune_kwargs):
        # Variant positions remaining after ld_prune.
        pruned = ld_prune(ds, threshold=threshold, **prune_kwargs)
        return pruned.variant_position.values

    # No scores supplied.
    npt.assert_equal(drop_indexes(), [3, 8])
    npt.assert_equal(kept_positions(), [0, 1, 2, 4, 5, 6, 7, 9])

    # Break the tie between rows 2 and 3 so that row 3 is the one kept.
    scores = np.ones(10, dtype="float32")
    scores[2] = 0
    scores[3] = 2
    ds[variables.variant_score] = (["variants"], scores)

    npt.assert_equal(
        drop_indexes(variant_score=variables.variant_score), [2, 8])
    npt.assert_equal(
        kept_positions(variant_score=variables.variant_score),
        [0, 1, 3, 4, 5, 6, 7, 9])
Ejemplo n.º 6
0
def test_vs_skallel(args):
    """Compare the dropped variant indexes with ``allel.locate_unlinked``.

    ``args`` unpacks to ``(x, size, step, threshold, chunks)``.  The
    dosage array is rechunked along axis 0 with ``chunks``, the LD
    matrix must not contain duplicate ``(i, j)`` pairs, and the indexes
    selected for dropping must equal those where ``locate_unlinked``
    returns false.
    """
    x, size, step, threshold, chunks = args

    ds = simulate_genotype_call_dataset(n_variant=x.shape[0],
                                        n_sample=x.shape[1])
    dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    pair_repeated = ldm.compute().duplicated(subset=["i", "j"]).any()
    assert not pair_repeated

    selection = maximal_independent_set(ldm)
    dropped = np.sort(selection.ld_prune_index_to_drop.data)

    # Reference result: entries that are false in the mask are the ones
    # compared against the dropped indexes.
    unlinked_mask = allel.locate_unlinked(x,
                                          size=size,
                                          step=step,
                                          threshold=threshold)
    dropped_reference = np.sort(
        np.argwhere(~unlinked_mask).squeeze(axis=1))

    npt.assert_equal(dropped_reference, dropped)