Python simulate_genotype_call_dataset Beispiele, sgkit.testing.simulate_genotype_call_dataset Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: test_ibs.py Projekt: timothymillar/sgkit

def test_identity_by_state__tetraploid_multiallelic(chunks):
    """Check identity_by_state on a small tetraploid, tri-allelic dataset.

    Simulates 2 variants x 3 samples, overwrites one call with -1 (null),
    optionally rechunks ``call_allele_count`` using ``chunks``, and compares
    ``stat_identity_by_state`` against a hand-written expectation.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=2,
        n_sample=3,
        n_ploidy=4,
        n_allele=3,
        seed=0,
    )
    ds = count_call_alleles(ds)
    # NOTE(review): the null call is written after count_call_alleles has been
    # applied; presumably the counts are evaluated lazily so the change is
    # still picked up — confirm before reordering these statements.
    ds.call_genotype.data[0, 2] = -1  # null call
    if chunks is not None:
        # Re-register the allele counts under the same dims with new chunking.
        ds["call_allele_count"] = (
            ds.call_allele_count.dims,
            ds.call_allele_count.data.rechunk(chunks),
        )
    ds = identity_by_state(ds)
    actual = ds.stat_identity_by_state.values
    # One 3x3 matrix per variant; the NaN-aware mean collapses the variant axis.
    expect = np.nanmean(
        np.array([
            [
                [0.5, 0.375, np.nan],
                [0.375, 0.375, np.nan],
                [np.nan, np.nan, np.nan],
            ],
            [[1.0, 0.25, 0.0], [0.25, 0.625, 0.1875], [0.0, 0.1875, 0.625]],
        ]),
        axis=0,
    )
    np.testing.assert_array_equal(expect, actual)

Beispiel #2

0

Datei anzeigen

Datei: test_ibs.py Projekt: timothymillar/sgkit

def test_Weir_Goudet_beta__multiallelic_trio(n_allele, decimal):
    """Check that the corrected Weir & Goudet (2017) beta recovers a trio.

    The estimate becomes more accurate as the number of unique alleles
    grows, because IBS increasingly reflects IBD.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=10_000, n_sample=3, n_ploidy=2, n_allele=n_allele, seed=0
    )

    # Make sample 3 a child carrying one allele from each of samples 1 and 2.
    genotypes = ds.call_genotype.values
    genotypes[:, 2, 0] = genotypes[:, 0, 0]
    genotypes[:, 2, 1] = genotypes[:, 1, 0]
    ds.call_genotype.values[:] = genotypes

    beta = Weir_Goudet_beta(ds).stat_Weir_Goudet_beta.compute()
    # Rescale relative to the smallest beta in the matrix.
    smallest = beta.min()
    actual = (beta - smallest) / (1 - smallest)

    expect = np.array([[0.5, 0.0, 0.25], [0.0, 0.5, 0.25], [0.25, 0.25, 0.5]])
    np.testing.assert_array_almost_equal(actual, expect, decimal=decimal)

Beispiel #3

0

Datei anzeigen

Datei: test_pc_relate.py Projekt: aktech/sgkit

def test_collapse_ploidy() -> None:
    """Check _collapse_ploidy output shapes and three hand-set genotype cases."""
    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
    expected_shape = (1000, 10, 2)
    assert g.call_genotype.shape == expected_shape
    assert g.call_genotype_mask.shape == expected_shape

    # Test individual cases, each given as
    # (variant/sample index, alleles per ploidy, mask flags per ploidy).
    cases = [
        (1, (1, 1), (0, 0)),
        (2, (0, 1), (0, 0)),
        (3, (-1, 1), (1, 0)),
    ]
    for index, alleles, flags in cases:
        for ploidy, allele in enumerate(alleles):
            g.call_genotype.loc[
                dict(variants=index, samples=index, ploidy=ploidy)
            ] = allele
        for ploidy, flag in enumerate(flags):
            g.call_genotype_mask.loc[
                dict(variants=index, samples=index, ploidy=ploidy)
            ] = flag

    call_g, call_g_mask = _collapse_ploidy(g)
    assert call_g.shape == (1000, 10)
    assert call_g_mask.shape == (1000, 10)

    # Collapsed genotype values for the three cases above.
    for index, collapsed in ((1, 2), (2, 1), (3, -1)):
        assert call_g.isel(variants=index, samples=index) == collapsed
    # Collapsed mask values for the first and third cases.
    for index, masked in ((1, 0), (3, 1)):
        assert call_g_mask.isel(variants=index, samples=index) == masked

Beispiel #4

0

Datei anzeigen

Datei: test_pc_relate.py Projekt: aktech/sgkit

def test_pc_relate__maf_inputs_checks() -> None:
    """pc_relate must raise ValueError for maf values of -1, 1.0 and 0.0."""
    g = simulate_genotype_call_dataset(100, 10)
    expected_message = r"MAF must be between \(0.0, 1.0\)"
    for invalid_maf in (-1, 1.0, 0.0):
        with pytest.raises(ValueError, match=expected_message):
            pc_relate(g, maf=invalid_maf)

Beispiel #5

0

Datei anzeigen

def ds_neq():
    """Dataset with all variants well out of HWE.

    Simulates 50 variants x 1000 samples and replaces ``call_genotype`` with
    calls generated by ``simulate_genotype_calls`` using the skewed
    distribution ``gt_dist``.
    """
    ds = simulate_genotype_call_dataset(n_variant=50, n_sample=1000)
    gt_dist = (0.9, 0.05, 0.05)
    # Look dimension lengths up through ``sizes``: mapping-style access on
    # ``Dataset.dims`` is deprecated in xarray.
    ds["call_genotype"] = simulate_genotype_calls(
        ds.sizes["variants"], ds.sizes["samples"], p=gt_dist
    )
    return ds

Beispiel #6

0

Datei anzeigen

def test_pc_relate__values_within_range() -> None:
    """Upper-triangle phi values (diagonal excluded) must lie in (-0.5, 0.5)."""
    n_samples = 100
    simulated = simulate_genotype_call_dataset(1000, n_samples)
    with_pcs = pca(simulated, n_components=2)
    ds = pc_relate(with_pcs)
    assert ds.pc_relate_phi.shape == (n_samples, n_samples)

    # Materialise the array to be able to use fancy indexing below.
    phi = ds.pc_relate_phi.data.compute()
    rows, cols = np.triu_indices_from(phi, 1)
    off_diagonal = phi[rows, cols]
    assert (off_diagonal > -0.5).all() and (off_diagonal < 0.5).all()

Beispiel #7

0

Datei anzeigen

def get_dataset(calls: ArrayLike, **kwargs: Any) -> Dataset:
    """Build a simulated dataset whose genotype calls are replaced by ``calls``.

    The first two axes of ``calls`` set the number of variants and samples;
    remaining keyword arguments are forwarded to
    ``simulate_genotype_call_dataset``. The mask flags negative entries.
    """
    call_array = np.asarray(calls)
    n_variant, n_sample = call_array.shape[0], call_array.shape[1]
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant, n_sample=n_sample, **kwargs
    )
    # Reuse the dimension names of the simulated genotype variable.
    genotype_dims = ds["call_genotype"].dims
    is_negative = call_array < 0
    ds["call_genotype"] = xr.DataArray(call_array, dims=genotype_dims)
    ds["call_genotype_mask"] = xr.DataArray(is_negative, dims=genotype_dims)
    return ds

Beispiel #8

0

Datei anzeigen

Datei: test_ld.py Projekt: timothymillar/sgkit

def test_ld_matrix__raise_on_no_windows():
    """ld_matrix must raise ValueError when no windowing has been applied."""
    dosage = np.zeros((5, 10))
    n_variant, n_sample = dosage.shape
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)

    expected_message = "Dataset must be windowed for ld_matrix"
    with pytest.raises(ValueError, match=expected_message):
        ld_matrix(ds)

Beispiel #9

0

Datei anzeigen

def test_pc_relate__parent_child_relationship() -> None:
    """Check pc_relate phi values for simulated parent/child and self pairs.

    Builds 300 samples (100 mothers, 100 fathers, 100 children), regenerates
    the children's genotypes from randomly chosen parental haplotypes, and
    asserts how many sample pairs fall into each phi bin.
    """
    # Eric's source: https://github.com/pystatgen/sgkit/pull/228#discussion_r487436876

    # Create a dataset that is 2/3 founders and 1/3 progeny
    seed = 1
    rs = np.random.RandomState(seed)
    ds = simulate_genotype_call_dataset(1000, 300, seed=seed)
    ds["sample_type"] = xr.DataArray(
        np.repeat(["mother", "father", "child"], 100), dims="samples"
    )
    sample_groups = ds.groupby("sample_type").groups

    def simulate_new_generation(ds: xr.Dataset) -> xr.Dataset:
        """Overwrite ``call_genotype`` so children derive from the founders."""
        # Generate progeny genotypes as a combination of randomly
        # selected haplotypes from each parents
        idx = sample_groups["mother"] + sample_groups["father"]
        gt = ds.call_genotype.isel(samples=idx).values
        # One random ploidy index (0 or 1) per (variant, founder) pair.
        idx = rs.randint(0, 2, size=gt.shape[:2])
        # Collapse to haplotype across ploidy dim using indexer
        # * shape = (samples, variants)
        ht = gt[np.ix_(*map(range, gt.shape[:2])) + (idx,)].T
        # Pair the mother and father haplotypes into the child's two alleles.
        gt_child = np.stack([ht[sample_groups[t]] for t in ["mother", "father"]]).T
        ds["call_genotype"].values = np.concatenate((gt, gt_child), axis=1)
        return ds

    # Redefine the progeny genotypes
    ds = simulate_new_generation(ds)

    # Infer kinship
    call_g, _ = _collapse_ploidy(ds)
    pcs = PCA(n_components=2, svd_solver="full").fit_transform(call_g.T)
    ds["sample_pcs"] = (("components", "samples"), pcs.T)
    ds["pc_relate_phi"] = pc_relate(ds)["pc_relate_phi"].compute()

    # Check that all coefficients are in expected ranges.
    # Bin edges sit 0.1 either side of 0, 0.25 and 0.5, which yields five
    # intervals; the two gaps between the bands are labelled "unclassified".
    cts = (
        ds["pc_relate_phi"]
        .to_series()
        .reset_index()
        .pipe(lambda df: df.loc[df.sample_x >= df.sample_y]["pc_relate_phi"])
        .pipe(
            pd.cut,
            bins=[p for phi in [0, 0.25, 0.5] for p in [phi - 0.1, phi + 0.1]],
            labels=[
                "unrelated",
                "unclassified",
                "parent/child",
                "unclassified",
                "self",
            ],
            ordered=False,
        )
        .value_counts()
    )
    assert cts["parent/child"] == len(sample_groups["child"]) * 2
    # ``sizes`` replaces the deprecated mapping-style ``Dataset.dims`` lookup.
    assert cts["self"] == ds.sizes["samples"]
    assert cts["unclassified"] == 0

Beispiel #10

0

Datei anzeigen

def test_pc_relate__identical_sample_should_be_05() -> None:
    """A sample copied from another should get a phi of about 0.5 with it."""
    n_samples = 100
    g = simulate_genotype_call_dataset(1000, n_samples, missing_pct=0.1)

    # Principal components are fitted before the duplicate is introduced.
    collapsed, _mask = _collapse_ploidy(g)
    pca_model = PCA(n_components=2, svd_solver="full")
    sample_pcs = pca_model.fit_transform(collapsed.T)
    g["sample_pcs"] = (("components", "samples"), sample_pcs.T)

    # Add identical sample: sample 8 becomes a copy of sample 0.
    g.call_genotype.loc[dict(samples=8)] = g.call_genotype.isel(samples=0)

    result = pc_relate(g)
    assert result.pc_relate_phi.shape == (n_samples, n_samples)
    duplicate_phi = result.pc_relate_phi.isel(sample_x=8, sample_y=0)
    assert np.allclose(duplicate_phi, 0.5, atol=0.1)

Beispiel #11

0

Datei anzeigen

def test_pc_relate__values_within_range() -> None:
    """Upper-triangle phi values (diagonal excluded) must lie in (-0.5, 0.5)."""
    n_samples = 100
    g = simulate_genotype_call_dataset(1000, n_samples)

    collapsed, _mask = _collapse_ploidy(g)
    pca_model = PCA(n_components=2, svd_solver="full")
    sample_pcs = pca_model.fit_transform(collapsed.T)
    g["sample_pcs"] = (("components", "samples"), sample_pcs.T)

    result = pc_relate(g)
    assert result.pc_relate_phi.shape == (n_samples, n_samples)

    # Materialise the array to be able to use fancy indexing below.
    phi = result.pc_relate_phi.data.compute()
    rows, cols = np.triu_indices_from(phi, 1)
    off_diagonal = phi[rows, cols]
    assert (off_diagonal > -0.5).all() and (off_diagonal < 0.5).all()

Beispiel #12

0

Datei anzeigen

Datei: test_pc_relate.py Projekt: aktech/sgkit

def test_pc_relate__genotype_inputs_checks() -> None:
    """pc_relate must raise ValueError for each unsupported or incomplete input."""
    triploid = simulate_genotype_call_dataset(100, 10, n_ploidy=3)
    with pytest.raises(ValueError, match="PC Relate only works for diploid genotypes"):
        pc_relate(triploid)

    triallelic = simulate_genotype_call_dataset(100, 10, n_allele=3)
    with pytest.raises(
        ValueError, match="PC Relate only works for biallelic genotypes"
    ):
        pc_relate(triallelic)

    # A default simulated dataset carries no PCA projection.
    without_pcs = simulate_genotype_call_dataset(100, 10)
    with pytest.raises(ValueError, match="sample_pca_projection not present"):
        pc_relate(without_pcs)

    # Dropping either genotype variable must be reported by name.
    dropped_cases = (
        ("call_genotype", "call_genotype not present"),
        ("call_genotype_mask", "call_genotype_mask not present"),
    )
    for variable_name, expected_message in dropped_cases:
        with pytest.raises(ValueError, match=expected_message):
            pc_relate(without_pcs.drop_vars(variable_name))

Beispiel #13

0

Datei anzeigen

def test_save_and_load_dataset__mutable_mapping():
    """Round-trip a dataset through two in-memory mapping stores."""
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_store: MutableMapping[str, bytes] = {}
    save_dataset(ds, first_store)
    reloaded = load_dataset(first_store)
    assert_identical(ds, reloaded)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    second_store: MutableMapping[str, bytes] = {}
    save_dataset(reloaded, second_store)
    reloaded_again = load_dataset(second_store)
    assert_identical(ds, reloaded_again)

Beispiel #14

0

Datei anzeigen

Datei: test_pc_relate.py Projekt: aktech/sgkit

def test_impute_genotype_call_with_variant_mean() -> None:
    """Masked calls of one variant should be imputed with that variant's mean."""
    g = simulate_genotype_call_dataset(1000, 10, missing_pct=0.1)
    call_g, call_g_mask = _collapse_ploidy(g)

    # Test individual cases: variant 2 is all 1s except sample 1 (a 2),
    # with samples 0 and 9 masked.
    variant = 2
    call_g.loc[dict(variants=variant)] = 1
    call_g.loc[dict(variants=variant, samples=1)] = 2
    call_g_mask.loc[dict(variants=variant)] = False
    call_g_mask.loc[dict(variants=variant, samples=[0, 9])] = True

    imputed = _impute_genotype_call_with_variant_mean(call_g, call_g_mask)
    row = imputed.isel(variants=variant)
    assert row.isel(samples=1) == 2
    assert (row.isel(samples=slice(2, 9)) == 1).all()
    # Eight unmasked samples remain: seven 1s and a single 2.
    assert (row.isel(samples=[0, 9]) == (7 + 2) / 8).all()

Beispiel #15

0

Datei anzeigen

Datei: test_ibs.py Projekt: timothymillar/sgkit

def test_identity_by_state__chunked_sample_dimension():
    """identity_by_state must reject input chunked along the samples axis."""
    ds = simulate_genotype_call_dataset(n_variant=20, n_sample=10, n_ploidy=2)

    # Split the second (samples) axis into two chunks of five.
    sample_chunked = da.asarray(
        ds.call_genotype.data, chunks=((20,), (5, 5), (2,))
    )
    ds["call_genotype"] = (ds.call_genotype.dims, sample_chunked)

    expected_message = (
        "identity_by_state does not support chunking in the samples dimension"
    )
    with pytest.raises(NotImplementedError, match=expected_message):
        identity_by_state(ds)

Beispiel #16

0

Datei anzeigen

Datei: test_display.py Projekt: timothymillar/sgkit

def test_display_genotypes__duplicate_variant_ids():
    """Check display_genotypes output when two variants share the ID "V1".

    The expected text labels rows 0, 1, 2 rather than using the variant IDs.
    """
    ds = simulate_genotype_call_dataset(n_variant=3, n_sample=3, seed=0)
    # set some variant IDs
    ds["variant_id"] = (["variants"], np.array(["V0", "V1", "V1"]))
    ds["variant_id_mask"] = (["variants"], np.array([False, False, False]))
    disp = display_genotypes(ds)
    # Trailing whitespace in the expected text is significant (hence the noqa).
    expected = """\
        samples    S0   S1   S2
        variants               
        0         0/0  1/0  1/0
        1         0/1  1/0  0/1
        2         0/0  1/0  1/1"""  # noqa: W291
    assert str(disp) == dedent(expected)

Beispiel #17

0

Datei anzeigen

def simulate_dataset(gp: Any, chunks: int = -1) -> Dataset:
    """Wrap genotype probabilities ``gp`` in a simulated dataset.

    ``gp`` is converted to a dask array and rechunked along its third axis
    with ``chunks``. The simulated genotype call and mask variables are
    dropped and the probabilities are attached with dims
    ``("variants", "samples", "genotypes")``.
    """
    probabilities = da.asarray(gp).rechunk((None, None, chunks))
    n_variant, n_sample = probabilities.shape[0], probabilities.shape[1]
    simulated = simulate_genotype_call_dataset(
        n_variant=n_variant, n_sample=n_sample
    )
    without_calls = simulated.drop_vars(
        [variables.call_genotype, variables.call_genotype_mask]
    )
    probability_dims = ("variants", "samples", "genotypes")
    return without_calls.assign(
        {variables.call_genotype_probability: (probability_dims, probabilities)}
    )

Beispiel #18

0

Datei anzeigen

Datei: test_display.py Projekt: timothymillar/sgkit

def test_display_genotypes__truncated_rows():
    """Check display_genotypes output with max_variants=4 on 10 variants.

    The expected text shows the first two and last two rows around an
    ellipsis row, followed by the full "[10 rows x 10 columns]" footer.
    """
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, seed=0)
    disp = display_genotypes(ds, max_variants=4, max_samples=10)
    # Trailing whitespace in the expected text is significant (hence the noqa).
    expected = """\
        samples    S0   S1   S2   S3   S4   S5   S6   S7   S8   S9
        variants                                                  
        0         0/0  1/0  1/0  0/1  1/0  0/1  0/0  1/0  1/1  0/0
        1         1/0  0/1  1/0  1/1  1/1  1/0  1/0  0/0  1/0  1/1
        ...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
        8         0/1  0/0  1/0  0/1  0/1  1/0  1/0  0/1  1/0  1/0
        9         1/1  0/1  1/0  0/1  1/0  1/1  0/1  1/0  1/1  1/0

        [10 rows x 10 columns]"""  # noqa: W291
    assert str(disp) == dedent(expected)

Beispiel #19

0

Datei anzeigen

def test_save_and_load_dataset(tmp_path, is_path):
    """Round-trip a dataset through two zarr stores under ``tmp_path``.

    ``is_path`` selects whether the store location is passed as a path
    object or as its string form.
    """

    def as_target(location):
        # Pass the path object through, or its string form, per ``is_path``.
        return location if is_path else str(location)

    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)

    first_target = as_target(tmp_path / "ds.zarr")
    save_dataset(ds, first_target)
    reloaded = load_dataset(first_target)
    assert_identical(ds, reloaded)

    # save and load again to test https://github.com/pydata/xarray/issues/4386
    second_target = as_target(tmp_path / "ds2.zarr")
    save_dataset(reloaded, second_target)
    assert_identical(ds, load_dataset(second_target))

Beispiel #20

0

Datei anzeigen

Datei: test_display.py Projekt: timothymillar/sgkit

def test_display_genotypes():
    """Check the text and HTML output of display_genotypes on a 3x3 dataset.

    The text form must match exactly; the HTML table only needs to be
    contained in the output of ``_repr_html_()``.
    """
    ds = simulate_genotype_call_dataset(n_variant=3, n_sample=3, seed=0)
    disp = display_genotypes(ds)
    # Trailing whitespace in the expected text is significant (hence the noqa).
    expected = """\
        samples    S0   S1   S2
        variants               
        0         0/0  1/0  1/0
        1         0/1  1/0  0/1
        2         0/0  1/0  1/1"""  # noqa: W291
    assert str(disp) == dedent(expected)

    # Not dedented: the HTML literal starts at column 0 and is compared as-is.
    expected_html = """<table border="1" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>samples</th>
      <th>S0</th>
      <th>S1</th>
      <th>S2</th>
    </tr>
    <tr>
      <th>variants</th>
      <th></th>
      <th></th>
      <th></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <th>0</th>
      <td>0/0</td>
      <td>1/0</td>
      <td>1/0</td>
    </tr>
    <tr>
      <th>1</th>
      <td>0/1</td>
      <td>1/0</td>
      <td>0/1</td>
    </tr>
    <tr>
      <th>2</th>
      <td>0/0</td>
      <td>1/0</td>
      <td>1/1</td>
    </tr>
  </tbody>
</table>""".strip()
    assert expected_html in disp._repr_html_()

Beispiel #21

0

Datei anzeigen

Datei: test_display.py Projekt: timothymillar/sgkit

def test_display_genotypes__large():
    """Check display_genotypes on a 100,000 x 1,000 dataset.

    With max_variants=4 and max_samples=4 the expected text elides both the
    middle rows and the middle columns, while the footer reports the full shape.
    """
    ds = simulate_genotype_call_dataset(n_variant=100_000,
                                        n_sample=1000,
                                        seed=0)
    disp = display_genotypes(ds, max_variants=4, max_samples=4)
    # Trailing whitespace in the expected text is significant (hence the noqa).
    expected = """\
        samples    S0   S1  ... S998 S999
        variants            ...          
        0         0/0  1/0  ...  0/1  1/1
        1         1/1  1/1  ...  0/1  1/1
        ...       ...  ...  ...  ...  ...
        99998     0/1  1/1  ...  1/0  0/1
        99999     1/0  1/0  ...  1/0  1/0

        [100000 rows x 1000 columns]"""  # noqa: W291
    assert str(disp) == dedent(expected)

Beispiel #22

0

Datei anzeigen

Datei: test_ld.py Projekt: timothymillar/sgkit

def ldm_df(
    x: ArrayLike,
    size: int,
    step: Optional[int] = None,
    threshold: Optional[float] = None,
    diag: bool = False,
) -> DataFrame:
    """Return the computed LD matrix for dosages ``x`` as a data frame.

    ``x`` is attached as ``dosage`` to a simulated dataset, which is windowed
    by variant with ``size`` and ``step`` before ``ld_matrix`` is applied with
    ``threshold``. Rows where ``i == j`` are removed unless ``diag`` is true,
    and rows with a null ``value`` are always removed.
    """
    n_variant, n_sample = x.shape[0], x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], x)
    windowed = window_by_variant(ds, size=size, step=step)
    pairs = ld_matrix(windowed, threshold=threshold).compute()
    if not diag:
        off_diagonal = pairs["i"] != pairs["j"]
        pairs = pairs[off_diagonal]
    return pairs[pairs["value"].notnull()]

Beispiel #23

0

Datei anzeigen

Datei: test_ld.py Projekt: timothymillar/sgkit

def test_scores():
    """Check LD pruning selections with and without per-variant scores."""

    def sorted_drop_index(ld_pairs):
        # Sorted variant indexes chosen for removal from the given LD matrix.
        selection = maximal_independent_set(ld_pairs)
        return np.sort(selection.ld_prune_index_to_drop.data)

    # Start from all-zero dosages and give four rows non-zero entries.
    dosage = np.zeros((10, 10), dtype="uint8")
    # Make 3rd and 4th perfectly correlated (identical rows)
    dosage[2:4, :-1] = 1
    # Make 8th and 9th partially correlated with 3/4
    dosage[7:9, :-5] = 1

    n_variant, n_sample = dosage.shape
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    ds["dosage"] = (["variants", "samples"], dosage)
    ds = window_by_variant(ds, size=10)

    unscored_ldm = ld_matrix(ds, threshold=0.2)
    npt.assert_equal(sorted_drop_index(unscored_ldm), [3, 8])

    # check ld_prune removes correct variants
    pruned = ld_prune(ds, threshold=0.2)
    npt.assert_equal(pruned.variant_position.values, [0, 1, 2, 4, 5, 6, 7, 9])

    # break tie between 3rd and 4th so 4th wins
    scores = np.ones(10, dtype="float32")
    scores[2] = 0
    scores[3] = 2
    ds[variables.variant_score] = (["variants"], scores)

    scored_ldm = ld_matrix(
        ds, threshold=0.2, variant_score=variables.variant_score
    )
    npt.assert_equal(sorted_drop_index(scored_ldm), [2, 8])

    # check ld_prune removes correct variants
    pruned = ld_prune(ds, threshold=0.2, variant_score=variables.variant_score)
    npt.assert_equal(pruned.variant_position.values, [0, 1, 3, 4, 5, 6, 7, 9])

Beispiel #24

0

Datei anzeigen

Datei: test_display.py Projekt: timothymillar/sgkit

def test_display_genotypes__truncated_columns():
    """Check display_genotypes output with max_samples=4 on 10 samples.

    The expected text shows the first two and last two sample columns around
    an ellipsis column, followed by the full "[10 rows x 10 columns]" footer.
    """
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10, seed=0)
    disp = display_genotypes(ds, max_variants=10, max_samples=4)
    # Trailing whitespace in the expected text is significant (hence the noqa).
    expected = """\
        samples    S0   S1  ...   S8   S9
        variants            ...          
        0         0/0  1/0  ...  1/1  0/0
        1         1/0  0/1  ...  1/0  1/1
        2         1/1  1/1  ...  0/0  1/0
        3         0/1  0/0  ...  1/0  0/0
        4         0/1  0/0  ...  0/0  1/1
        5         1/1  1/0  ...  0/0  1/0
        6         1/1  0/0  ...  1/0  0/1
        7         1/0  0/1  ...  0/1  0/0
        8         0/1  0/0  ...  1/0  1/0
        9         1/1  0/1  ...  1/1  1/0

        [10 rows x 10 columns]"""  # noqa: W291
    assert str(disp) == dedent(expected)

Beispiel #25

0

Datei anzeigen

Datei: test_pca.py Projekt: timothymillar/sgkit

def simulate_dataset(
    n_variant: int = 100,
    n_sample: int = 50,
    n_cohort: Optional[int] = None,
    chunks: Any = (None, None),
) -> Dataset:
    """Simulate dataset with optional population structure.

    When ``n_cohort`` is truthy, ``call_alternate_allele_count`` is filled
    from ``simulate_cohort_genotypes``; otherwise it is derived from the
    simulated calls via ``count_call_alternate_alleles``. In both cases the
    variable is rechunked with ``chunks`` before the dataset is returned.
    """
    ds = simulate_genotype_call_dataset(n_variant, n_sample, seed=0)
    if n_cohort:
        # ``sizes`` replaces the deprecated mapping-style ``Dataset.dims`` lookup.
        ac = simulate_cohort_genotypes(
            ds.sizes["variants"], ds.sizes["samples"], n_cohort
        )
        ds["call_alternate_allele_count"] = xr.DataArray(
            ac, dims=("variants", "samples")
        )
    else:
        ds = count_call_alternate_alleles(ds)
    ds["call_alternate_allele_count"] = ds["call_alternate_allele_count"].chunk(chunks)
    return ds

Beispiel #26

0

Datei anzeigen

Datei: test_ld.py Projekt: timothymillar/sgkit

def test_vs_skallel(args):
    """Compare the pruned variant indexes with ``allel.locate_unlinked``.

    ``args`` unpacks to the dosage matrix, window size, step, threshold and
    the chunking applied along the first axis of the dosage array.
    """
    x, size, step, threshold, chunks = args

    n_variant, n_sample = x.shape[0], x.shape[1]
    ds = simulate_genotype_call_dataset(n_variant=n_variant, n_sample=n_sample)
    chunked_dosage = da.asarray(x).rechunk({0: chunks})
    ds["dosage"] = (["variants", "samples"], chunked_dosage)
    ds = window_by_variant(ds, size=size, step=step)

    ldm = ld_matrix(ds, threshold=threshold)
    # No (i, j) pair may be reported more than once.
    duplicated_pairs = ldm.compute().duplicated(subset=["i", "j"])
    assert not duplicated_pairs.any()

    selection = maximal_independent_set(ldm)
    dropped = np.sort(selection.ld_prune_index_to_drop.data)

    # The reference mask flags variants to keep, so invert it to get drops.
    unlinked = allel.locate_unlinked(x, size=size, step=step, threshold=threshold)
    dropped_reference = np.sort(np.argwhere(~unlinked).squeeze(axis=1))

    npt.assert_equal(dropped_reference, dropped)

Beispiel #27

0

Datei anzeigen

Datei: test_ibs.py Projekt: timothymillar/sgkit

def test_identity_by_state__reference_implementation(ploidy, chunks, seed):
    """Compare identity_by_state with a direct allele-frequency product.

    The dataset dimensions are the sums of the per-axis chunk sizes given in
    ``chunks`` (variants, samples, alleles).
    """
    variant_chunks, sample_chunks, allele_chunks = chunks[0], chunks[1], chunks[2]
    ds = simulate_genotype_call_dataset(
        n_variant=sum(variant_chunks),
        n_sample=sum(sample_chunks),
        n_ploidy=ploidy,
        n_allele=sum(allele_chunks),
        missing_pct=0.2,
        seed=seed,
    )
    ds = count_call_alleles(ds)
    rechunked_counts = ds.call_allele_count.data.rechunk(chunks)
    ds["call_allele_count"] = (ds.call_allele_count.dims, rechunked_counts)
    ds = identity_by_state(ds)
    actual = ds.stat_identity_by_state.values

    # reference implementation: sum over alleles of the product of every
    # pair of samples' allele frequencies, then a NaN-aware mean over variants
    frequencies = ds.call_allele_frequency.data
    pairwise = (
        frequencies[..., None, :, :] * frequencies[..., :, None, :]
    ).sum(axis=-1)
    expect = np.nanmean(pairwise, axis=0).compute()
    np.testing.assert_array_almost_equal(expect, actual)

Beispiel #28

0

Datei anzeigen

def simulate_regression_dataset(
    n_variant: int,
    n_sample: int,
    n_contig: int,
    n_covariate: int,
    n_trait: int,
    noise_scale: float = 0.01,
    seed: int = 0,
) -> Dataset:
    """Simulate a dataset with dosages, covariates and linearly derived traits.

    Dosage is the sum of ``call_genotype`` over the ploidy dimension. Traits
    are a linear combination of dosages and covariates with normally
    distributed effects, plus one noise column of scale ``noise_scale`` that
    is broadcast across all traits.
    """
    rs = np.random.RandomState(seed)
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant, n_sample=n_sample, n_contig=n_contig
    )
    dosage = ds["call_genotype"].sum(dim="ploidy")

    # All draws share one RandomState, so their order determines the result:
    # covariates, dosage effects, covariate effects, then noise.
    covariates = rs.normal(size=(n_sample, n_covariate))
    dosage_effects = rs.normal(size=(dosage.shape[0], n_trait))
    covariate_effects = rs.normal(size=(n_covariate, n_trait))
    noise = rs.normal(size=(n_sample, 1), scale=noise_scale)
    traits = dosage.T.data @ dosage_effects + covariates @ covariate_effects + noise

    ds["call_dosage"] = dosage
    ds["sample_covariate"] = (("samples", "covariates"), covariates)
    ds["sample_trait"] = (("samples", "traits"), traits)
    return ds

Beispiel #29

0

Datei anzeigen

Datei: test_ibs.py Projekt: timothymillar/sgkit

def test_identity_by_state__diploid_biallelic(chunks):
    """Check identity_by_state on a 2-variant, 3-sample diploid dataset.

    ``call_allele_count`` is rechunked with ``chunks`` when it is not None.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=2, n_sample=3, n_ploidy=2, n_allele=2, seed=2
    )
    ds = count_call_alleles(ds)
    if chunks is not None:
        rechunked_counts = ds.call_allele_count.data.rechunk(chunks)
        ds["call_allele_count"] = (ds.call_allele_count.dims, rechunked_counts)
    actual = identity_by_state(ds).stat_identity_by_state.values

    # One 3x3 matrix per variant, averaged over the variant axis.
    per_variant = np.array(
        [
            [[1.0, 0.0, 0.5], [0.0, 1.0, 0.5], [0.5, 0.5, 0.5]],
            [[1.0, 1.0, 0.5], [1.0, 1.0, 0.5], [0.5, 0.5, 0.5]],
        ]
    )
    expect = np.nanmean(per_variant, axis=0)
    np.testing.assert_array_equal(expect, actual)

Beispiel #30

0

Datei anzeigen

def test_simulate_genotype_call_dataset__zarr(tmp_path):
    """A simulated dataset written to zarr must compare equal when reopened."""
    ds = simulate_genotype_call_dataset(n_variant=10, n_sample=10)
    store_path = str(tmp_path / "ds.zarr")
    ds.to_zarr(store_path)
    xr.testing.assert_equal(ds, xr.open_zarr(store_path, concat_characters=False))  # type: ignore[no-untyped-call]