def test_raise_on_both_path_types():
    """Passing `path` together with `bed_path` must raise a ValueError."""
    # `pytest.raises(match=...)` searches this text as a regular expression
    # against the string form of the raised exception.
    expected_message = "Either `path` or all 3 of `{bed,bim,fam}_path` must be specified but not both"
    with pytest.raises(ValueError, match=expected_message):
        read_plink(path="x", bed_path="x")
def test_same_as_the_reference_implementation() -> None:
    """
    Compare the kinship estimates produced by `pc_relate` with the stored
    results of the reference R implementation (GENESIS).
    """

    data_dir = Path(__file__).parent
    # NOTE(review): the dataset path is a bare relative string, whereas the
    # CSV fixtures below are resolved against this file's directory - confirm
    # that this is intended.
    ds = read_plink(path="hapmap_JPT_CHB_r23a_filtered")

    # Precomputed principal components (CSV columns 1 and 2) are attached to
    # the dataset as the sample PCA projection.
    pcs_frame = pd.read_csv((data_dir / "pcs.csv").as_posix(), usecols=[1, 2])
    pcs = da.from_array(pcs_frame.to_numpy())
    ds[sample_pca_projection] = (("samples", "components"), pcs)

    phi = pc_relate(ds).pc_relate_phi.compute()

    expected_samples = 90
    assert isinstance(phi, xr.DataArray)
    assert phi.shape == (expected_samples, expected_samples)

    # Get genesis/reference results, kept as a single-column 2-D array:
    reference_kin = pd.read_csv(data_dir / "kinbtwe.csv")[["kin"]].to_numpy()

    # Only the entries strictly above the diagonal are compared.
    upper_idx = np.triu_indices_from(phi.data, 1)  # type: ignore[no-untyped-call]
    phi_pairs = phi.data[upper_idx]
    assert phi_pairs.size == reference_kin.size
    # The transposed reference is a single row, which broadcasts against the
    # flat vector of computed values.
    assert np.allclose(phi_pairs, reference_kin.T)
def test_read_multi_path(shared_datadir, ds1):
    """Reading via three explicit file paths must equal the `ds1` dataset."""
    base = shared_datadir / example_dataset_1
    bed_file = base.with_suffix(".bed")
    bim_file = base.with_suffix(".bim")
    fam_file = base.with_suffix(".fam")

    from_explicit_paths = read_plink(
        bed_path=bed_file,
        bim_path=bim_file,
        fam_path=fam_file,
        bim_sep="\t",
        fam_sep="\t",
    )

    xr.testing.assert_equal(ds1, from_explicit_paths)
Beispiel #4
0
def load_plink(paths: PLINKPaths, contig: Contig) -> Dataset:
    """Read the PLINK files for one contig and prepare the resulting dataset.

    The bed/bim/fam files named by ``paths`` are read with dask configured to
    use the threaded scheduler. ``sample_id`` is then cast to ``int32``, the
    per-sample family, parent and phenotype variables are dropped, and the
    dataset is passed through ``transform_contig`` together with ``contig``.
    """
    logger.info(
        f"Loading PLINK dataset for contig {contig} from {paths.bed_path}")

    # The scheduler override applies only while `read_plink` is being called.
    with dask.config.set(scheduler="threads"):
        plink_ds = read_plink(
            bed_path=paths.bed_path,
            bim_path=paths.bim_path,
            fam_path=paths.fam_path,
            bim_int_contig=False,
            count_a1=False,
        )

    plink_ds["sample_id"] = plink_ds["sample_id"].astype("int32")

    # These variables are discarded because all useful sample metadata
    # will come from the main UKB dataset instead.
    unused_sample_vars = [
        "sample_family_id",
        "sample_paternal_id",
        "sample_maternal_id",
        "sample_phenotype",
    ]
    plink_ds = plink_ds.drop_vars(unused_sample_vars)

    # Update contig index/names
    return transform_contig(plink_ds, contig)
import urllib.request

from sgkit.io.plink import read_plink

if __name__ == "__main__":
    # Download the three files of the simulated PLINK example dataset under
    # their original names, then load the dataset and print it.
    for ext in [".bed", ".bim", ".fam"]:
        source_url = f"https://github.com/pystatgen/sgkit/raw/main/sgkit/tests/io/plink/data/plink_sim_10s_100v_10pmiss{ext}"
        local_name = f"plink_sim_10s_100v_10pmiss{ext}"
        urllib.request.urlretrieve(source_url, local_name)

    # `read_plink` receives the shared file prefix, without any extension.
    ds = read_plink(path="plink_sim_10s_100v_10pmiss")
    print(ds)
def ds1(shared_datadir, request):
    """Load example dataset 1 with tab-separated bim/fam files.

    Every entry of ``request.param`` is forwarded to ``read_plink`` as an
    additional keyword argument.
    """
    dataset_path = shared_datadir / example_dataset_1
    extra_kwargs = request.param
    return read_plink(
        path=dataset_path,
        bim_sep="\t",
        fam_sep="\t",
        **extra_kwargs,
    )