Example #1
0
def _rechunk_bgen(shared_datadir: Path, tmp_path: Path,
                  **kwargs: Any) -> Tuple[xr.Dataset, xr.Dataset, str]:
    """Read ``example.bgen`` and rechunk it into a store under ``tmp_path``.

    The example file is read with ``chunks=(10, -1, -1)`` and then passed to
    ``rechunk_bgen`` together with ``tmp_path / "example.zarr"`` and any extra
    keyword arguments.

    Returns:
        A tuple of the dataset as read, the result of ``rechunk_bgen``, and
        the store path converted to ``str``.
    """
    source_ds = read_bgen(shared_datadir / "example.bgen", chunks=(10, -1, -1))
    zarr_store = tmp_path / "example.zarr"
    rechunked_ds = rechunk_bgen(source_ds, zarr_store, **kwargs)
    return source_ds, rechunked_ds, str(zarr_store)
Example #2
0
def test_read_bgen__with_sample_file(shared_datadir):
    """Sample IDs should be the ones from the separate .sample file."""
    # The example file was generated using
    # qctool -g sgkit_bgen/tests/data/example.bgen -og sgkit_bgen/tests/data/example-separate-samples.bgen -os sgkit_bgen/tests/data/example-separate-samples.sample -incl-samples sgkit_bgen/tests/data/samples
    # Then editing example-separate-samples.sample to change the sample IDs
    expected_ids = ["s1", "s2", "s3", "s4", "s5"]
    ds = read_bgen(shared_datadir / "example-separate-samples.bgen")
    # The IDs below only exist in the edited .sample file.
    actual_ids = ds["sample_id"].values.tolist()
    assert actual_ids == expected_ids
Example #3
0
def test_read_bgen__contig_dtype(shared_datadir, dtype):
    """Check the dtype of ``variant_contig`` for a requested ``contig_dtype``.

    For string-kind requests (NumPy kinds ``"U"`` and ``"S"``) the variable is
    expected to be ``int64``; for any other request it is expected to have
    exactly the requested dtype.
    """
    ds = read_bgen(shared_datadir / "example.bgen", contig_dtype=dtype)
    requested = np.dtype(dtype)
    is_string_kind = requested.kind in {"U", "S"}
    expected_dtype = np.int64 if is_string_kind else requested
    assert ds["variant_contig"].dtype == expected_dtype
Example #4
0
def test_read_bgen__fancy_index(shared_datadir, chunks):
    """Index several variants at once with ``INDEXES`` and compare values.

    Both the genotype probabilities and the dosages for sample 0 are compared
    with their expected arrays to three decimal places.
    """
    ds = read_bgen(shared_datadir / "example.bgen", chunks=chunks)
    # (variable name, expected values) pairs, checked in this order.
    cases = (
        ("call_genotype_probability", EXPECTED_PROBABILITIES),
        ("call_dosage", EXPECTED_DOSAGES),
    )
    for variable, expected in cases:
        npt.assert_almost_equal(ds[variable][INDEXES, 0], expected, decimal=3)
Example #5
0
def test_read_bgen__with_no_samples(shared_datadir):
    """Sample IDs should be generated when the file carries none."""
    # The example file was generated using
    # qctool -g sgkit_bgen/tests/data/example.bgen -og sgkit_bgen/tests/data/example-no-samples.bgen -os sgkit_bgen/tests/data/example-no-samples.sample -bgen-omit-sample-identifier-block -incl-samples sgkit_bgen/tests/data/samples
    # Then deleting example-no-samples.sample
    generated_ids = [
        b"sample_0",
        b"sample_1",
        b"sample_2",
        b"sample_3",
        b"sample_4",
    ]
    ds = read_bgen(shared_datadir / "example-no-samples.bgen")
    # With no sample block and no .sample file, the IDs are generated.
    actual_ids = ds["sample_id"].values.tolist()
    assert actual_ids == generated_ids
Example #6
0
def test_read_bgen__scalar_index(shared_datadir, chunks):
    """Index one variant at a time and compare against the expected values.

    For each entry of ``INDEXES`` the sample-0 probabilities and dosage are
    checked, and then each of the three genotype probabilities individually,
    all to three decimal places.
    """
    ds = read_bgen(shared_datadir / "example.bgen", chunks=chunks)
    probabilities = ds["call_genotype_probability"]
    dosages = ds["call_dosage"]

    for row, variant_ix in enumerate(INDEXES):
        npt.assert_almost_equal(
            probabilities[variant_ix, 0],
            EXPECTED_PROBABILITIES[row],
            decimal=3,
        )
        npt.assert_almost_equal(
            dosages[variant_ix, 0],
            EXPECTED_DOSAGES[row],
            decimal=3,
        )
        # Fully scalar lookups: one genotype probability per access.
        for genotype in range(3):
            npt.assert_almost_equal(
                probabilities[variant_ix, 0, genotype],
                EXPECTED_PROBABILITIES[row, genotype],
                decimal=3,
            )
Example #7
0
def test_read_bgen(shared_datadir, chunks):
    """Spot-check shapes, values and masks of the dataset from ``example.bgen``.

    Dosages and genotype probabilities are checked at variants 1 and 100 for
    sample 0, and the corresponding mask variables are checked for the first
    two samples of variant 0.
    """
    ds = read_bgen(shared_datadir / "example.bgen", chunks=chunks)

    # check some of the data (in different chunks)
    assert ds["call_dosage"].shape == _shape("variants", "samples")
    dosage = ds["call_dosage"].values
    npt.assert_almost_equal(dosage[1, 0], 1.987, decimal=3)
    npt.assert_almost_equal(dosage[100, 0], 0.160, decimal=3)

    dosage_mask = ds["call_dosage_mask"].values
    npt.assert_array_equal(dosage_mask[0, 0], [True])
    npt.assert_array_equal(dosage_mask[0, 1], [False])

    assert ds["call_genotype_probability"].shape == _shape(
        "variants", "samples", "genotypes")
    probability = ds["call_genotype_probability"].values
    npt.assert_almost_equal(probability[1, 0],
                            [0.005, 0.002, 0.992],
                            decimal=3)
    npt.assert_almost_equal(probability[100, 0],
                            [0.916, 0.007, 0.076],
                            decimal=3)

    probability_mask = ds["call_genotype_probability_mask"].values
    npt.assert_array_equal(probability_mask[0, 0], [True, True, True])
    npt.assert_array_equal(probability_mask[0, 1], [False, False, False])
Example #8
0
def test_read_bgen__invalid_chunks(shared_datadir):
    """A two-item ``chunks`` tuple must be rejected with ``ValueError``."""
    expected_message = "`chunks` must be tuple with 3 items"
    bgen_path = shared_datadir / "example.bgen"
    with pytest.raises(ValueError, match=expected_message):
        read_bgen(bgen_path, chunks=(100, -1))  # type: ignore[arg-type]
Example #9
0
def test_read_bgen__invalid_contig_dtype(shared_datadir, dtype):
    """An unsupported ``contig_dtype`` must be rejected with ``ValueError``."""
    expected_message = "`contig_dtype` must be of string or int type"
    bgen_path = shared_datadir / "example.bgen"
    with pytest.raises(ValueError, match=expected_message):
        read_bgen(bgen_path, contig_dtype=dtype)
Example #10
0
def test_read_bgen__invalid_gp_dtype(shared_datadir, dtype):
    """An unsupported ``gp_dtype`` must be rejected with ``ValueError``."""
    expected_message = "`gp_dtype` must be a floating point data type"
    bgen_path = shared_datadir / "example.bgen"
    with pytest.raises(ValueError, match=expected_message):
        read_bgen(bgen_path, gp_dtype=dtype)
Example #11
0
def test_read_bgen__gp_dtype(shared_datadir, dtype):
    """Both probability-derived variables should use the requested ``gp_dtype``."""
    ds = read_bgen(shared_datadir / "example.bgen", gp_dtype=dtype)
    expected_dtype = np.dtype(dtype)
    # Checked in this order: probabilities first, then dosage.
    for variable in ("call_genotype_probability", "call_dosage"):
        assert ds[variable].dtype == expected_dtype