Example #1
0
def test_random_split_bag():
    """Exercise ``random_split`` on a dask bag built from ``db.range(1000)``.

    Asserts that the first output's name is reproduced for an identical
    fraction and seed, changes when the fraction changes, and differs
    between two unseeded calls. After computing, the two pieces must
    together contain exactly the 1000 input elements, with between 75%
    and 85% of them in the first piece.
    """
    bag = db.range(1000, npartitions=10)
    train, test = random_split(bag, 0.2, 123)

    # Same fraction and seed -> same name.
    repeat_train = random_split(bag, 0.2, 123)[0]
    assert repeat_train.name == train.name
    # Different fraction, same seed -> different name.
    other_fraction_train = random_split(bag, 0.3, 123)[0]
    assert other_fraction_train.name != train.name
    # Two calls without a seed must not share a name.
    first_unseeded = random_split(bag, 0.2)[0]
    second_unseeded = random_split(bag, 0.2)[0]
    assert first_unseeded.name != second_unseeded.name

    train_items, test_items = dask.compute(train, test)
    train_fraction = len(train_items) / 1000
    assert 0.75 < train_fraction < 0.85
    assert len(train_items) + len(test_items) == 1000
    # Every input element must appear in one of the two pieces.
    assert set(train_items).union(test_items) == set(range(1000))
Example #2
0
def test_random_split_errors():
    """Check the exceptions ``random_split`` raises for invalid arguments.

    A fraction of ``2`` or ``-1``, or a string passed as the third
    argument, must raise ``ValueError``; a plain string passed as the
    collection must raise ``TypeError``.
    """
    bag = db.range(1000, npartitions=10)

    # Trailing positional arguments that must each trigger a ValueError.
    invalid_trailing_args = (
        (2,),
        (-1,),
        (0.5, "not-a-seed-or-RandomState"),
    )
    for trailing in invalid_trailing_args:
        with pytest.raises(ValueError):
            random_split(bag, *trailing)

    with pytest.raises(TypeError):
        random_split("not-a-dask-object", 0.5)
Example #3
0
def test_random_split_array():
    """Exercise ``random_split`` on 1-D and 2-D dask arrays of 1000 rows.

    For both inputs the outputs must keep the input dtype, the computed
    shapes must equal the shapes reported by the lazy outputs, and the
    pieces must hold all 1000 rows with between 75% and 85% in the first
    piece. The 1-D case additionally checks output naming and that the
    union of both pieces equals ``range(1000)``.
    """

    def compute_and_check(lazy_train, lazy_test):
        # Compute both pieces in a single call, then verify shapes and sizes.
        got_train, got_test = dask.compute(lazy_train, lazy_test)
        assert got_train.shape == lazy_train.shape
        assert got_test.shape == lazy_test.shape
        train_fraction = len(got_train) / 1000
        assert 0.75 < train_fraction < 0.85
        assert len(got_train) + len(got_test) == 1000
        return got_train, got_test

    # 1D
    values = np.arange(1000)
    arr = da.from_array(values, chunks=100)
    train, test = random_split(arr, 0.2, 123)
    assert train.dtype == test.dtype
    assert test.dtype == arr.dtype

    # Same fraction and seed -> same name; a new fraction -> a new name.
    repeat_train = random_split(arr, 0.2, 123)[0]
    assert repeat_train.name == train.name
    other_fraction_train = random_split(arr, 0.3, 123)[0]
    assert other_fraction_train.name != train.name
    # Two calls without a seed must not share a name.
    first_unseeded = random_split(arr, 0.2)[0]
    second_unseeded = random_split(arr, 0.2)[0]
    assert first_unseeded.name != second_unseeded.name

    train_c, test_c = compute_and_check(train, test)
    assert set(train_c).union(test_c) == set(range(1000))

    # 2D
    values = np.arange(1000).reshape((1000, 1))
    arr = da.from_array(values, chunks=100)
    train, test = random_split(arr, 0.2, 123)
    assert train.dtype == test.dtype
    assert test.dtype == arr.dtype

    compute_and_check(train, test)
Example #4
0
def test_random_split_matrix():
    """Exercise ``random_split`` on ``dm`` matrices: 1-D, 2-D and sparse.

    The dense cases assert that dtype and ``ndim`` carry over, that the
    reported shapes have ``None`` as the first dimension, and that the
    computed pieces hold all 1000 rows with between 75% and 85% in the
    first piece. The 1-D case also checks output naming and element
    coverage. The sparse case asserts that both computed pieces are
    sparse and that their row counts satisfy the same bounds.
    """

    def compute_and_check_sizes(lazy_train, lazy_test):
        # Compute both pieces in a single call and verify the split sizes.
        got_train, got_test = dask.compute(lazy_train, lazy_test)
        train_fraction = len(got_train) / 1000
        assert 0.75 < train_fraction < 0.85
        assert len(got_train) + len(got_test) == 1000
        return got_train, got_test

    # 1D
    values = np.arange(1000)
    mat = dm.from_array(da.from_array(values, chunks=100))
    train, test = random_split(mat, 0.2, 123)
    assert train.dtype == test.dtype
    assert test.dtype == mat.dtype
    assert train.ndim == test.ndim
    assert test.ndim == 1
    assert train.shape == test.shape
    assert test.shape == (None,)

    # Same fraction and seed -> same name; a new fraction -> a new name.
    repeat_train = random_split(mat, 0.2, 123)[0]
    assert repeat_train.name == train.name
    other_fraction_train = random_split(mat, 0.3, 123)[0]
    assert other_fraction_train.name != train.name
    # Two calls without a seed must not share a name.
    first_unseeded = random_split(mat, 0.2)[0]
    second_unseeded = random_split(mat, 0.2)[0]
    assert first_unseeded.name != second_unseeded.name

    train_c, test_c = compute_and_check_sizes(train, test)
    assert set(train_c).union(test_c) == set(range(1000))

    # 2D
    values = np.arange(1000).reshape((1000, 1))
    mat = dm.from_array(da.from_array(values, chunks=100))
    train, test = random_split(mat, 0.2, 123)
    assert train.dtype == test.dtype
    assert test.dtype == mat.dtype
    assert train.ndim == test.ndim
    assert test.ndim == 2
    assert train.shape == test.shape
    assert test.shape == (None, 1)

    compute_and_check_sizes(train, test)

    # Sparse
    # Convert each partition of the 2-D matrix to CSR before splitting.
    mat = mat.map_partitions(sparse.csr_matrix, shape=mat.shape, dtype=mat.dtype)
    train, test = random_split(mat, 0.2, 123)
    train_c, test_c = dask.compute(train, test)
    n_train_rows = train_c.shape[0]
    assert 0.75 < n_train_rows / 1000 < 0.85
    assert n_train_rows + test_c.shape[0] == 1000
    assert sparse.issparse(train_c) and sparse.issparse(test_c)