Beispiel #1
0
def test_find_bins():
    """Check shape, dtype and a few probe values of the default binning.

    ``find_bins`` is called on the module-level ``DATA`` with its default
    number of bins; the result is expected to have one row of 256 values
    per feature and to keep the dtype of the input.
    """
    thresholds = find_bins(DATA, random_state=0)
    assert thresholds.shape == (2, 256)
    assert thresholds.dtype == DATA.dtype

    # Positions probed in each feature's row (a quarter, half and three
    # quarters of the way through the 256 entries).
    probe_positions = [64, 128, 192]

    # (expected values at the probe positions, absolute tolerance), one
    # entry per feature row.
    # NOTE(review): the expected values presumably reflect how DATA is
    # generated (feature 0 centred on 0, feature 1 tight around 10) --
    # confirm against the DATA definition.
    expectations = [
        ([-0.7, 0.0, 0.7], 1e-1),
        ([9.99, 10.00, 10.01], 1e-2),
    ]
    for feature_idx, (expected, tolerance) in enumerate(expectations):
        assert np.allclose(thresholds[feature_idx][probe_positions],
                           np.array(expected), atol=tolerance)
Beispiel #2
0
def test_map_to_bins(n_bins):
    """Check the layout of the binned output and the bins of the extremes.

    Parameters
    ----------
    n_bins : int
        Number of bins requested from ``find_bins``.
        NOTE(review): presumably supplied by a pytest parametrization or
        fixture that is not visible here -- confirm.
    """
    thresholds = find_bins(DATA, n_bins=n_bins, random_state=0)
    binned = map_to_bins(DATA, thresholds)

    # The output mirrors the input shape, is stored as uint8 and is
    # Fortran-contiguous.
    assert binned.shape == DATA.shape
    assert binned.dtype == np.uint8
    assert binned.flags.f_contiguous

    # For every feature, the row holding the smallest value must land in
    # the first bin and the row holding the largest value in the last one.
    extreme_rows_and_bins = (
        (DATA.argmin(axis=0), 0),
        (DATA.argmax(axis=0), n_bins - 1),
    )
    for row_per_feature, expected_bin in extreme_rows_and_bins:
        for feature_idx, row_idx in enumerate(row_per_feature):
            assert binned[row_idx, feature_idx] == expected_bin
Beispiel #3
0
from time import time
import numpy as np
from joblib import Memory
from pygbm.binning import find_bins, map_to_bins

# joblib cache rooted at /tmp; used below (via ``@m.cache``) so the generated
# dataset does not have to be recomputed on every run of this script.
m = Memory(location='/tmp')


@m.cache
def make_data(n_samples=int(1e8), n_features=5, seed=42, dtype=np.float32):
    """Generate a reproducible matrix of standard-normal samples.

    The result is memoized through ``m.cache``, so repeated calls with the
    same arguments can be served from the cache instead of regenerated.

    Parameters
    ----------
    n_samples : int
        Number of rows to draw.
    n_features : int
        Number of columns to draw.
    seed : int
        Seed for the ``numpy.random.RandomState`` used to draw the samples.
    dtype : numpy dtype
        dtype the drawn samples are cast to before being returned.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, n_features)`` with the requested dtype.
    """
    generator = np.random.RandomState(seed)
    samples = generator.randn(n_samples, n_features)
    return samples.astype(dtype)


# Benchmark driver: obtain the dataset (through the cached make_data), compute
# the bins, then time only the map_to_bins call and report its throughput.
print("Generating random data...")
data = make_data(n_samples=int(1e8), n_features=5, seed=42, dtype=np.float32)

print("Extracting bins from subsample of data...")
bins = find_bins(data, random_state=0)

print("Mapping data to integer bins...")
tic = time()
binned = map_to_bins(data, bins)
duration = time() - tic

# Input size in GB and processing rate in MB/s, both based on data.nbytes.
input_gb = data.nbytes / 1e9
throughput_mb_per_s = data.nbytes / 1e6 / duration
print(f"Processed {input_gb:0.3f} GB in {duration:0.3f}s"
      f" ({throughput_mb_per_s:0.1f} MB/s)")
print(f"Output size: {binned.nbytes / 1e9:0.3f} GB")
Beispiel #4
0
def test_find_bins_invalid_n_bins():
    """``find_bins`` must raise ``ValueError`` when asked for 1024 bins."""
    # NOTE(review): presumably rejected because it exceeds the number of
    # bins the binned representation can index -- confirm against find_bins.
    requested_bins = 1024
    with pytest.raises(ValueError):
        find_bins(DATA, n_bins=requested_bins)
Beispiel #5
0
def test_find_bins_low_n_bins():
    """Requesting 128 bins yields a (2, 128) result with DATA's dtype."""
    requested_bins = 128
    thresholds = find_bins(DATA, n_bins=requested_bins, random_state=0)

    expected_shape = (2, requested_bins)
    assert thresholds.shape == expected_shape
    assert thresholds.dtype == DATA.dtype