def test_agglomerative_clustering_memory_mapped():
    """AgglomerativeClustering must work on mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    rng = np.random.RandomState(0)
    Xmm = create_memmap_backed_data(rng.randn(50, 100))
    AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm)
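For context, a rough sketch of what create_memmap_backed_data does under the hood, assuming only a joblib dump/load round-trip; the real helper also registers an atexit cleanup for the temporary folder and supports an aligned=True mode, and make_memmap_backed is a hypothetical name:

import os
import tempfile

import joblib


def make_memmap_backed(data, mmap_mode="r"):
    # Dump to a temporary folder, then load the pickle back memory-mapped.
    folder = tempfile.mkdtemp(prefix="sklearn_testing_")
    filename = os.path.join(folder, "data.pkl")
    joblib.dump(data, filename)
    return joblib.load(filename, mmap_mode=mmap_mode)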
Example #2
def test_sparse_read_only_buffer(copy_X):
    """Test that sparse coordinate descent works for read-only buffers"""
    rng = np.random.RandomState(0)

    clf = ElasticNet(alpha=0.1, copy_X=copy_X, random_state=rng)
    X = sp.random(100, 20, format="csc", random_state=rng)

    # Make X.data read-only
    X.data = create_memmap_backed_data(X.data)

    y = rng.rand(100)
    clf.fit(X, y)
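With the default mmap_mode="r", the mapped buffer is truly read-only, which is what makes this a useful regression test; a quick hypothetical check of that property:

import numpy as np
import pytest

data = create_memmap_backed_data(np.zeros(3))
assert not data.flags.writeable
# NumPy raises "assignment destination is read-only" on write attempts.
with pytest.raises(ValueError, match="read-only"):
    data[0] = 1.0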
Example #3
def test_create_memmap_backed_data(monkeypatch, aligned):
    registration_counter = RegistrationCounter()
    monkeypatch.setattr(atexit, "register", registration_counter)

    input_array = np.ones(3)
    data = create_memmap_backed_data(input_array, aligned=aligned)
    check_memmap(input_array, data)
    assert registration_counter.nb_calls == 1

    data, folder = create_memmap_backed_data(input_array,
                                             return_folder=True,
                                             aligned=aligned)
    check_memmap(input_array, data)
    assert folder == os.path.dirname(data.filename)
    assert registration_counter.nb_calls == 2

    mmap_mode = "r+"
    data = create_memmap_backed_data(input_array,
                                     mmap_mode=mmap_mode,
                                     aligned=aligned)
    check_memmap(input_array, data, mmap_mode)
    assert registration_counter.nb_calls == 3

    input_list = [input_array, input_array + 1, input_array + 2]
    if aligned:
        with pytest.raises(
                ValueError,
                match="If aligned=True, input must be a single numpy array."):
            create_memmap_backed_data(input_list, aligned=True)
    else:
        mmap_data_list = create_memmap_backed_data(input_list, aligned=False)
        for input_array, data in zip(input_list, mmap_data_list):
            check_memmap(input_array, data)
        assert registration_counter.nb_calls == 4
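check_memmap and RegistrationCounter are helpers defined alongside this test; a minimal sketch of the behavior they need, reconstructed here rather than copied from upstream:

class RegistrationCounter:
    """Stand-in for atexit.register that only counts calls."""

    def __init__(self):
        self.nb_calls = 0

    def __call__(self, to_register_func, *args, **kwargs):
        self.nb_calls += 1


def check_memmap(input_array, mmap_data, mmap_mode="r"):
    assert isinstance(mmap_data, np.memmap)
    # mmap_mode="r" maps read-only; "r+" maps writeable.
    assert mmap_data.flags.writeable == (mmap_mode != "r")
    np.testing.assert_array_equal(input_array, mmap_data)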
Example #4
def test_create_memmap_backed_data(monkeypatch):
    registration_counter = RegistrationCounter()
    monkeypatch.setattr(atexit, 'register', registration_counter)

    input_array = np.ones(3)
    data = create_memmap_backed_data(input_array)
    check_memmap(input_array, data)
    assert registration_counter.nb_calls == 1

    data, folder = create_memmap_backed_data(input_array, return_folder=True)
    check_memmap(input_array, data)
    assert folder == os.path.dirname(data.filename)
    assert registration_counter.nb_calls == 2

    mmap_mode = 'r+'
    data = create_memmap_backed_data(input_array, mmap_mode=mmap_mode)
    check_memmap(input_array, data, mmap_mode)
    assert registration_counter.nb_calls == 3

    input_list = [input_array, input_array + 1, input_array + 2]
    mmap_data_list = create_memmap_backed_data(input_list)
    for input_array, data in zip(input_list, mmap_data_list):
        check_memmap(input_array, data)
    assert registration_counter.nb_calls == 4
Example #5
def test_mst_linkage_core_memory_mapped(metric):
    """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.

    Non-regression test for issue #19875.
    """
    rng = np.random.RandomState(seed=1)
    X = rng.normal(size=(20, 4))
    Xmm = create_memmap_backed_data(X)
    argdict = METRICS_DEFAULT_PARAMS[metric]
    keys = argdict.keys()
    for vals in itertools.product(*argdict.values()):
        kwargs = dict(zip(keys, vals))
        distance_metric = DistanceMetric.get_metric(metric, **kwargs)
        mst = mst_linkage_core(X, distance_metric)
        mst_mm = mst_linkage_core(Xmm, distance_metric)
        np.testing.assert_equal(mst, mst_mm)
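The itertools.product idiom above expands a dict of parameter tuples into one kwargs dict per combination; a self-contained illustration with made-up parameters:

import itertools

argdict = {"p": (1, 2), "w": ("a", "b")}
for vals in itertools.product(*argdict.values()):
    print(dict(zip(argdict.keys(), vals)))
# {'p': 1, 'w': 'a'}
# {'p': 1, 'w': 'b'}
# {'p': 2, 'w': 'a'}
# {'p': 2, 'w': 'b'}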
Example #6
def test_memmap_on_contiguous_data(dtype):
    """Test memory mapped array on contiguous memoryview."""
    x = np.arange(10).astype(dtype)
    assert x.flags["C_CONTIGUOUS"]
    assert x.flags["ALIGNED"]

    # _test_sum consumes contiguous arrays
    # def _test_sum(NUM_TYPES[::1] x):
    sum_origin = _test_sum(x)

    # now on memory mapped data
    # aligned=True to avoid https://github.com/joblib/joblib/issues/563;
    # without alignment, this can produce segmentation faults, see
    # https://github.com/scikit-learn/scikit-learn/pull/21654
    x_mmap = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
    sum_mmap = _test_sum(x_mmap)
    assert sum_mmap == pytest.approx(sum_origin, rel=1e-11)
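Whether the default (non-aligned) memmap is actually misaligned depends on the dtype and on how joblib lays out the pickle header, so the ALIGNED flag may or may not differ between the two modes; a hypothetical way to inspect it:

x_plain = create_memmap_backed_data(x, mmap_mode="r+")
x_aligned = create_memmap_backed_data(x, mmap_mode="r+", aligned=True)
# aligned=True guarantees the ALIGNED flag; the plain variant may not have it.
print(x_plain.flags["ALIGNED"], x_aligned.flags["ALIGNED"])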
Example #7
def test_memmap_backed_data(
    metric,
    PairwiseDistancesReduction,
    n_samples=512,
    n_features=100,
    dtype=np.float64,
):
    # Results must not depend on the dataset's writability
    rng = np.random.RandomState(0)
    spread = 100
    X = rng.rand(n_samples, n_features).astype(dtype) * spread
    Y = rng.rand(n_samples, n_features).astype(dtype) * spread

    # Create read only datasets
    X_mm, Y_mm = create_memmap_backed_data([X, Y])

    if PairwiseDistancesReduction is PairwiseDistancesArgKmin:
        parameter = 10
        check_parameters = {}
    else:
        # Scale the radius slightly with the number of dimensions
        radius = 10**np.log(n_features)
        parameter = radius
        check_parameters = {"radius": radius}

    ref_dist, ref_indices = PairwiseDistancesReduction.compute(
        X,
        Y,
        parameter,
        metric=metric,
        return_distance=True,
    )

    dist_mm, indices_mm = PairwiseDistancesReduction.compute(
        X_mm,
        Y_mm,
        parameter,
        metric=metric,
        return_distance=True,
    )

    ASSERT_RESULT[(PairwiseDistancesReduction, dtype)](ref_dist, dist_mm,
                                                       ref_indices, indices_mm,
                                                       **check_parameters)
Example #8
def test_loss_dtype(loss, readonly_memmap, dtype_in, dtype_out, sample_weight,
                    out1, out2, n_threads):
    """Test acceptance of dtypes, readonly and writeable arrays in loss functions.

    Check that the loss accepts inputs when all input arrays are either all
    float32 or all float64, and all output arrays are either all float32 or
    all float64.

    Also check that input arrays can be readonly, e.g. memory mapped.
    """
    loss = loss()
    # generate a y_true and raw_prediction in valid range
    n_samples = 5
    y_true, raw_prediction = random_y_true_raw_prediction(
        loss=loss,
        n_samples=n_samples,
        y_bound=(-100, 100),
        raw_bound=(-10, 10),
        seed=42,
    )
    y_true = y_true.astype(dtype_in)
    raw_prediction = raw_prediction.astype(dtype_in)

    if sample_weight is not None:
        sample_weight = np.array([2.0] * n_samples, dtype=dtype_in)
    if out1 is not None:
        out1 = np.empty_like(y_true, dtype=dtype_out)
    if out2 is not None:
        out2 = np.empty_like(raw_prediction, dtype=dtype_out)

    if readonly_memmap:
        y_true = create_memmap_backed_data(y_true, aligned=True)
        raw_prediction = create_memmap_backed_data(raw_prediction,
                                                   aligned=True)
        if sample_weight is not None:
            sample_weight = create_memmap_backed_data(sample_weight,
                                                      aligned=True)

    loss.loss(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        loss_out=out1,
        n_threads=n_threads,
    )
    loss.gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out2,
        n_threads=n_threads,
    )
    loss.loss_gradient(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        loss_out=out1,
        gradient_out=out2,
        n_threads=n_threads,
    )
    if out1 is not None and loss.is_multiclass:
        out1 = np.empty_like(raw_prediction, dtype=dtype_out)
    loss.gradient_hessian(
        y_true=y_true,
        raw_prediction=raw_prediction,
        sample_weight=sample_weight,
        gradient_out=out1,
        hessian_out=out2,
        n_threads=n_threads,
    )
    loss(y_true=y_true,
         raw_prediction=raw_prediction,
         sample_weight=sample_weight)
    loss.fit_intercept_only(y_true=y_true, sample_weight=sample_weight)
    loss.constant_to_optimal_zero(y_true=y_true, sample_weight=sample_weight)
    if hasattr(loss, "predict_proba"):
        loss.predict_proba(raw_prediction=raw_prediction)
    if hasattr(loss, "gradient_proba"):
        loss.gradient_proba(
            y_true=y_true,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            gradient_out=out1,
            proba_out=out2,
            n_threads=n_threads,
        )
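The many arguments of this test come from pytest.mark.parametrize decorators; a sketch of a plausible parametrization (values assumed, with loss additionally parametrized over the module's loss classes):

@pytest.mark.parametrize("readonly_memmap", [False, True])
@pytest.mark.parametrize("dtype_in", [np.float32, np.float64])
@pytest.mark.parametrize("dtype_out", [np.float32, np.float64])
@pytest.mark.parametrize("sample_weight", [None, 1])
@pytest.mark.parametrize("out1", [None, 1])
@pytest.mark.parametrize("out2", [None, 1])
@pytest.mark.parametrize("n_threads", [1, 2])
def test_loss_dtype(loss, readonly_memmap, dtype_in, dtype_out,
                    sample_weight, out1, out2, n_threads):
    ...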
Example #9
import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils._testing import create_memmap_backed_data
from sklearn.utils.fixes import sp_version, parse_version


def dist_func(x1, x2, p):
    return np.sum((x1 - x2) ** p) ** (1.0 / p)


rng = check_random_state(0)
d = 4
n1 = 20
n2 = 25
X1 = rng.random_sample((n1, d)).astype("float64", copy=False)
X2 = rng.random_sample((n2, d)).astype("float64", copy=False)

[X1_mmap, X2_mmap] = create_memmap_backed_data([X1, X2])

# make boolean arrays: ones and zeros
X1_bool = X1.round(0)
X2_bool = X2.round(0)

[X1_bool_mmap, X2_bool_mmap] = create_memmap_backed_data([X1_bool, X2_bool])


V = rng.random_sample((d, d))
VI = np.dot(V, V.T)

BOOL_METRICS = [
    "matching",
    "jaccard",
    "dice",
Example #10
#%%
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from sklearn.utils import shuffle
from sklearn.utils._testing import create_memmap_backed_data
from sklearn.cluster import AgglomerativeClustering

# code copied from `check_clustering` in sklearn/utils/estimator_checks.py
X, y = make_blobs(n_samples=100, random_state=30)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)
rng = np.random.RandomState(7)
X_noise = np.concatenate([X, rng.uniform(low=-3, high=3, size=(5, 2))])
X, y, X_noise = create_memmap_backed_data([X, y, X_noise])

import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
sns.scatterplot(x=X[:, 0], y=X[:, 1], ax=ax)

# the params that triggered the error:
# breaks for linkage="single" with affinity "l1" or "l2"
ag = AgglomerativeClustering(affinity="euclidean", linkage="single")
ag.fit(X)

#%%

# sanity check: count the distinct rows in X after memmapping
len(np.unique(X, axis=0))
Example #11
def _create_memmap_backed_data(data):
    return create_memmap_backed_data(
        data, mmap_mode="r", return_folder=False, aligned=True
    )
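A usage sketch for the wrapper above:

X = np.arange(6.0).reshape(3, 2)
X_readonly = _create_memmap_backed_data(X)
assert not X_readonly.flags.writeable  # mmap_mode="r" maps read-only
np.testing.assert_array_equal(X, X_readonly)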
Example #12

import numpy as np

from sklearn.utils import check_random_state
from sklearn.utils._testing import create_memmap_backed_data


def dist_func(x1, x2, p):
    return np.sum((x1 - x2) ** p) ** (1.0 / p)


rng = check_random_state(0)
d = 4
n1 = 20
n2 = 25
X64 = rng.random_sample((n1, d))
Y64 = rng.random_sample((n2, d))
X32 = X64.astype("float32")
Y32 = Y64.astype("float32")

[X_mmap, Y_mmap] = create_memmap_backed_data([X64, Y64])

# make boolean arrays: ones and zeros
X_bool = X64.round(0)
Y_bool = Y64.round(0)

[X_bool_mmap, Y_bool_mmap] = create_memmap_backed_data([X_bool, Y_bool])

V = rng.random_sample((d, d))
VI = np.dot(V, V.T)

METRICS_DEFAULT_PARAMS = [
    ("euclidean", {}),
    ("cityblock", {}),
    ("minkowski", dict(p=(1, 1.5, 2, 3))),
    ("chebyshev", {}),