def test_fit():
    """Check ``fit`` followed by ``predict`` against the estimator's own ``predict``.

    Uses the names ``X``, ``Y``, ``Z`` and ``z``, which are defined outside
    this function. Everything runs with the single-threaded scheduler
    configured.
    """
    with dask.config.set(scheduler="single-threaded"):
        labels = np.array([-1, 0, 1])
        model = fit(SGDClassifier(max_iter=5, tol=1e-3), X, Y, classes=labels)

        # Reference answer from the fitted estimator itself, using ``z``.
        expected = model.predict(z)
        predicted = predict(model, Z)

        # The chunked prediction has a single axis split into two chunks of
        # length two ...
        assert predicted.chunks == ((2, 2), )
        # ... and, once computed, matches the reference element for element.
        assert predicted.compute().tolist() == expected.tolist()
def test_dataframes():
    """Check ``fit``/``predict`` on dataframe inputs built with ``dd.from_pandas``.

    A ten-row pandas frame is split into two partitions. The model is fitted
    on the partitioned column ``x`` with target ``y``, and its prediction on
    the partitioned data must equal the prediction on the pandas frame.
    """
    frame = pd.DataFrame({"x": range(10), "y": [0, 1] * 5})
    partitioned = dd.from_pandas(frame, npartitions=2)

    with dask.config.set(scheduler="single-threaded"):
        model = fit(
            SGDClassifier(max_iter=5, tol=1e-3),
            partitioned[["x"]],
            partitioned.y,
            classes=[0, 1],
        )

        # Reference prediction from the in-memory pandas frame.
        expected = model.predict(frame[["x"]])
        predicted = predict(model, partitioned[["x"]])

        da.utils.assert_eq(expected, predicted)
def test_fit_shuffle_blocks():
    """Check how ``random_state`` and ``shuffle_blocks`` affect ``fit``.

    Three scenarios are exercised:

    * without passing ``shuffle_blocks``, seeds 0 and 42 must yield
      different coefficients;
    * with ``shuffle_blocks=False``, seeds 0 and 42 must yield
      (numerically) equal coefficients;
    * passing a ``da.random.RandomState`` as ``random_state`` together with
      ``shuffle_blocks=True`` must raise ``ValueError``.
    """
    n_samples = 10
    features = da.from_array(1 + np.arange(n_samples).reshape(-1, 1), chunks=1)
    targets = da.from_array(np.ones(n_samples), chunks=1)
    classes = [0, 1]
    seeds = (0, 42)

    sgd = SGDClassifier(
        max_iter=5,
        random_state=0,
        fit_intercept=False,
        shuffle=False,
        tol=1e-3,
    )

    # Scenario 1: only the seed differs between the two fits.
    seeded_first, seeded_second = [
        fit(clone(sgd), features, targets, random_state=seed, classes=classes)
        for seed in seeds
    ]
    assert len(seeded_first.coef_) == len(seeded_second.coef_) == 1
    assert not np.allclose(seeded_first.coef_, seeded_second.coef_)

    # Scenario 2: same pair of seeds, but block shuffling switched off.
    features, targets = make_classification(random_state=0, chunks=20)
    no_shuffle = {"classes": classes, "shuffle_blocks": False}
    fixed_first, fixed_second = [
        fit(clone(sgd), features, targets, random_state=seed, **no_shuffle)
        for seed in seeds
    ]
    assert np.allclose(fixed_first.coef_, fixed_second.coef_)

    # Scenario 3: a dask RandomState is rejected as a seed.
    with pytest.raises(ValueError, match="cannot be used to seed"):
        fit(
            sgd,
            features,
            targets,
            classes=np.array([-1, 0, 1]),
            shuffle_blocks=True,
            random_state=da.random.RandomState(42),
        )
def test_no_compute():
    """``fit`` called with ``compute=False`` must hand back a ``Delayed`` object.

    Uses the names ``X`` and ``Y``, which are defined outside this function.
    """
    labels = np.array([-1, 0, 1])
    estimator = SGDClassifier(max_iter=5, tol=1e-3)

    deferred = fit(estimator, X, Y, classes=labels, compute=False)

    assert isinstance(deferred, Delayed)
def test_no_partial_fit_raises():
    """``fit`` must reject a ``RandomForestClassifier`` with ``ValueError``.

    The error message is required to mention ``RandomForestClassifier``.
    Going by the test name, the rejection is presumably because that
    estimator has no ``partial_fit``; the check itself lives in ``fit``.
    """
    features, targets = make_classification(chunks=50)
    unsupported = RandomForestClassifier()

    with pytest.raises(ValueError, match="RandomForestClassifier"):
        fit(unsupported, features, targets)
def test_bag():
    """Check ``fit`` accepts a bag built with ``db.from_sequence`` and no target.

    After fitting a ``HashingVectorizer`` on the bag, transforming the same
    bag must give an output whose second dimension equals the vectorizer's
    ``n_features``.
    """
    bag = db.from_sequence(range(10), npartitions=2)

    vectorizer = fit(dask_ml.feature_extraction.text.HashingVectorizer(), bag, None)
    transformed = vectorizer.transform(bag)

    assert transformed.shape[1] == vectorizer.n_features