def test_basic(self):
    """Fit both PolynomialFeatures variants and compare the fitted state.

    ``dpp.PolynomialFeatures`` is fit on ``X`` and ``spp.PolynomialFeatures``
    on ``X.compute()``; the ``_transformer`` attribute of the former is then
    compared against the latter.
    """
    lazy_poly = dpp.PolynomialFeatures()
    lazy_poly.fit(X)

    eager_poly = spp.PolynomialFeatures()
    eager_poly.fit(X.compute())

    assert_estimator_equal(lazy_poly._transformer, eager_poly)
def test_basic(self):
    """Fit both MinMaxScaler variants and compare the fitted state.

    ``dpp.MinMaxScaler`` is fit on ``X`` and ``spp.MinMaxScaler`` on
    ``X.compute()``. ``n_samples_seen_`` is left out of the comparison.
    """
    lazy_scaler = dpp.MinMaxScaler()
    lazy_scaler.fit(X)

    eager_scaler = spp.MinMaxScaler()
    eager_scaler.fit(X.compute())

    assert_estimator_equal(lazy_scaler, eager_scaler, exclude='n_samples_seen_')
def test_basic(self):
    """Fit both StandardScaler variants and compare the fitted state.

    ``dpp.StandardScaler`` is fit on ``X`` and ``spp.StandardScaler`` on
    ``X.compute()``; all attributes take part in the comparison.
    """
    lazy_scaler = dpp.StandardScaler()
    lazy_scaler.fit(X)

    eager_scaler = spp.StandardScaler()
    eager_scaler.fit(X.compute())

    assert_estimator_equal(lazy_scaler, eager_scaler)
def test_basic(self, single_chunk_classification):
    """Compare ``nn.ParitalMLPClassifier.fit`` with ``MLPClassifier.partial_fit``.

    Both estimators use ``random_state=0`` and the class labels ``[0, 1]``;
    the partial estimator receives the labels at construction time, the
    reference estimator at ``partial_fit`` time.
    """
    features, target = single_chunk_classification

    partial_clf = nn.ParitalMLPClassifier(classes=[0, 1], random_state=0)
    reference_clf = nn_.MLPClassifier(random_state=0)

    partial_clf.fit(features, target)
    reference_clf.partial_fit(features, target, classes=[0, 1])

    assert_estimator_equal(partial_clf, reference_clf)
def test_basic(self, single_chunk_classification):
    """Compare ``nn.ParitalMLPRegressor.fit`` with ``MLPRegressor.partial_fit``.

    Both estimators are seeded with ``random_state=0`` and trained on the
    same ``(X, y)`` pair taken from the fixture.
    """
    features, target = single_chunk_classification

    partial_reg = nn.ParitalMLPRegressor(random_state=0)
    reference_reg = nn_.MLPRegressor(random_state=0)

    partial_reg.fit(features, target)
    reference_reg.partial_fit(features, target)

    assert_estimator_equal(partial_reg, reference_reg)
def test_basic(self):
    """Fit both LabelEncoder variants and compare the fitted state.

    ``dpp.LabelEncoder`` is fit on ``y`` and ``spp.LabelEncoder`` on
    ``y.compute()``.
    """
    lazy_encoder = dpp.LabelEncoder()
    lazy_encoder.fit(y)

    eager_encoder = spp.LabelEncoder()
    eager_encoder.fit(y.compute())

    assert_estimator_equal(lazy_encoder, eager_encoder)
def test_hashing_vectorizer(container):
    """Check the dask-ml HashingVectorizer against scikit-learn's.

    ``container`` selects how the documents are presented to the dask-ml
    vectorizer: ``"series"`` and ``"array"`` go through ``to_dataframe`` and
    pick the ``"text"`` column (``"array"`` additionally takes ``.values``);
    any other value leaves the bag produced by ``db.from_sequence`` untouched.
    """
    docs = db.from_sequence(JUNK_FOOD_DOCS, npartitions=2)
    if container in ("series", "array"):
        docs = docs.to_dataframe(columns=["text"])["text"]
        if container == "array":
            docs = docs.values

    sk_vectorizer = sklearn.feature_extraction.text.HashingVectorizer()
    dask_vectorizer = dask_ml.feature_extraction.text.HashingVectorizer()

    sk_output = sk_vectorizer.fit_transform(docs.compute())
    dask_output = dask_vectorizer.fit_transform(docs)

    assert_estimator_equal(sk_vectorizer, dask_vectorizer)

    # The lazy output is a dask array whose first block is a CSR matrix.
    assert isinstance(dask_output, da.Array)
    first_block = dask_output.blocks[0].compute()
    assert isinstance(first_block, scipy.sparse.csr_matrix)

    # Densify both sides before comparing element-wise.
    densified = dask_output.map_blocks(
        lambda block: block.toarray(), dtype=dask_output.dtype
    )
    reference = sk_output.toarray()

    # TODO: use dask.utils.assert_eq
    # Currently this fails chk_dask, as we end up with an integer key in the
    # dask graph.
    np.testing.assert_array_equal(densified, reference)
def test_basic(self, Xl_blobs_easy):
    """Compare ``DKKMeans`` and ``SKKMeans`` fit on the same blobs.

    Attributes that depend on cluster ordering or iteration count are
    excluded from the generic comparison and checked individually after the
    cluster centers of both estimators have been put into a common order.
    """
    X, _ = Xl_blobs_easy

    # make it super easy to cluster
    dask_km = DKKMeans(n_clusters=3, random_state=0)
    sk_km = SKKMeans(n_clusters=3, random_state=0)
    dask_km.fit(X)
    sk_km.fit(X)

    order_dependent = ["n_iter_", "inertia_", "cluster_centers_", "labels_"]
    assert_estimator_equal(dask_km, sk_km, exclude=order_dependent)
    assert abs(dask_km.inertia_ - sk_km.inertia_) < 0.01

    # order is arbitrary, so align first
    dask_order = np.argsort(dask_km.cluster_centers_, 0)[:, 0]
    sk_order = np.argsort(sk_km.cluster_centers_, 0)[:, 0]
    dask_centers = dask_km.cluster_centers_[dask_order]
    sk_centers = sk_km.cluster_centers_[sk_order]
    np.testing.assert_allclose(dask_centers, sk_centers, rtol=1e-3)

    # Relabel the scikit-learn labels before comparing them with the dask ones.
    relabelled = replace(sk_km.labels_, [0, 1, 2], dask_order[sk_order])
    relabelled = relabelled.astype(sk_km.labels_.dtype)
    assert_eq(dask_km.labels_.compute(), relabelled)
    assert dask_km.n_iter_

    # this is hacky
    sk_km.cluster_centers_ = sk_centers
    dask_km.cluster_centers_ = dask_centers
    assert_eq(dask_km.transform(X), sk_km.transform(X), rtol=1e-3)

    dask_pred = dask_km.predict(X)
    sk_pred = sk_km.predict(X)
    assert_eq(dask_pred.compute(), sk_pred)
def test_basic(self, single_chunk_regression):
    """Compare ``PartialPassiveAggressiveRegressor.fit`` with ``partial_fit``.

    The reference ``PassiveAggressiveRegressor`` is trained via
    ``partial_fit`` on the result of ``dask.compute(X, y)``.
    ``loss_function_`` is left out of the comparison.
    """
    features, target = single_chunk_regression

    partial_reg = lm.PartialPassiveAggressiveRegressor(
        random_state=0, max_iter=100, tol=1e-3
    )
    reference_reg = lm_.PassiveAggressiveRegressor(
        random_state=0, max_iter=100, tol=1e-3
    )

    partial_reg.fit(features, target)
    reference_reg.partial_fit(*dask.compute(features, target))

    assert_estimator_equal(partial_reg, reference_reg, exclude=["loss_function_"])
def test_predict(kind):
    """Check ``ParallelPostFit`` predictions against the bare estimator.

    ``kind`` selects the container used at prediction time: ``"numpy"``
    computes ``X`` and ``y`` up front, ``"dask.dataframe"`` converts both via
    ``dd.from_dask_array``, and anything else keeps the output of
    ``make_classification(chunks=100)`` as is.
    """
    X, y = make_classification(chunks=100)

    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    wrap = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    )

    # Both estimators are trained on the computed data.
    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    # Every prediction flavour must agree between wrapper and bare estimator.
    for method_name in ("predict", "predict_proba", "predict_log_proba"):
        result = getattr(wrap, method_name)(X)
        expected = getattr(base, method_name)(X)
        assert_eq_ar(result, expected)
def test_basic(self, single_chunk_blobs):
    """Compare ``PartialMiniBatchKMeans.fit`` with ``MiniBatchKMeans.partial_fit``.

    Both use three clusters and ``random_state=0``; ``random_state_`` is left
    out of the comparison. The ``y`` half of the fixture is not used.
    """
    features, _ = single_chunk_blobs

    partial_km = cluster.PartialMiniBatchKMeans(n_clusters=3, random_state=0)
    reference_km = cluster_.MiniBatchKMeans(n_clusters=3, random_state=0)

    partial_km.fit(features)
    reference_km.partial_fit(features)

    assert_estimator_equal(partial_km, reference_km, exclude=['random_state_'])
def test_basic(self):
    """Fit both StandardScaler variants and compare the fitted state.

    ``dpp.StandardScaler`` is fit on ``X`` and ``spp.StandardScaler`` on
    ``X.compute()``. ``n_samples_seen_`` is left out of the comparison.
    """
    lazy_scaler = dpp.StandardScaler()
    lazy_scaler.fit(X)

    eager_scaler = spp.StandardScaler()
    eager_scaler.fit(X.compute())

    assert_estimator_equal(lazy_scaler, eager_scaler, exclude="n_samples_seen_")
def test_incremental_basic(scheduler, xy_classification):
    """Check ``Incremental`` against a manual chunk-by-chunk ``partial_fit``.

    An ``SGDClassifier`` wrapped in ``Incremental`` is fit on ``(X, y)`` while
    a clone of it is fed every chunk slice of ``X`` by hand. Coefficients,
    fitted attributes, predictions and scores are then compared, and the same
    attribute check is repeated for ``Incremental.partial_fit``.
    """
    X, y = xy_classification

    with scheduler() as (s, [a, b]):
        wrapped_est = SGDClassifier(random_state=0, tol=1e-3)
        manual_est = clone(wrapped_est)

        incremental = Incremental(wrapped_est)
        fitted = incremental.fit(X, y, classes=[0, 1])

        # Reference: train the clone on each block of X in turn.
        for block_slice in da.core.slices_from_chunks(X.chunks):
            manual_est.partial_fit(
                X[block_slice], y[block_slice[0]], classes=[0, 1]
            )

        assert fitted is incremental
        assert isinstance(fitted.estimator.coef_, np.ndarray)
        np.testing.assert_array_almost_equal(
            fitted.estimator.coef_, manual_est.coef_
        )
        assert_estimator_equal(
            incremental.estimator, manual_est, exclude=['loss_function_']
        )

        # Predict
        predicted = incremental.predict(X)
        predicted_ref = manual_est.predict(X)
        assert isinstance(predicted, da.Array)
        assert_eq(predicted, predicted_ref)

        # score
        score = incremental.score(X, y)
        score_ref = manual_est.score(X, y)
        # assert isinstance(result, da.Array)
        assert_eq(score, score_ref)

        # A fresh wrapper trained through partial_fit must match as well.
        incremental = Incremental(SGDClassifier(random_state=0, tol=1e-3))
        incremental.partial_fit(X, y, classes=[0, 1])
        assert_estimator_equal(
            incremental.estimator, manual_est, exclude=['loss_function_']
        )
def test_basic_dataframe(sparse, method, dask_data, dtype):
    """Check the dask-ml OneHotEncoder on dataframe input against scikit-learn.

    ``method == "fit"`` exercises ``fit`` followed by ``transform``; any other
    value exercises ``fit_transform``. The scikit-learn encoder works on the
    module-level ``df``, the dask-ml encoder on ``dask_data``.
    """
    sk_encoder = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
    dask_encoder = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)

    if method != "fit":
        expected = sk_encoder.fit_transform(df)
        result = dask_encoder.fit_transform(dask_data)
    else:
        sk_encoder.fit(df)
        dask_encoder.fit(dask_data)
        expected = sk_encoder.transform(df)
        result = dask_encoder.transform(dask_data)

    # Attributes left out of the estimator comparison.
    ignored_attrs = {
        "n_values_",
        "feature_indices_",
        "active_features_",
        "dtypes_",
        "drop_idx_",
    }
    assert_estimator_equal(sk_encoder, dask_encoder, exclude=ignored_attrs)

    assert isinstance(result, type(dask_data))
    assert len(result.columns) == expected.shape[1]

    expected_dtype = dtype
    if sparse:
        if PANDAS_VERSION >= packaging.version.parse("0.24.0"):
            # pandas sparse ExtensionDtype interface
            expected_dtype = pd.SparseDtype(dtype, dtype(0))
    assert (result.dtypes == expected_dtype).all()

    da.utils.assert_eq(result.values, expected)
def test_basic_array(sparse, method, categories):
    """Check the dask-ml OneHotEncoder on array input against scikit-learn.

    ``method == "fit"`` exercises ``fit`` followed by ``transform``; any other
    value exercises ``fit_transform``. The scikit-learn encoder works on the
    module-level ``X``, the dask-ml encoder on ``dX``.
    """
    sk_encoder = sklearn.preprocessing.OneHotEncoder(
        categories=categories, sparse=sparse
    )
    dask_encoder = dask_ml.preprocessing.OneHotEncoder(
        categories=categories, sparse=sparse
    )

    if method != "fit":
        expected = sk_encoder.fit_transform(X)
        result = dask_encoder.fit_transform(dX)
    else:
        sk_encoder.fit(X)
        dask_encoder.fit(dX)
        expected = sk_encoder.transform(X)
        result = dask_encoder.transform(dX)

    ignored_attrs = {"n_values_", "feature_indices_", "active_features_", "dtypes_"}
    assert_estimator_equal(sk_encoder, dask_encoder, exclude=ignored_attrs)

    assert isinstance(result, da.Array)

    # can't use assert_eq since we're apparently making bad graphs
    # See TODO in `transform`.
    assert result.shape == expected.shape
    assert result.dtype == expected.dtype

    if sparse:
        assert scipy.sparse.issparse(result.blocks[0].compute())
        # Densify block-wise so the values can be compared below.
        result = result.map_blocks(lambda x: x.toarray(), dtype="f8")

    da.utils.assert_eq(result.compute(), expected)
def test_basic(self, single_chunk_classification):
    """Compare ``PartialPerceptron.fit`` with ``Perceptron.partial_fit``.

    Both estimators use ``max_iter=1000`` and ``tol=1e-3`` and the class
    labels ``[0, 1]``; the learned ``coef_`` arrays must agree.
    """
    # Local import so this block does not depend on the file's import section.
    import numpy as np

    X, y = single_chunk_classification

    a = PartialPerceptron(classes=[0, 1], max_iter=1000, tol=1e-3)
    b = Perceptron(max_iter=1000, tol=1e-3)

    a.fit(X, y)
    b.partial_fit(X, y, classes=[0, 1])

    # NOTE(review): this used to call assert_estimator_equal(a.coef_, b.coef_).
    # Every other call site hands that helper estimator objects; given plain
    # arrays it presumably has no fitted attributes to compare, so the check
    # was likely vacuous -- confirm against the helper. Compare the
    # coefficient values directly instead.
    np.testing.assert_allclose(a.coef_, b.coef_)
def test_basic_dataframe(sparse, method, dask_data, dtype):
    """Check the dask-ml OneHotEncoder on dataframe input against scikit-learn.

    ``method == "fit"`` exercises ``fit`` followed by ``transform``; any other
    value exercises ``fit_transform``. When ``sparse`` is truthy the result
    columns are expected to carry a ``pd.SparseDtype``.
    """
    sk_encoder = sklearn.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)
    dask_encoder = dask_ml.preprocessing.OneHotEncoder(sparse=sparse, dtype=dtype)

    if method != "fit":
        expected = sk_encoder.fit_transform(df)
        result = dask_encoder.fit_transform(dask_data)
    else:
        sk_encoder.fit(df)
        dask_encoder.fit(dask_data)
        expected = sk_encoder.transform(df)
        result = dask_encoder.transform(dask_data)

    # Attributes left out of the estimator comparison.
    ignored_attrs = {
        "n_values_", "feature_indices_", "active_features_", "dtypes_"
    }
    assert_estimator_equal(sk_encoder, dask_encoder, exclude=ignored_attrs)

    assert isinstance(result, type(dask_data))
    assert len(result.columns) == expected.shape[1]

    expected_dtype = dtype
    if sparse:
        expected_dtype = pd.SparseDtype(dtype, dtype(0))
    assert (result.dtypes == expected_dtype).all()

    da.utils.assert_eq(result.values, expected)
def test_fit(data):
    """Fit both SimpleImputer variants and compare the fitted state.

    The scikit-learn imputer is fit on the module-level ``X``, the dask-ml
    imputer on ``data``.
    """
    sk_imputer = sklearn.impute.SimpleImputer()
    sk_imputer.fit(X)

    dask_imputer = dask_ml.impute.SimpleImputer()
    dask_imputer.fit(data)

    assert_estimator_equal(sk_imputer, dask_imputer)
def test_basic(self):
    """Fit both LabelEncoder variants and compare the fitted state.

    ``dpp.LabelEncoder`` is fit on ``y`` and ``spp.LabelEncoder`` on
    ``y.compute()``. ``dtype_`` is left out of the comparison.
    """
    lazy_encoder = dpp.LabelEncoder()
    lazy_encoder.fit(y)

    eager_encoder = spp.LabelEncoder()
    eager_encoder.fit(y.compute())

    assert_estimator_equal(lazy_encoder, eager_encoder, exclude={"dtype_"})
def test_basic(self, single_chunk_regression):
    """Compare ``PartialSGDRegressor.fit`` with ``SGDRegressor.partial_fit``.

    Both estimators share ``random_state=0``, ``max_iter=1000`` and
    ``tol=1e-3`` and are trained on the same ``(X, y)`` pair.
    """
    features, target = single_chunk_regression

    partial_reg = lm.PartialSGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
    reference_reg = lm_.SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)

    partial_reg.fit(features, target)
    reference_reg.partial_fit(features, target)

    assert_estimator_equal(partial_reg, reference_reg)
def test_array_transform(self):
    """Check ``fit_transform`` of both PolynomialFeatures variants.

    ``dpp.PolynomialFeatures`` transforms ``X`` and must return a dask
    collection; ``spp.PolynomialFeatures`` transforms ``X.compute()``. Fitted
    state and transformed values are compared.
    """
    lazy_poly = dpp.PolynomialFeatures()
    eager_poly = spp.PolynomialFeatures()

    lazy_out = lazy_poly.fit_transform(X)
    eager_out = eager_poly.fit_transform(X.compute())

    assert_estimator_equal(lazy_poly, eager_poly)
    assert dask.is_dask_collection(lazy_out)
    assert_eq_ar(lazy_out, eager_out)
def test_basic():
    """Fit ``dd.PCA`` on ``dX`` and ``sd.PCA`` on ``X`` and compare them.

    ``components_`` is left out of the generic comparison and checked
    separately after both sides go through ``flip_vector_signs``.
    """
    dask_pca = dd.PCA()
    dask_pca.fit(dX)

    sk_pca = sd.PCA()
    sk_pca.fit(X)

    assert_estimator_equal(dask_pca, sk_pca, exclude=["components_"])

    dask_components = flip_vector_signs(dask_pca.components_, 1)
    sk_components = flip_vector_signs(sk_pca.components_, 1)
    np.testing.assert_allclose(dask_components, sk_components)
def test_transforms_other():
    """Run both HashingVectorizer variants directly on ``JUNK_FOOD_DOCS``.

    Fitted state and the dense form of both outputs must match.
    """
    sk_vectorizer = sklearn.feature_extraction.text.HashingVectorizer()
    dask_vectorizer = dask_ml.feature_extraction.text.HashingVectorizer()

    sk_output = sk_vectorizer.fit_transform(JUNK_FOOD_DOCS)
    dask_output = dask_vectorizer.fit_transform(JUNK_FOOD_DOCS)

    assert_estimator_equal(sk_vectorizer, dask_vectorizer)

    sk_dense = sk_output.toarray()
    dask_dense = dask_output.toarray()
    np.testing.assert_array_equal(sk_dense, dask_dense)
def test_input_types(self):
    """Fit PolynomialFeatures on several representations of ``df``.

    The inputs are ``df``, ``df.values``, ``df.compute()`` and
    ``df.compute().values``; each pair of fit results is passed to
    ``assert_estimator_equal``.
    """
    a = dpp.PolynomialFeatures()
    b = spp.PolynomialFeatures()

    # NOTE(review): if ``fit`` returns ``self``, the first three checks compare
    # ``a`` with itself -- confirm this is the intended coverage.
    left = a.fit(df)
    right = a.fit(df.compute())
    assert_estimator_equal(left, right)

    left = a.fit(df)
    right = a.fit(df.compute().values)
    assert_estimator_equal(left, right)

    left = a.fit(df.values)
    right = a.fit(df.compute().values)
    assert_estimator_equal(left, right)

    # Same estimator ``a`` against the other implementation ``b``.
    left = a.fit(df)
    right = b.fit(df.compute())
    assert_estimator_equal(left, right)

    left = a.fit(df)
    right = b.fit(df.compute().values)
    assert_estimator_equal(left, right)
def test_fit(data):
    """Fit both SimpleImputer variants and compare the fitted state.

    ``statistics_`` is left out of the generic comparison and checked
    separately after converting the dask-ml value with ``np.asarray``.
    """
    sk_imputer = sklearn.impute.SimpleImputer()
    sk_imputer.fit(X)

    dask_imputer = dask_ml.impute.SimpleImputer()
    dask_imputer.fit(data)

    assert_estimator_equal(sk_imputer, dask_imputer, exclude=["statistics_"])

    dask_statistics = np.asarray(dask_imputer.statistics_)
    np.testing.assert_array_almost_equal(sk_imputer.statistics_, dask_statistics)
def test_fit(self):
    """Fit both RobustScaler variants on shifted classification data.

    The fitted estimators are compared with a relative tolerance of 0.2.
    """
    lazy_scaler = dpp.RobustScaler()
    eager_scaler = spp.RobustScaler()

    # bigger data to make percentile more reliable
    # and not centered around 0 to make rtol work
    features, _ = make_classification(n_samples=1000, chunks=200, random_state=0)
    shifted = features + 3

    lazy_scaler.fit(shifted)
    eager_scaler.fit(shifted.compute())

    assert_estimator_equal(lazy_scaler, eager_scaler, rtol=0.2)
def test_basic(self, single_chunk_classification):
    """Compare ``PartialSGDClassifier.fit`` with ``SGDClassifier.partial_fit``.

    The reference estimator is trained on the result of
    ``dask.compute(X, y)`` with the class labels ``[0, 1]``.
    """
    features, target = single_chunk_classification

    partial_clf = lm.PartialSGDClassifier(
        classes=[0, 1], random_state=0, max_iter=1000, tol=1e-3
    )
    reference_clf = lm_.SGDClassifier(random_state=0, max_iter=1000, tol=1e-3)

    partial_clf.fit(features, target)
    reference_clf.partial_fit(*dask.compute(features, target), classes=[0, 1])

    # NOTE(review): ``exclude`` is not defined in this function; presumably a
    # module-level collection of attribute names -- TODO confirm.
    assert_estimator_equal(partial_clf, reference_clf, exclude=exclude)
def test_basic(self, single_chunk_classification):
    """Compare ``PartialPassiveAggressiveClassifier.fit`` with ``partial_fit``.

    Both estimators share ``random_state=0``, ``max_iter=100`` and
    ``tol=1e-3``; ``loss_function_`` is left out of the comparison.
    """
    features, target = single_chunk_classification

    partial_clf = lm.PartialPassiveAggressiveClassifier(
        classes=[0, 1], random_state=0, max_iter=100, tol=1e-3
    )
    reference_clf = lm_.PassiveAggressiveClassifier(
        random_state=0, max_iter=100, tol=1e-3
    )

    partial_clf.fit(features, target)
    reference_clf.partial_fit(features, target, classes=[0, 1])

    assert_estimator_equal(partial_clf, reference_clf, exclude=['loss_function_'])
def test_fit_convert(data):
    """Run both SimpleImputer variants on inputs cast to ``str`` then ``object``.

    The dask-ml result must have the same container type as ``data``;
    dataframe results are reduced to ``.values`` before the value comparison.
    """
    sk_imputer = sklearn.impute.SimpleImputer()
    dask_imputer = dask_ml.impute.SimpleImputer()

    sk_input = X.astype(str).astype(object)
    expected = sk_imputer.fit_transform(sk_input)

    dask_input = data.astype(str).astype(object)
    result = dask_imputer.fit_transform(dask_input)

    assert_estimator_equal(sk_imputer, dask_imputer)
    assert isinstance(result, type(data))

    frame_types = (pd.DataFrame, dd.DataFrame)
    if isinstance(data, frame_types):
        result = result.values

    da.utils.assert_eq(result, expected)
def test_fit_constant(data):
    """Run both SimpleImputer variants with the ``"constant"`` strategy.

    Both imputers use ``fill_value=-999.0``. The dask-ml result must have the
    same container type as ``data``; dataframe results are reduced to
    ``.values`` before the value comparison.
    """
    sk_imputer = sklearn.impute.SimpleImputer(
        strategy="constant", fill_value=-999.0
    )
    dask_imputer = dask_ml.impute.SimpleImputer(
        strategy="constant", fill_value=-999.0
    )

    expected = sk_imputer.fit_transform(X)
    result = dask_imputer.fit_transform(data)

    assert_estimator_equal(sk_imputer, dask_imputer)
    assert isinstance(result, type(data))

    frame_types = (pd.DataFrame, dd.DataFrame)
    if isinstance(data, frame_types):
        result = result.values

    da.utils.assert_eq(result, expected)