def test_gridsearch():
    """Grid-search over the pipeline's lamduh hyperparameter via dask_searchcv.

    Skipped entirely when dask_searchcv is not installed.
    """
    from sklearn.pipeline import make_pipeline

    dcv = pytest.importorskip('dask_searchcv')
    data, labels = make_classification(n_samples=100, n_features=5, chunksize=10)
    # Step name prefix follows sklearn's make_pipeline naming convention.
    param_grid = {'logisticregression__lamduh': [0.001, 0.01, 0.1, 0.5]}
    estimator = make_pipeline(DoNothingTransformer(), LogisticRegression())
    searcher = dcv.GridSearchCV(estimator, param_grid, cv=3)
    searcher.fit(data, labels)
def test_big(fit_intercept, is_sparse):
    """Fit/predict on a default-sized problem under the synchronous scheduler."""
    # Synchronous scheduler keeps the test deterministic and easy to debug.
    with dask.config.set(scheduler='synchronous'):
        data, target = make_classification(is_sparse=is_sparse)
        model = LogisticRegression(fit_intercept=fit_intercept)
        model.fit(data, target)
        model.predict(data)
        model.predict_proba(data)
        if fit_intercept:
            # An intercept must have been estimated when requested.
            assert model.intercept_ is not None
def test_fit(fit_intercept, is_sparse):
    """Smoke-test fit/predict/predict_proba on a small chunked dataset."""
    data, target = make_classification(
        n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse
    )
    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(data, target)
    model.predict(data)
    model.predict_proba(data)
def test_big(fit_intercept):
    """Fit/predict on a default-sized problem, forcing single-threaded execution.

    Checks that an intercept is estimated when ``fit_intercept`` is True.
    """
    import dask

    # BUG FIX: dask.set_options(get=dask.get) is a removed API (deprecated in
    # dask 0.18).  dask.config.set(scheduler='synchronous') is the supported
    # replacement and matches the sibling tests in this file.
    with dask.config.set(scheduler='synchronous'):
        X, y = make_classification()
        lr = LogisticRegression(fit_intercept=fit_intercept)
        lr.fit(X, y)
        lr.predict(X)
        lr.predict_proba(X)
        if fit_intercept:
            assert lr.intercept_ is not None
def test_big(fit_intercept, is_sparse, is_cupy):
    """Fit/predict under the synchronous scheduler, optionally on CuPy arrays.

    The CuPy path is skipped when cupy is not installed, and is only taken
    for dense input (sparse + cupy is not exercised).
    """
    with dask.config.set(scheduler='synchronous'):
        data, target = make_classification(is_sparse=is_sparse)
        if is_cupy and not is_sparse:
            cupy = pytest.importorskip('cupy')
            data, target = to_dask_cupy_array_xy(data, target, cupy)
        model = LogisticRegression(fit_intercept=fit_intercept)
        model.fit(data, target)
        model.predict(data)
        model.predict_proba(data)
        if fit_intercept:
            # An intercept must have been estimated when requested.
            assert model.intercept_ is not None
def test_fit(fit_intercept, is_sparse, is_cupy):
    """Smoke-test fit/predict/predict_proba, optionally converting to CuPy.

    The CuPy conversion is skipped when cupy is not installed, and is only
    attempted for dense input.
    """
    data, target = make_classification(
        n_samples=100, n_features=5, chunksize=10, is_sparse=is_sparse
    )
    if is_cupy and not is_sparse:
        cupy = pytest.importorskip('cupy')
        data, target = to_dask_cupy_array_xy(data, target, cupy)
    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(data, target)
    model.predict(data)
    model.predict_proba(data)
import dask.dataframe as dd
import dask.datasets as ds
import time
from dask_ml.linear_model import LogisticRegression
from dask_glm.datasets import make_classification

# --- Benchmark 1: dask_ml LogisticRegression on the default scheduler ------
X, y = make_classification(n_samples=1000)
t = time.time()
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)
lr.predict_proba(X)
# est.score(X, y)
print('\nTime dask_ml: ' + str(time.time() - t))

# --- Benchmark 2: parallelize scikit-learn directly on a Dask cluster ------
from dask.distributed import Client
# BUG FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23; parallel_backend must be imported from joblib directly.
from joblib import parallel_backend

client = Client('localhost:8786')  # Connect to a Dask Cluster
print(client)
# Scatter X and y up front so workers don't each re-serialize the data.
with parallel_backend('dask', scatter=[X, y]):
    # Your normal scikit-learn code here
    t = time.time()
    lr = LogisticRegression()
    lr.fit(X, y)
    lr.predict(X)
    lr.predict_proba(X)
    # est.score(X, y)
    print('\nTime dask_ml distributed: ' + str(time.time() - t))
def make_dask_arrs():
    """Build a small synthetic classification problem (300 samples, 6 features)."""
    data = make_classification(n_samples=300, n_features=6)
    return data
def test_in_pipeline():
    """LogisticRegression must fit as the final step of a sklearn Pipeline."""
    from sklearn.pipeline import make_pipeline

    data, labels = make_classification(n_samples=100, n_features=5, chunksize=10)
    estimator = make_pipeline(DoNothingTransformer(), LogisticRegression())
    estimator.fit(data, labels)
class DoNothingTransformer(object):
    """Identity transformer for pipeline tests: every method is a no-op."""

    def fit(self, X, y=None):
        # Nothing to learn; return self per the sklearn estimator contract.
        return self

    def transform(self, X, y=None):
        # Pass the input through untouched.
        return X

    def fit_transform(self, X, y=None):
        return X

    def get_params(self, deep=True):
        # No hyperparameters to expose.
        return {}


# Module-level fixture data shared by tests below.
X, y = make_classification()


def test_lr_init(solver):
    """Constructing the estimator must not raise for any supported solver."""
    LogisticRegression(solver=solver)


@pytest.mark.parametrize('fit_intercept', [True, False])
def test_fit(fit_intercept):
    """Smoke-test fit/predict/predict_proba on a small chunked dataset."""
    data, labels = make_classification(n_samples=100, n_features=5, chunksize=10)
    model = LogisticRegression(fit_intercept=fit_intercept)
    model.fit(data, labels)
    model.predict(data)
    model.predict_proba(data)