def test_regressor_sg_train_mg_predict(datatype, keys, data_size, fit_intercept, client):
    """Fit a single-GPU LinearRegression locally, then predict distributed
    via dask-ml's ParallelPostFit and check the round trip.

    Just a basic-compatibility check; see test_pickle.py for more
    extensive testing of single-GPU model serialization.
    """
    n_rows, n_cols, n_informative = data_size
    X_dask, y_dask, _ = make_dataset(datatype, n_rows, n_cols, n_informative)

    # Train on materialized (single-GPU) data.
    sg_model = cuml.linear_model.LinearRegression(
        fit_intercept=fit_intercept)
    sg_model.fit(X_dask.compute(), y_dask.compute())

    # Wrap for distributed prediction over the dask collection.
    wrapped = ParallelPostFit(estimator=sg_model)
    preds = wrapped.predict(X_dask).compute()

    assert isinstance(preds, cupy.ndarray)

    # Dataset should be fairly linear already, so the predictions should
    # be very close to the training targets.
    np.testing.assert_allclose(preds.get(), y_dask.compute().get(),
                               atol=1e-3, rtol=1e-3)
def test_multiclass():
    """ParallelPostFit matches the wrapped LogisticRegression on a
    3-class problem for predict, predict_proba, and predict_log_proba."""
    X_np, y_np = sklearn.datasets.make_classification(n_classes=3,
                                                      n_informative=4)
    X = da.from_array(X_np, chunks=50)
    y = da.from_array(y_np, chunks=50)

    clf = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs",
                           multi_class="auto"))
    clf.fit(*dask.compute(X, y))

    got = clf.predict(X)
    want = clf.estimator.predict(X)
    assert isinstance(got, da.Array)
    assert_eq_ar(got, want)

    got = clf.predict_proba(X)
    want = clf.estimator.predict_proba(X)
    assert isinstance(got, da.Array)
    assert_eq_ar(got, want)

    # predict_log_proba: value equality only (no laziness check).
    assert_eq_ar(clf.predict_log_proba(X),
                 clf.estimator.predict_log_proba(X))
def train(self, X_train: np.ndarray, y_train: np.ndarray,
          X_test: np.ndarray, y_test: np.ndarray,
          verbose: bool = True, optimize: bool = False):
    """Fit the wrapped classifier on preprocessed training data, then
    predict on the test split.

    Replaces ``self.classifier`` with the fitted ParallelPostFit wrapper;
    when ``optimize`` is True it is replaced again by the optimiser's
    resulting classifier.
    """
    prepared_train = self._preprocess_dataset(X_train)
    wrapper = ParallelPostFit(self.classifier, scoring='accuracy')
    self.classifier = wrapper.fit(prepared_train, y_train)

    prepared_test = self._preprocess_dataset(X_test)
    probabilities = self.classifier.predict_proba(prepared_test)
    predicted_labels = [
        self._predict_proba_to_label(p).value for p in probabilities
    ]

    if verbose:
        self.evaluate(predicted_labels, y_test,
                      classes=self.classifier.classes_)

    if optimize:
        optimised = ImageModelOptimiser(self).optimize(X_train, y_train)
        self.classifier = optimised.classifier
def test_multiclass():
    """ParallelPostFit agrees with the wrapped LogisticRegression on a
    3-class problem, across supported scikit-learn versions."""
    X, y = sklearn.datasets.make_classification(n_classes=3, n_informative=4)
    X = da.from_array(X, chunks=50)
    y = da.from_array(y, chunks=50)

    # multi_class="auto" only exists on scikit-learn >= 0.20.
    kwargs = {"multi_class": "auto"} if SK_GE_020 else {}
    clf = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs", **kwargs)
    )
    clf.fit(X, y)

    for method in ("predict", "predict_proba"):
        got = getattr(clf, method)(X)
        want = getattr(clf.estimator, method)(X)
        assert isinstance(got, da.Array)
        assert_eq_ar(got, want)
def test_predict(kind):
    """All predict-family methods of the wrapper agree with an
    identically-configured unwrapped LogisticRegression, for numpy,
    dask.array, and dask.dataframe inputs."""
    X, y = make_classification(chunks=100)
    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs")
    wrap = ParallelPostFit(
        LogisticRegression(random_state=0, n_jobs=1, solver="lbfgs"))

    # Both estimators are fit on the fully materialized data.
    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    for method in ("predict", "predict_proba", "predict_log_proba"):
        assert_eq_ar(getattr(wrap, method)(X), getattr(base, method)(X))
def test_it_works():
    """Smoke test: fit succeeds and post-fit methods return dask arrays."""
    model = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    model.fit(X, y)

    for lazy_output in (model.predict(X), model.predict_proba(X)):
        assert isinstance(lazy_output, da.Array)
def test_laziness():
    """score(..., compute=False) stays lazy and computes to a sane value."""
    model = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    model.fit(X, y)

    lazy_score = model.score(X, y, compute=False)
    assert dask.is_dask_collection(lazy_score)

    value = lazy_score.compute()
    assert 0 < value < 1
def test_predict(self, mock_load, mock_predict, mock_preprocess_ds):
    """predict() loads the pickled model once, preprocesses the input,
    and forwards the prepared features to the wrapped predictor."""
    mock_load.return_value = ParallelPostFit()
    prepared = [1, 2, 3]
    mock_preprocess_ds.return_value = prepared

    model = SKLinearImageModel(pkl_file='trained_models/hog_sklearn.pkl')
    model.predict(self.X)

    mock_load.assert_called_once()
    mock_predict.assert_called_once_with(prepared)
def test_auto_rechunk():
    """Prediction handles inputs whose feature axis is chunked."""
    model = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, n_features=20, chunks=100)
    # Split the feature axis too; the wrapper must rechunk internally.
    X = X.rechunk({0: 100, 1: 10})
    model.fit(X, y)

    assert model.predict(X).compute().shape == (1000,)
    assert model.predict_proba(X).compute().shape == (1000, 2)
    # Lazy score matches the eager, in-memory score.
    assert model.score(X, y) == model.score(X.compute(), y.compute())
def test_no_method_raises():
    """predict_proba on a wrapped regressor raises a clear AttributeError."""
    model = ParallelPostFit(LinearRegression())
    X, y = make_classification(chunks=50)
    model.fit(X, y)

    with pytest.raises(AttributeError) as excinfo:
        model.predict_proba(X)

    assert excinfo.match("The wrapped estimator (.|\n)* 'predict_proba' method.")
def test_transform_meta_override():
    """transform on a value-dependent model needs an explicit transform_meta."""
    X = pd.DataFrame({"cat_s": ["a", "b", "c", "d"]})
    dd_X = dd.from_pandas(X, npartitions=2)

    base = OneHotEncoder(sparse=False)
    base.fit(pd.DataFrame(X))

    # Without transform_meta, meta inference fails because the encoder's
    # output columns depend on the values observed in each partition.
    wrap = ParallelPostFit(base)
    with pytest.raises(ValueError):
        wrap.transform(dd_X)

    # Providing an explicit meta makes transform succeed.
    meta = np.array([[0, 0, 0, 0]], dtype=np.float64)
    wrap = ParallelPostFit(base, transform_meta=meta)
    assert_eq_ar(wrap.transform(dd_X), base.transform(X))
def test_predict_meta_override():
    """predict on a value-dependent model needs an explicit predict_meta."""
    X = pd.DataFrame({"c_0": [1, 2, 3, 4]})
    y = np.array([1, 2, 3, 4])

    base = CategoricalNB()
    base.fit(pd.DataFrame(X), y)

    dd_X = dd.from_pandas(X, npartitions=2)
    # Force a meta whose values differ from the training data.
    dd_X._meta = pd.DataFrame({"c_0": [5]})

    # Meta inference fails because the model is value dependent.
    wrap = ParallelPostFit(base)
    with pytest.raises(ValueError):
        wrap.predict(dd_X)

    # Success when providing an explicit meta over-ride.
    wrap = ParallelPostFit(base, predict_meta=np.array([1]))
    assert_eq_ar(wrap.predict(dd_X), base.predict(X))
def test_predict_correct_output_dtype():
    """Wrapped prediction preserves the estimator's output dtype."""
    X, y = make_classification(chunks=100)
    frame = dd.from_dask_array(X)

    regressor = LinearRegression(n_jobs=1)
    regressor.fit(X, y)
    wrapped = ParallelPostFit(regressor)

    eager = regressor.predict(frame.compute())
    lazy = wrapped.predict(frame)
    assert lazy.dtype == eager.dtype
def test_it_works():
    """Fit on in-memory data, then lazily predict/score over dask arrays."""
    model = ParallelPostFit(GradientBoostingClassifier())
    X, y = make_classification(n_samples=1000, chunks=100)
    X_np, y_np = dask.compute(X, y)
    model.fit(X_np, y_np)

    assert isinstance(model.predict(X), da.Array)
    assert isinstance(model.predict_proba(X), da.Array)

    # Score over the dask collections matches the plain estimator score.
    assert model.score(X, y) == model.estimator.score(X_np, y_np)
def test_sparse_inputs():
    """predict works when the dask array wraps scipy sparse chunks."""
    X = csr_matrix((3, 4))
    y = np.asarray([0, 0, 1], dtype=np.int32)

    estimator = SGDClassifier(tol=1e-3).fit(X, y)
    wrapped = ParallelPostFit(estimator)

    X_chunked = da.from_array(X, chunks=(1, 4))
    assert_eq_ar(wrapped.predict(X_chunked).compute(), estimator.predict(X))
def test_multiclass():
    """predict/predict_proba agree with the estimator on 3 classes."""
    X, y = make_classification(chunks=50, n_classes=3, n_informative=4)
    model = ParallelPostFit(LogisticRegression(random_state=0))
    model.fit(X, y)

    for method in ("predict", "predict_proba"):
        got = getattr(model, method)(X)
        assert isinstance(got, da.Array)
        assert_eq_ar(got, getattr(model.estimator, method)(X))
def test_transform(kind):
    """wrap.transform matches the underlying PCA's transform for numpy,
    dask.array, and dask.dataframe inputs."""
    X, y = make_classification(chunks=100)
    if kind == "numpy":
        X, y = dask.compute(X, y)
    elif kind == "dask.dataframe":
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = PCA(random_state=0)
    wrap = ParallelPostFit(PCA(random_state=0))

    base.fit(*dask.compute(X, y))
    wrap.fit(*dask.compute(X, y))

    assert_estimator_equal(wrap.estimator, base)

    # Fix: "result"/"expected" were previously swapped (base's output was
    # named result). assert_eq_ar is symmetric so behavior is unchanged;
    # this only restores consistency with the sibling predict tests.
    result = wrap.transform(X)
    expected = base.transform(*dask.compute(X))
    assert_eq_ar(result, expected)
def test_train(self, mock_preprocess_ds, mock_fit, mock_predict_proba):
    """train() fits on the preprocessed features and original labels,
    and runs predict_proba exactly once."""
    X_train, X_test, y_train, y_test = train_test_split(
        self.X,
        self.y,
        test_size=0.2,
        shuffle=True,
        random_state=42,
    )
    prepared = X_train * 0.2
    mock_preprocess_ds.return_value = prepared
    mock_fit.return_value = ParallelPostFit()

    model = SKLinearImageModel(pkl_file=None)
    model.train(X_train, y_train, X_test, y_test, verbose=False)

    fit_args = mock_fit.call_args[0]
    np.testing.assert_array_equal(prepared, fit_args[0])
    np.testing.assert_array_equal(y_train, fit_args[1])
    mock_predict_proba.assert_called_once()
def test_warning_on_dask_array_without_array_function():
    """Meta inference over an array type lacking __array_function__
    warns the user to supply an explicit *_meta argument."""
    X, y = make_classification(n_samples=10, n_features=2, chunks=10)
    clf = ParallelPostFit(GradientBoostingClassifier()).fit(X, y)

    class FakeArray:
        """Duck-typed array exposing only ndim/len/dtype/shape."""

        def __init__(self, value):
            self.value = value

        @property
        def ndim(self):
            return self.value.ndim

        @property
        def len(self):
            return self.value.len

        @property
        def dtype(self):
            return self.value.dtype

        @property
        def shape(self):
            return self.value.shape

    fake_dask_ar = da.from_array(FakeArray(np.zeros(shape=(2, 2))))
    fake_dask_ar._meta = FakeArray(np.zeros(shape=(0, 0)))

    with pytest.warns(
            UserWarning,
            match="provide explicit `predict_meta` to the dask_ml.wrapper"):
        clf.predict(fake_dask_ar)

    with pytest.warns(
            UserWarning,
            match=
            "provide explicit `predict_proba_meta` to the dask_ml.wrapper",
    ):
        clf.predict_proba(fake_dask_ar)
def test_predict(kind):
    """Wrapped predictions equal those of an identical unwrapped model,
    for numpy, dask.array, and dask.dataframe inputs."""
    X, y = make_classification(chunks=100)
    if kind == 'numpy':
        X, y = dask.compute(X, y)
    elif kind == 'dask.dataframe':
        X = dd.from_dask_array(X)
        y = dd.from_dask_array(y)

    base = LogisticRegression(random_state=0)
    wrap = ParallelPostFit(LogisticRegression(random_state=0))
    base.fit(X, y)
    wrap.fit(X, y)

    assert_estimator_equal(wrap.estimator, base)

    for method in ("predict", "predict_proba"):
        assert_eq_ar(getattr(wrap, method)(X), getattr(base, method)(X))
import numpy as np
import argparse

from joblib import load

# CLI: the only argument is the path to the CSV of matches to score.
parser = argparse.ArgumentParser()
parser.add_argument('--filename', type=str, dest='filename',
                    help='path to the dataset to be scored')
args = parser.parse_args()

if __name__ == "__main__":
    from distributed import Client, LocalCluster
    from dask_ml.wrappers import ParallelPostFit
    import dask.dataframe as dd

    # Spin up a local dask cluster for parallel prediction.
    cluster = LocalCluster()
    client = Client(cluster)

    # Load the pre-trained model and wrap it so predict runs in parallel.
    clf = load('wta-matches-model.joblib')
    clf = ParallelPostFit(clf)

    # Single feature: rank-point difference between winner and loser,
    # with missing rows dropped.
    matches = dd.read_csv(args.filename, assume_missing=True)
    point_diff = (matches.winner_rank_points -
                  matches.loser_rank_points).dropna()

    # NOTE(review): .compute() materializes the feature column before
    # predicting, so the prediction itself runs on an in-memory array
    # rather than being distributed — confirm this is intended.
    X_test = point_diff.compute().values[:, np.newaxis]
    y_test_pred = clf.predict(X_test)
    np.save("predictions.npy", y_test_pred)
def convert(self, sql: "org.apache.calcite.sql.SqlNode",
            context: "dask_sql.Context") -> DataContainer:
    """Handle a CREATE EXPERIMENT statement.

    Runs either a hyperparameter search (``model_class`` plus
    ``experiment_class``) or an AutoML run (``automl_class``) over the
    result of the statement's SELECT, registers the best fitted model
    under the experiment name, registers the result table as an
    experiment, and returns the results as a DataContainer.

    Raises RuntimeError if the experiment already exists (unless IF NOT
    EXISTS / OR REPLACE was given) and ValueError on missing/unimportable
    parameters or a missing target column.
    """
    select = sql.getSelect()
    schema_name, experiment_name = context.fqn(sql.getExperimentName())
    kwargs = convert_sql_kwargs(sql.getKwargs())

    # Honor IF NOT EXISTS (silent no-op) and OR REPLACE (overwrite).
    if experiment_name in context.schema[schema_name].experiments:
        if sql.getIfNotExists():
            return
        elif not sql.getReplace():
            raise RuntimeError(
                f"A experiment with the name {experiment_name} is already present."
            )
    logger.debug(
        f"Creating Experiment {experiment_name} from query {select} with options {kwargs}"
    )

    model_class = None
    automl_class = None
    experiment_class = None
    if "model_class" in kwargs:
        model_class = kwargs.pop("model_class")
        # when model class was provided, must provide experiment_class
        # also for tuning
        if "experiment_class" not in kwargs:
            raise ValueError(
                f"Parameters must include a 'experiment_class' parameter for tuning {model_class}."
            )
        experiment_class = kwargs.pop("experiment_class")
    elif "automl_class" in kwargs:
        automl_class = kwargs.pop("automl_class")
    else:
        raise ValueError(
            "Parameters must include a 'model_class' or 'automl_class' parameter."
        )

    # Optional knobs; everything left in kwargs after these pops is unused.
    target_column = kwargs.pop("target_column", "")
    tune_fit_kwargs = kwargs.pop("tune_fit_kwargs", {})
    parameters = kwargs.pop("tune_parameters", {})
    experiment_kwargs = kwargs.pop("experiment_kwargs", {})
    automl_kwargs = kwargs.pop("automl_kwargs", {})
    logger.info(parameters)

    # Materialize the training data from the statement's SELECT.
    select_query = context._to_sql_string(select)
    training_df = context.sql(select_query)

    if not target_column:
        raise ValueError(
            "Unsupervised Algorithm cannot be tuned Automatically,"
            "Consider providing 'target column'")

    non_target_columns = [
        col for col in training_df.columns if col != target_column
    ]
    X = training_df[non_target_columns]
    y = training_df[target_column]

    if model_class and experiment_class:
        try:
            ModelClass = import_class(model_class)
        except ImportError:
            raise ValueError(
                f"Can not import model {model_class}. Make sure you spelled it correctly and have installed all packages."
            )
        try:
            ExperimentClass = import_class(experiment_class)
        except ImportError:
            raise ValueError(
                f"Can not import tuner {experiment_class}. Make sure you spelled it correctly and have installed all packages."
            )
        try:
            from dask_ml.wrappers import ParallelPostFit
        except ImportError:  # pragma: no cover
            raise ValueError(
                "dask_ml must be installed to use automl and tune hyperparameters"
            )

        # Run the search and register the best estimator, wrapped so its
        # predict runs in parallel over dask collections.
        model = ModelClass()
        search = ExperimentClass(model, {**parameters}, **experiment_kwargs)
        logger.info(tune_fit_kwargs)
        search.fit(X, y, **tune_fit_kwargs)
        df = pd.DataFrame(search.cv_results_)
        df["model_class"] = model_class
        context.register_model(
            experiment_name,
            ParallelPostFit(estimator=search.best_estimator_),
            X.columns,
            schema_name=schema_name,
        )

    if automl_class:
        try:
            AutoMLClass = import_class(automl_class)
        except ImportError:
            raise ValueError(
                f"Can not import automl model {automl_class}. Make sure you spelled it correctly and have installed all packages."
            )
        try:
            from dask_ml.wrappers import ParallelPostFit
        except ImportError:  # pragma: no cover
            raise ValueError(
                "dask_ml must be installed to use automl and tune hyperparameters"
            )

        automl = AutoMLClass(**automl_kwargs)
        # should be avoided if data doesn't fit in memory: the AutoML
        # library gets fully materialized (in-memory) data.
        automl.fit(X.compute(), y.compute())
        df = (pd.DataFrame(
            automl.evaluated_individuals_).T.reset_index().rename(
                {"index": "models"}, axis=1))
        context.register_model(
            experiment_name,
            ParallelPostFit(estimator=automl.fitted_pipeline_),
            X.columns,
            schema_name=schema_name,
        )

    # Register the per-candidate results table and hand it back as the
    # statement's result set (single-partition dask dataframe).
    context.register_experiment(experiment_name,
                                experiment_results=df,
                                schema_name=schema_name)
    cc = ColumnContainer(df.columns)
    dc = DataContainer(dd.from_pandas(df, npartitions=1), cc)
    return dc
def predict_xr(
    model,
    input_xr,
    chunk_size=None,
    persist=True,
    proba=False,
    clean=False,
    return_input=False,
):
    """
    Using dask-ml ParallelPostFit(), runs the parallel
    predict and predict_proba methods of sklearn estimators.
    Useful for running predictions on larger-than-RAM datasets.

    Last modified: September 2020

    Parameters
    ----------
    model : scikit-learn model or compatible object
        Must have a .predict() method that takes numpy arrays.
    input_xr : xarray.DataArray or xarray.Dataset.
        Must have dimensions 'x' and 'y'
    chunk_size : int
        The dask chunk size to use on the flattened array. If this
        is left as None, then the chunks size is inferred from the
        .chunks() method on the `input_xr`
    persist : bool
        If True, and proba=True, then 'input_xr' data will be
        loaded into distributed memory. This will ensure data is not
        loaded twice for the prediction of probabilities, but this will
        only work if the data is not larger than RAM.
    proba : bool
        If True, predict probabilities. This only applies if the
        model has a .predict_proba() method
    clean : bool
        If True, remove Infs and NaNs from input and output arrays
    return_input : bool
        If True, then the data variables in the 'input_xr' dataset will
        be appended to the output xarray dataset.

    Returns
    ----------
    output_xr : xarray.Dataset
        An xarray.Dataset containing the prediction output from model
        with input_xr as input, if proba=True then dataset will also
        contain the prediction probabilities. Has the same
        spatiotemporal structure as input_xr.
    """
    # Default chunk size: one full input chunk's worth of pixels.
    if chunk_size is None:
        chunk_size = int(input_xr.chunks["x"][0]) * int(
            input_xr.chunks["y"][0])

    # convert model to dask predict
    model = ParallelPostFit(model)

    # with joblib.parallel_backend("dask"):
    x, y, crs = input_xr.x, input_xr.y, input_xr.geobox.crs

    # One DataArray per data variable (i.e. per band/feature).
    input_data = []
    for var_name in input_xr.data_vars:
        input_data.append(input_xr[var_name])

    input_data_flattened = []
    # TODO: transfer to dask dataframe
    for arr in input_data:
        data = arr.data.flatten().rechunk(chunk_size)
        input_data_flattened.append(data)

    # reshape for prediction: (n_pixels, n_features)
    input_data_flattened = da.array(input_data_flattened).transpose()

    # Replace NaN/Inf with 0 so the estimator doesn't choke on them.
    if clean:
        input_data_flattened = da.where(da.isfinite(input_data_flattened),
                                        input_data_flattened, 0)

    if proba and persist:
        # persisting data so we don't require loading all the data twice
        input_data_flattened = input_data_flattened.persist()

    # apply the classification
    print(" predicting...")
    out_class = model.predict(input_data_flattened)

    # Mask out NaN or Inf values in results
    if clean:
        out_class = da.where(da.isfinite(out_class), out_class, 0)

    # Reshape when writing out: back to the (y, x) raster grid.
    out_class = out_class.reshape(len(y), len(x))

    # stack back into xarray
    output_xr = xr.DataArray(out_class,
                             coords={
                                 "x": x,
                                 "y": y
                             },
                             dims=["y", "x"])
    output_xr = output_xr.to_dataset(name="Predictions")

    if proba:
        print(" probabilities...")
        out_proba = model.predict_proba(input_data_flattened)

        # convert to %: keep only the winning class's probability.
        out_proba = da.max(out_proba, axis=1) * 100.0

        if clean:
            out_proba = da.where(da.isfinite(out_proba), out_proba, 0)

        out_proba = out_proba.reshape(len(y), len(x))

        out_proba = xr.DataArray(out_proba,
                                 coords={
                                     "x": x,
                                     "y": y
                                 },
                                 dims=["y", "x"])
        output_xr["Probabilities"] = out_proba

    if return_input:
        print(" input features...")
        # unflatten the input_data_flattened array and append
        # to the output_xr containing the predictions
        arr = input_xr.to_array()
        stacked = arr.stack(z=["y", "x"])

        # handle multivariable output
        output_px_shape = ()
        if len(input_data_flattened.shape[1:]):
            output_px_shape = input_data_flattened.shape[1:]

        output_features = input_data_flattened.reshape(
            (len(stacked.z), *output_px_shape))

        # set the stacked coordinate to match the input
        output_features = xr.DataArray(
            output_features,
            coords={
                "z": stacked["z"]
            },
            dims=[
                "z",
                *[
                    "output_dim_" + str(idx)
                    for idx in range(len(output_px_shape))
                ],
            ],
        ).unstack()

        # convert to dataset and rename arrays
        output_features = output_features.to_dataset(dim="output_dim_0")
        data_vars = list(input_xr.data_vars)
        output_features = output_features.rename(
            {i: j
             for i, j in zip(output_features.data_vars, data_vars)}  # noqa pylint: disable=unnecessary-comprehension
        )

        # merge with predictions
        output_xr = xr.merge([output_xr, output_features],
                             compat="override")

    return assign_crs(output_xr, str(crs))
def train_model(x_train, y_train):
    """Fit a GaussianNB wrapped for parallel post-fit prediction/scoring."""
    classifier = ParallelPostFit(estimator=GaussianNB(), scoring='accuracy')
    classifier.fit(x_train, y_train)
    return classifier
def convert(
    self, sql: "org.apache.calcite.sql.SqlNode", context: "dask_sql.Context"
) -> DataContainer:
    """Handle a CREATE MODEL statement.

    Instantiates the configured model class, optionally wraps it with
    dask-ml's Incremental (wrap_fit) and/or ParallelPostFit
    (wrap_predict), fits it on the result of the statement's SELECT,
    and registers it under the model name.

    Raises RuntimeError if the model already exists (unless IF NOT
    EXISTS / OR REPLACE was given) and ValueError on a missing or
    unimportable 'model_class' parameter.
    """
    select = sql.getSelect()
    model_name = str(sql.getModelName())
    kwargs = convert_sql_kwargs(sql.getKwargs())

    # Honor IF NOT EXISTS (silent no-op) and OR REPLACE (overwrite).
    if model_name in context.models:
        if sql.getIfNotExists():
            return
        elif not sql.getReplace():
            raise RuntimeError(
                f"A model with the name {model_name} is already present."
            )

    logger.debug(
        f"Creating model {model_name} from query {select} with options {kwargs}"
    )

    try:
        model_class = kwargs.pop("model_class")
    except KeyError:
        raise ValueError("Parameters must include a 'model_class' parameter.")

    # Optional knobs; whatever remains in kwargs becomes model arguments.
    target_column = kwargs.pop("target_column", "")
    wrap_predict = kwargs.pop("wrap_predict", False)
    wrap_fit = kwargs.pop("wrap_fit", False)
    fit_kwargs = kwargs.pop("fit_kwargs", {})

    try:
        ModelClass = import_class(model_class)
    except ImportError:
        raise ValueError(
            f"Can not import model {model_class}. Make sure you spelled it correctly and have installed all packages."
        )

    model = ModelClass(**kwargs)
    if wrap_fit:
        # Incremental fits chunk-by-chunk via partial_fit.
        from dask_ml.wrappers import Incremental

        model = Incremental(estimator=model)

    if wrap_predict:
        # ParallelPostFit parallelizes predict/transform after fitting.
        from dask_ml.wrappers import ParallelPostFit

        model = ParallelPostFit(estimator=model)

    # Materialize training data from the statement's SELECT; with no
    # target_column the fit is unsupervised (y=None).
    select_query = context._to_sql_string(select)
    training_df = context.sql(select_query)

    if target_column:
        non_target_columns = [
            col for col in training_df.columns if col != target_column
        ]
        X = training_df[non_target_columns]
        y = training_df[target_column]
    else:
        X = training_df
        y = None

    model.fit(X, y, **fit_kwargs)
    context.register_model(model_name, model, X.columns)
def test_sklearn():
    """End-to-end smoke test: text-pipeline grid search (serial and on
    the dask joblib backend), grid search over a ParallelPostFit-wrapped
    SVC, and lazy prediction/scoring over large concatenated dask arrays.

    Fix: X_train/y_train were referenced but never defined (the defining
    make_classification call was commented out), raising NameError.
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier, LogisticRegressionCV
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.externals import joblib
    from sklearn.datasets import make_classification, load_digits, fetch_20newsgroups
    from dask_ml.wrappers import ParallelPostFit

    categories = [
        'alt.atheism',
        'talk.religion.misc',
    ]
    print("Loading 20 newsgroups dataset for categories:")
    print(categories)
    data = fetch_20newsgroups(subset='train', categories=categories)
    print("%d documents" % len(data.filenames))
    print("%d categories" % len(data.target_names))
    print()

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(max_iter=1000)),
    ])
    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    }

    # Once serially, once on the dask joblib backend.
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1,
                               cv=3, refit=False, iid=False)
    grid_search.fit(data.data, data.target)
    with joblib.parallel_backend('dask'):
        grid_search.fit(data.data, data.target)

    X, y = load_digits(return_X_y=True)
    svc = ParallelPostFit(SVC(random_state=0, gamma='scale'))
    param_grid = {
        # use estimator__param instead of param
        'estimator__C': [0.01, 1.0, 10],
    }
    grid_search = GridSearchCV(svc, param_grid, iid=False, cv=3)
    grid_search.fit(X, y)

    big_X = da.concatenate(
        [da.from_array(X, chunks=X.shape) for _ in range(10)])
    predicted = grid_search.predict(big_X)

    # BUG FIX: restore the (previously commented-out) definition of the
    # training data used below; without it this raised NameError.
    X_train, y_train = make_classification(n_features=2, n_redundant=0,
                                           n_informative=2, random_state=1,
                                           n_clusters_per_class=1,
                                           n_samples=1000)
    N = 100
    X_large = da.concatenate(
        [da.from_array(X_train, chunks=X_train.shape) for _ in range(N)])
    y_large = da.concatenate(
        [da.from_array(y_train, chunks=y_train.shape) for _ in range(N)])
    clf = ParallelPostFit(LogisticRegressionCV(cv=3))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_large)
    clf.score(X_large, y_large)
    # est.partial_fit(X_train_1, y_train_1)
    # from tpot import TPOTClassifier
    pass
# Scale up: connect to your own cluster with bmore resources # see http://dask.pydata.org/en/latest/setup.html client = Client(processes=False, threads_per_worker=4, n_workers=1, memory_limit='2GB') print(client) dtype = { 'total': np.float64, 'temperature': np.int32, 'humidity': np.float64, 'solar': np.float64, 'car_connected': np.int32, 'car_energy': np.int32, 'battery_energy': np.int32, 'current_temperature': np.int32, 'b': np.int32, 'c': np.int32, 'air': np.int32, 'cost': np.int32 } x = pd.read_csv('train_data.csv', dtype=dtype) y = x.pop('cost').values mlp = ParallelPostFit(neural_network.MLPRegressor(hidden_layer_sizes=(16,), solver='adam'), scoring="r2") print('Training') mlp.fit(x, y) print('Finished')
While only predict is demonstrated here, wrappers.ParallelPostFit is equally useful for predict_proba and transform. """ from timeit import default_timer as tic import pandas as pd import seaborn as sns import sklearn.datasets from sklearn.svm import SVC import dask_ml.datasets from dask_ml.wrappers import ParallelPostFit X, y = sklearn.datasets.make_classification(n_samples=1000) clf = ParallelPostFit(SVC(gamma='scale')) clf.fit(X, y) Ns = [100_000, 200_000, 400_000, 800_000] timings = [] for n in Ns: X, y = dask_ml.datasets.make_classification(n_samples=n, random_state=n, chunks=n // 20) t1 = tic() # Serial scikit-learn version clf.estimator.predict(X) timings.append(('Scikit-Learn', n, tic() - t1)) t1 = tic()
def perform_prediction(ds_input, estimator):
    """
    Uses dask (if available) to run sklearn predict in parallel.
    Useful for quickly performing analysis.

    Parameters
    ----------
    ds_input : xarray dataset or array.
        Dataset containing independent variables (i.e. low res image).
        Must have dimensions 'x' and 'y'.
    estimator : sklearn estimator object
        A pre-defined RandomForestRegressor scikit-learn estimator model.

    Returns
    ----------
    ds_out : xarray dataset
        An xarray dataset containing the probabilities of the
        random forest model.
    """
    # check ds in dataset or dataarray
    if not isinstance(ds_input, (xr.Dataset, xr.DataArray)):
        raise TypeError(
            '> Input dataset is not xarray dataset or data array type.')

    # check if x and y dims exist
    # NOTE(review): this only raises when BOTH dims are missing; a
    # dataset with just one of x/y slips through — confirm intent.
    if 'x' not in list(ds_input.dims) and 'y' not in list(ds_input.dims):
        raise ValueError('> No x and/or y coordinate dimension in dataset.')

    # if input_xr isn't dask, coerce it
    is_dask = True
    if not bool(ds_input.chunks):
        is_dask = False
        ds_input = ds_input.chunk({'x': len(ds_input.x),
                                   'y': len(ds_input.y)})

    # get chunk size: one full input chunk's worth of pixels
    chunk_size = int(ds_input.chunks['x'][0]) * int(ds_input.chunks['y'][0])

    # set up function for random forest prediction
    def predict(ds_input, estimator):
        # get x, y dims
        x, y, = ds_input['x'], ds_input['y']

        # get crs if exists
        # NOTE(review): .attrs always exists on xarray objects, so this
        # bare except is effectively dead; it also swallows unrelated
        # errors — consider removing or narrowing it.
        try:
            attributes = ds_input.attrs
        except:
            print('> No attributes available. Skipping.')
            attributes = None

        # seperate each var (image bands) and store in list
        input_data_list = []
        for var_name in ds_input.data_vars:
            input_data_list.append(ds_input[var_name])

        # flatten and chunk each dim array and add to flatten list
        input_data_flat = []
        for da in input_data_list:
            data = da.data.flatten().rechunk(chunk_size)
            input_data_flat.append(data)

        # reshape for prediction via dask array type (dda):
        # (n_pixels, n_features)
        input_data_flat = dask_array.array(input_data_flat).transpose()

        # perform the prediction
        preds = estimator.predict(input_data_flat)

        # reshape for output: back to the (y, x) raster grid
        preds = preds.reshape(len(y), len(x))

        # recreate dataset
        ds_out = xr.DataArray(preds, coords={'x': x, 'y': y},
                              dims=['y', 'x'])
        ds_out = ds_out.to_dataset(name='result')

        # add attributes back on
        if attributes:
            ds_out.attrs.update(attributes)

        return ds_out

    # predict via parallel, or if missing, regular compute
    if is_dask == True:
        estimator = ParallelPostFit(estimator)
        with joblib.parallel_backend('dask'):
            ds_out = predict(ds_input, estimator)
    else:
        ds_out = predict(ds_input, estimator).compute()

    # return
    return ds_out
"module__activation": [ "relu", "elu", ], "batch_size": [32, 64], "optimizer__lr": loguniform(1e-4, 1e-3), "optimizer__weight_decay": loguniform(1e-6, 1e-3), "optimizer__momentum": uniform(0, 1), "optimizer__nesterov": [True], } from dask_ml.model_selection import HyperbandSearchCV search = HyperbandSearchCV(model, params, random_state=2, verbose=True, max_iter=9) y_train2 = y_train.reshape(-1, 1).persist() search.fit(X_train, y_train2) print(search.best_score_) print(search.best_params_) print(search.best_estimator_) from dask_ml.wrappers import ParallelPostFit deployed_model = ParallelPostFit(search.best_estimator_) deployed_model.score(X_test, y_test)