def training(grid_param, X_train, X_test, y_train, y_test):
    """Grid-search a ``RandomForestRegressor`` through Dask and time the search.

    The four data arguments are converted with ``dd.from_pandas`` (4
    partitions each), so they are expected to be pandas objects.

    Args:
        grid_param: Parameter grid handed to ``GridSearchCV_dask``.
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training targets.
        y_test: Held-out targets.

    Returns:
        A tuple ``(acc_r2, acc_mse, train_time)`` where ``acc_r2`` is the
        value of ``best_estimator_.score`` on the test data, ``acc_mse`` is
        the mean squared error of the best estimator's test predictions and
        ``train_time`` is the elapsed time in seconds of the grid search
        (fit plus one scoring pass).
    """
    # Create cluster/client
    cluster = make_cluster()
    # NOTE(review): the cluster is intentionally left running because
    # make_cluster() is not visible here and may return a shared cluster;
    # confirm ownership and close it here if this function owns it.

    # The client is only needed as the registered default for the joblib
    # "dask" backend; the context manager guarantees it is closed even when
    # fitting raises, instead of leaking one client per call.
    with Client(cluster):
        # Construct Dask DataFrames
        X_train = dd.from_pandas(X_train, npartitions=4)
        y_train = dd.from_pandas(y_train, npartitions=4)
        X_test = dd.from_pandas(X_test, npartitions=4)
        y_test = dd.from_pandas(y_test, npartitions=4)

        estimator = RandomForestRegressor()

        # Train model. perf_counter() is monotonic, so the measured interval
        # cannot be distorted by system clock adjustments.
        start = time.perf_counter()
        grid_search = GridSearchCV_dask(estimator, grid_param, cv=2, n_jobs=-1)
        with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
            grid_search.fit(X_train, y_train)
        # The result is discarded; the call is kept so train_time covers the
        # same work (fit + one scoring pass) as before.
        grid_search.score(X_test, y_test)
        train_time = time.perf_counter() - start

        # Predictions
        best_model = grid_search.best_estimator_
        acc_r2 = best_model.score(X_test, y_test)
        # mean_squared_error takes (y_true, y_pred) in that order.
        acc_mse = mean_squared_error(y_test, best_model.predict(X_test))

    return acc_r2, acc_mse, train_time
def RandomForestDask(param_grid, X_train, X_test, y_train, y_test):
    """Grid-search a ``RandomForestRegressor`` through Dask and score it.

    The four data arguments are converted with ``dd.from_pandas`` (3
    partitions each), so they are expected to be pandas objects.

    Args:
        param_grid: Parameter grid handed to ``GridSearchCV_dask``.
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training targets.
        y_test: Held-out targets.

    Returns:
        A tuple ``(r_2, mse)``: the value of ``best_estimator_.score`` on the
        Dask test data, and the mean squared error of the best estimator's
        predictions on the original (non-Dask) test data.
    """
    cluster = make_cluster()
    # NOTE(review): the cluster is intentionally left running because
    # make_cluster() is not visible here and may return a shared cluster;
    # confirm ownership and close it here if this function owns it.

    # The client is only needed as the registered default for the joblib
    # "dask" backend; the context manager guarantees it is closed even when
    # fitting raises, instead of leaking one client per call.
    with Client(cluster):
        # preprocess data
        dask_X_train = dd.from_pandas(X_train, npartitions=3)
        dask_y_train = dd.from_pandas(y_train, npartitions=3)
        dask_X_test = dd.from_pandas(X_test, npartitions=3)
        dask_y_test = dd.from_pandas(y_test, npartitions=3)

        estimator = RandomForestRegressor()
        grid_search_dask = GridSearchCV_dask(
            estimator, param_grid, cv=2, n_jobs=-1
        )
        with joblib.parallel_backend(
            "dask", scatter=[dask_X_train, dask_y_train]
        ):
            grid_search_dask.fit(dask_X_train, dask_y_train)
        # The result is discarded; kept so the function performs the same
        # work as before.
        grid_search_dask.score(dask_X_test, dask_y_test)

        best_model = grid_search_dask.best_estimator_
        r_2 = best_model.score(dask_X_test, dask_y_test)
        # mean_squared_error takes (y_true, y_pred) in that order. The
        # original pandas test data is used here, as before.
        mse = mean_squared_error(y_test, best_model.predict(X_test))

    return r_2, mse
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression


# Script entry point: grid-search the regularisation strength C of a
# scale-then-logistic-regression pipeline on the heights/weights data and
# print the score obtained on the held-out split.
if __name__ == "__main__":
    # Keep a reference to the client for the duration of the script.
    dask_client = Client()

    csv_path = Path('./data') / "01_heights_weights_genders.csv"
    frame = pd.read_csv(csv_path)

    # Binary target: 1 where Gender is "Male", 0 otherwise.
    is_male = (frame.Gender == "Male").values
    target = 1 * is_male
    features = frame[['Height', 'Weight']].values

    X_train, X_test, y_train, y_test = train_test_split(features, target)

    # The key prefix is the step name make_pipeline derives from the
    # LogisticRegression class.
    search_space = {'logisticregression__C': [.1, 1, 10, 100]}
    model = make_pipeline(StandardScaler(), LogisticRegression())
    search = GridSearchCV(model, param_grid=search_space, cv=5)
    search.fit(X_train, y_train)

    held_out_score = search.score(X_test, y_test)
    print("Score", held_out_score)