def training(grid_param, X_train, X_test, y_train, y_test):
        """Grid-search a random forest regressor on a Dask cluster and score it.

        The pandas inputs are converted to Dask collections, a
        ``GridSearchCV_dask`` search over ``grid_param`` (2-fold CV) is fitted
        under the joblib ``"dask"`` backend, and the refitted best estimator is
        evaluated on the test split.

        Args:
            grid_param: Parameter grid handed to ``GridSearchCV_dask``.
            X_train: Training features; converted with ``dd.from_pandas``.
            X_test: Test features; converted with ``dd.from_pandas``.
            y_train: Training targets; converted with ``dd.from_pandas``.
            y_test: Test targets; converted with ``dd.from_pandas``.

        Returns:
            A tuple ``(acc_r2, acc_mse, train_time)``: the best estimator's
            ``score`` on the test split, the mean squared error of its
            predictions on the test split, and the elapsed seconds spent
            building and fitting the grid search.
        """
        cluster = make_cluster()
        # NOTE(review): the cluster is left running because it is unclear from
        # here whether make_cluster() hands over ownership -- confirm, and
        # close it in this function if it does.
        # The context manager closes the client even when fitting raises, so
        # repeated calls do not pile up open scheduler connections.
        with Client(cluster):
            # Construct Dask collections from the pandas inputs.
            X_train = dd.from_pandas(X_train, npartitions=4)
            y_train = dd.from_pandas(y_train, npartitions=4)
            X_test = dd.from_pandas(X_test, npartitions=4)
            y_test = dd.from_pandas(y_test, npartitions=4)

            # Time only the search itself; perf_counter is monotonic and so
            # is not affected by wall-clock adjustments.
            started = time.perf_counter()
            grid_search = GridSearchCV_dask(
                RandomForestRegressor(), grid_param, cv=2, n_jobs=-1)
            with joblib.parallel_backend("dask", scatter=[X_train, y_train]):
                grid_search.fit(X_train, y_train)
            train_time = time.perf_counter() - started

            # Evaluate after the timer stops so scoring does not inflate the
            # reported training time.
            best_model = grid_search.best_estimator_
            acc_r2 = best_model.score(X_test, y_test)
            # Arguments are (predictions, targets); squared error is
            # symmetric, so the order does not change the value.
            acc_mse = mean_squared_error(best_model.predict(X_test), y_test)
            return acc_r2, acc_mse, train_time
    def RandomForestDask(param_grid, X_train, X_test, y_train, y_test):
        """Grid-search a random forest regressor on a Dask cluster.

        The pandas inputs are converted to 3-partition Dask collections, a
        ``GridSearchCV_dask`` search over ``param_grid`` (2-fold CV) is fitted
        under the joblib ``"dask"`` backend, and the refitted best estimator
        is evaluated on the test split.

        Args:
            param_grid: Parameter grid handed to ``GridSearchCV_dask``.
            X_train: Training features; converted with ``dd.from_pandas``.
            X_test: Test features; used both as given and as a Dask
                collection.
            y_train: Training targets; converted with ``dd.from_pandas``.
            y_test: Test targets; used both as given and as a Dask
                collection.

        Returns:
            A tuple ``(r_2, mse)``: the best estimator's ``score`` on the
            Dask test split, and the mean squared error of its predictions
            on the original (non-Dask) test split.
        """
        cluster = make_cluster()
        # NOTE(review): the cluster is left running because it is unclear from
        # here whether make_cluster() hands over ownership -- confirm, and
        # close it in this function if it does.
        # The context manager closes the client even when fitting raises, so
        # repeated calls do not pile up open scheduler connections.
        with Client(cluster):
            # Preprocess data: wrap the pandas inputs as Dask collections.
            dask_X_train = dd.from_pandas(X_train, npartitions=3)
            dask_y_train = dd.from_pandas(y_train, npartitions=3)
            dask_X_test = dd.from_pandas(X_test, npartitions=3)
            dask_y_test = dd.from_pandas(y_test, npartitions=3)

            grid_search_dask = GridSearchCV_dask(
                RandomForestRegressor(), param_grid, cv=2, n_jobs=-1)
            with joblib.parallel_backend(
                    "dask", scatter=[dask_X_train, dask_y_train]):
                grid_search_dask.fit(dask_X_train, dask_y_train)

            best_model = grid_search_dask.best_estimator_
            r_2 = best_model.score(dask_X_test, dask_y_test)
            # Arguments are (predictions, targets); squared error is
            # symmetric, so the order does not change the value.
            mse = mean_squared_error(best_model.predict(X_test), y_test)
            return r_2, mse
# Example #3
# 0
# modified from https://github.com/amueller/scipy-2018-sklearn/blob/master/notebooks/15.Pipelining_Estimators.ipynb

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
from dask_ml.model_selection import GridSearchCV
from dask.distributed import Client
from sklearn.pipeline import make_pipeline
from dask_ml.preprocessing import StandardScaler
from dask_ml.linear_model import LogisticRegression

if __name__ == "__main__":
    # Fit a scaler + logistic-regression pipeline with a Dask-ML grid search
    # and print its score on a held-out split.
    #
    # The context manager closes the client when the script finishes or
    # fails, instead of leaving it open until interpreter shutdown. Per the
    # dask.distributed docs, Client() without an address starts a local
    # cluster by default.
    with Client():
        data = Path('./data')
        df = pd.read_csv(data / "01_heights_weights_genders.csv")

        # Binary target: 1 where Gender is "Male", 0 otherwise.
        y = 1 * (df.Gender == "Male").values
        X = df[['Height', 'Weight']].values
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        pipeline = make_pipeline(StandardScaler(), LogisticRegression())
        # make_pipeline names each step after its lowercased class name,
        # hence the "logisticregression__" prefix on the parameter.
        grid = GridSearchCV(
            pipeline,
            param_grid={'logisticregression__C': [.1, 1, 10, 100]},
            cv=5,
        )
        grid.fit(X_train, y_train)
        print("Score", grid.score(X_test, y_test))