def main():
    """Run a distributed grid search over XGBClassifier settings on iris.

    Loads the iris dataset, fits a ``DistGridSearchCV`` wrapped around an
    ``XGBClassifier`` using ``spark.sparkContext`` (``spark`` is expected to
    be defined outside this function), calls ``predict`` and
    ``predict_proba`` on the fitted search object, and prints the best
    score together with the selected value of each tuned hyper-parameter.
    """
    n_folds = 5
    classification_metric = "accuracy"
    # Carried over from the original script; nothing below reads it.
    regression_metric = "neg_mean_squared_error"

    iris = load_iris()
    features, labels = iris["data"], iris["target"]

    # Candidate values for each hyper-parameter being tuned.
    search_space = {
        "learning_rate": [0.05, 0.01],
        "max_depth": [4, 6, 8],
        "colsample_bytree": [0.6, 0.8, 1.0],
        "n_estimators": [100, 200, 300],
    }

    search = DistGridSearchCV(
        XGBClassifier(),
        search_space,
        spark.sparkContext,
        cv=n_folds,
        scoring=classification_metric,
    )
    search.fit(features, labels)

    # predictions on the driver
    predictions = search.predict(features)
    probabilities = search.predict_proba(features)

    # results
    print("-- Grid Search --")
    print("Best Score: {0}".format(search.best_score_))
    winner = search.best_estimator_
    print("Best colsample_bytree: {0}".format(winner.colsample_bytree))
    print("Best learning_rate: {0}".format(winner.learning_rate))
    print("Best max_depth: {0}".format(winner.max_depth))
    print("Best n_estimators: {0}".format(winner.n_estimators))
# load sample data (binary target) data = load_breast_cancer() X = data["data"] y = data["target"] ### distributed grid search model = DistGridSearchCV(LogisticRegression(solver=solver), dict(C=Cs), sc, cv=cv, scoring=scoring) # distributed fitting with spark model.fit(X, y) # predictions on the driver preds = model.predict(X) probs = model.predict_proba(X) # results print("-- Grid Search --") print("Best Score: {0}".format(model.best_score_)) print("Best C: {0}".format(model.best_estimator_.C)) result_data = pd.DataFrame(model.cv_results_)[["param_C", "mean_test_score"]] print(result_data.sort_values("param_C")) print(pickle.loads(pickle.dumps(model))) ### distributed randomized search param_dist = dict(C=[]) model = DistRandomizedSearchCV( LogisticRegression(solver=solver), dict(C=Cs), sc,