def test_example(self):
    # The classic example from the sklearn documentation
    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC(gamma='auto')
    clf = grid_search.GridSearchCV(svr, parameters)
    clf.fit(iris.data, iris.target)
    clf2 = GridSearchCV(self.sc, svr, parameters)
    clf2.fit(iris.data, iris.target)
    b1 = clf.estimator
    b2 = clf2.estimator
    self.assertEqual(b1.get_params(), b2.get_params())
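For reference, the same comparison can be run outside the test class. A minimal standalone sketch, assuming the spark_sklearn package provides GridSearchCV and that a local SparkContext is acceptable; the getOrCreate call and the grid_search alias are illustrative choices, not part of the original test:

from sklearn import datasets, svm
from sklearn import model_selection as grid_search  # keeps the alias used in the test
from pyspark import SparkContext
from spark_sklearn import GridSearchCV  # assumed import path

sc = SparkContext.getOrCreate()
iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC(gamma='auto')

clf = grid_search.GridSearchCV(svr, parameters)  # single-machine sklearn search
clf.fit(iris.data, iris.target)
clf2 = GridSearchCV(sc, svr, parameters)  # same search, with fits distributed over Spark
clf2.fit(iris.data, iris.target)

# both objects should hold identically parameterized base estimators
assert clf.estimator.get_params() == clf2.estimator.get_params()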
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from pyspark.sql import SparkSession

# Spark context
spark = SparkSession.builder.appName("Regression_worker_2").getOrCreate()
sc = spark.sparkContext

# initialize the model
MLP_model = GridSearchCV(
    sc,
    MLPRegressor(alpha=0.005, random_state=42),
    {'hidden_layer_sizes': [[512, 4], [256, 4]], 'max_iter': [5000]})

# linear_model.fit(X_train, y_train)
MLP_model.fit(X_train, y_train)
# RandomForest_model.fit(X_train, y_train)
# GradientBoosting_model.fit(X_train, y_train)

# print scores
models = [MLP_model]
with open('./model_scores_worker_2.txt', 'w') as f:
    for m in models:
        f.write('Training Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_train, m.predict(X_train))))
        f.write('Training Set R^2: {:.2f}\n'.format(
            r2_score(y_train, m.predict(X_train))))
        f.write('Testing Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_test, m.predict(X_test))))
        f.write('Testing Set R^2: {:.2f}\n\n'.format(
            r2_score(y_test, m.predict(X_test))))
    f.write('\nRunning Time: {:.2f}'.format(time.time() - start))
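The four nearly identical f.write calls above can be factored into one helper, which also keeps the label casing consistent in a single place. A sketch, where write_scores is a hypothetical name not present in the original script:

from sklearn.metrics import mean_squared_error, r2_score

def write_scores(f, model, X_tr, y_tr, X_te, y_te):
    # one MSE/R^2 block per data split
    for name, X, y in (('Training', X_tr, y_tr), ('Testing', X_te, y_te)):
        pred = model.predict(X)
        f.write('{} Set Mean Squared Error: {:.2f}\n'.format(name, mean_squared_error(y, pred)))
        f.write('{} Set R^2: {:.2f}\n'.format(name, r2_score(y, pred)))
    f.write('\n')

Inside the loop this would be called as write_scores(f, m, X_train, y_train, X_test, y_test).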
# initialize the models
linear_model = GridSearchCV(sc, LinearRegression(), {})
MLP_model = GridSearchCV(
    sc,
    MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000,
                 alpha=0.005, random_state=42),
    {})
RandomForest_model = GridSearchCV(
    sc, RandomForestRegressor(n_estimators=100, random_state=0), {})
GradientBoosting_model = GridSearchCV(
    sc,
    GradientBoostingRegressor(n_estimators=100, max_depth=10, criterion='mse'),
    {})

linear_model.fit(X_train, y_train)
MLP_model.fit(X_train, y_train)
RandomForest_model.fit(X_train, y_train)
GradientBoosting_model.fit(X_train, y_train)

# print scores
models = [linear_model, MLP_model, RandomForest_model, GradientBoosting_model]
with open('./model_scores_compare.txt', 'w') as f:
    for m in models:
        # f.write(str(m) + '\n')
        f.write('Training Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_train, m.predict(X_train))))
        f.write('Training Set R^2: {:.2f}\n'.format(
            r2_score(y_train, m.predict(X_train))))
        f.write('Testing Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_test, m.predict(X_test))))
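As written, model_scores_compare.txt gives no indication of which score block belongs to which model, since the str(m) line that would have labeled them is commented out. One option is to pair each model with a display name; a sketch under that assumption (the names and the === header format are illustrative):

named_models = [
    ('Linear', linear_model),
    ('MLP', MLP_model),
    ('RandomForest', RandomForest_model),
    ('GradientBoosting', GradientBoosting_model),
]
with open('./model_scores_compare.txt', 'w') as f:
    for name, m in named_models:
        f.write('=== {} ===\n'.format(name))  # label each score block
        f.write('Testing Set Mean Squared Error: {:.2f}\n'.format(
            mean_squared_error(y_test, m.predict(X_test))))
        f.write('Testing Set R^2: {:.2f}\n\n'.format(
            r2_score(y_test, m.predict(X_test))))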
digits = datasets.load_digits()
X, y = digits.data, digits.target
sc = createLocalSparkSession().sparkContext

param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [0.1, 0.2, 0.3],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80],
}
gs = GridSearchCV(sc, RandomForestClassifier(), param_grid=param_grid)
gs.fit(X, y)

# extract the best parameters
best_params_ = None
best_score_ = float('-inf')
params = gs.cv_results_['params']
mean_train_score = gs.cv_results_['mean_train_score']
for candidate_params, score in zip(params, mean_train_score):
    if score > best_score_:
        best_score_ = score
        best_params_ = candidate_params
print(best_params_)
print(best_score_)
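spark_sklearn's GridSearchCV is intended as a drop-in replacement for sklearn's, so if the installed version exposes the standard fitted attributes (an assumption worth verifying for the version in use), the manual scan above can be replaced with the usual accessors. Note the semantics differ slightly: best_score_ is the mean cross-validated test score, whereas the loop above reads mean_train_score.

# standard sklearn-style accessors, assuming spark_sklearn exposes them
print(gs.best_params_)
print(gs.best_score_)  # mean validation score, not mean train score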