Example #1
    def test_example(self):
        # The classic example from the sklearn documentation
        iris = datasets.load_iris()
        parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
        svr = svm.SVC()
        clf = grid_search.GridSearchCV(svr, parameters)
        clf.fit(iris.data, iris.target)

        # spark-sklearn's GridSearchCV takes a SparkContext as its first argument
        clf2 = GridSearchCV(self.sc, svr, parameters)
        clf2.fit(iris.data, iris.target)

        b1 = clf.estimator
        b2 = clf2.estimator
        self.assertEqual(b1.get_params(), b2.get_params())
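This test method comes from the spark-sklearn test suite and leans on fixtures such as self.sc (a SparkContext). A minimal standalone sketch of the same round trip, assuming the spark-sklearn package and its createLocalSparkSession helper, could look like this:

from sklearn import datasets, svm
from sklearn.model_selection import GridSearchCV as SKGridSearchCV
from spark_sklearn import GridSearchCV
from spark_sklearn.util import createLocalSparkSession

sc = createLocalSparkSession().sparkContext
iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC(gamma='auto')

# plain scikit-learn search on one machine
clf = SKGridSearchCV(svr, parameters)
clf.fit(iris.data, iris.target)

# spark-sklearn search: identical API, plus a SparkContext up front
clf2 = GridSearchCV(sc, svr, parameters)
clf2.fit(iris.data, iris.target)

# both wrappers should hold identically configured base estimators
assert clf.estimator.get_params() == clf2.estimator.get_params()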
Example #2
    def test_example(self):
        # The classic example from the sklearn documentation
        iris = datasets.load_iris()
        parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
        svr = svm.SVC(gamma='auto')
        clf = grid_search.GridSearchCV(svr, parameters)
        clf.fit(iris.data, iris.target)

        clf2 = GridSearchCV(self.sc, svr, parameters)
        clf2.fit(iris.data, iris.target)

        b1 = clf.estimator
        b2 = clf2.estimator
        self.assertEqual(b1.get_params(), b2.get_params())
Example #3
    def test_cv_lasso_with_mllib_featurization(self):
        data = [('hi there', 0.0),
                ('what is up', 1.0),
                ('huh', 1.0),
                ('now is the time', 5.0),
                ('for what', 0.0),
                ('the spark was there', 5.0),
                ('and so', 3.0),
                ('were many socks', 0.0),
                ('really', 1.0),
                ('too cool', 2.0)]
        data = self.sql.createDataFrame(data, ["review", "rating"])

        # Feature extraction using MLlib
        tokenizer = Tokenizer(inputCol="review", outputCol="words")
        hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20000)
        pipeline = Pipeline(stages=[tokenizer, hashingTF])
        data = pipeline.fit(data).transform(data)

        df = self.converter.toPandas(data.select(data.features.alias("review"), "rating"))

        pipeline = SKL_Pipeline([
            ('lasso', SKL_Lasso())
        ])
        parameters = {
            'lasso__alpha': (0.001, 0.005, 0.01)
        }

        grid_search = GridSearchCV(self.sc, pipeline, parameters)
        skl_gs = grid_search.fit(df.review.values, df.rating.values)
        assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])
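Here self.converter is spark-sklearn's Converter, which copies a Spark DataFrame (including MLlib Vector columns) into a pandas DataFrame. A sketch of the setup the test assumes, under the assumption that Converter is constructed from a SparkContext as in the spark-sklearn test fixtures:

from spark_sklearn import Converter

converter = Converter(sc)            # sc: an existing SparkContext
pdf = converter.toPandas(spark_df)   # spark_df: any Spark DataFrame (hypothetical name)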
Example #4
 def test_cv_pipeline(self):
     pipeline = SKL_Pipeline([
         ('vect', SKL_HashingVectorizer(n_features=20)),
         ('tfidf', SKL_TfidfTransformer(use_idf=False)),
         ('lasso', SKL_Lasso(max_iter=1))
     ])
     parameters = {
         'lasso__alpha': (0.001, 0.005, 0.01)
     }
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     data = [('hi there', 0.0),
             ('what is up', 1.0),
             ('huh', 1.0),
             ('now is the time', 5.0),
             ('for what', 0.0),
             ('the spark was there', 5.0),
             ('and so', 3.0),
             ('were many socks', 0.0),
             ('really', 1.0),
             ('too cool', 2.0)]
     df = self.sql.createDataFrame(data, ["review", "rating"]).toPandas()
     skl_gs = grid_search.fit(df.review.values, df.rating.values)
     assert len(skl_gs.grid_scores_) == len(parameters['lasso__alpha'])
     # TODO
     for gs in skl_gs.grid_scores_:
         pass # assert(gs.)
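Note that grid_scores_ was deprecated in scikit-learn 0.18 and removed in 0.20. Against newer releases, the equivalent assertion reads the cv_results_ dict instead, exactly as Examples #3 and #5 do:

# equivalent check for scikit-learn >= 0.20, where grid_scores_ no longer exists
assert len(skl_gs.cv_results_['params']) == len(parameters['lasso__alpha'])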
Example #5
 def test_cv_linreg(self):
     pipeline = SKL_Pipeline([('lasso', SKL_Lasso(max_iter=1))])
     parameters = {'lasso__alpha': (0.001, 0.005, 0.01)}
     grid_search = GridSearchCV(self.sc, pipeline, parameters)
     X = scipy.sparse.vstack(
         map(lambda x: self.list2csr([x, x + 1.0]), range(0, 100)))
     y = np.array(list(range(0, 100))).reshape((100, 1))
     skl_gs = grid_search.fit(X, y)
     assert len(skl_gs.cv_results_['params']) == len(
         parameters['lasso__alpha'])
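list2csr is a small helper defined elsewhere in the test class; its body is not shown here. A plausible stand-in (an assumption, not the original implementation) is:

import numpy as np
import scipy.sparse

def list2csr(values):
    # hypothetical helper: wrap a Python list as a single-row CSR sparse matrix
    return scipy.sparse.csr_matrix(np.array([values]))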
Example #6
# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


from pyspark.sql import SparkSession
# spark context
spark = SparkSession.builder.appName("Regression_worker_2").getOrCreate()
sc = spark.sparkContext

# initialize the model
MLP_model = GridSearchCV(
    sc,
    MLPRegressor(alpha=0.005, random_state=42),
    {'hidden_layer_sizes': [[512, 4], [256, 4]], 'max_iter': [5000]})

#linear_model.fit(X_train, y_train)
MLP_model.fit(X_train, y_train)
#RandomForest_model.fit(X_train, y_train)
#GradientBoosting_model.fit(X_train, y_train)
    
# print scores
models = [MLP_model]

with open('./model_scores_worker_2.txt', 'w') as f:
    for m in models:
        f.write('Training Set Mean Squared Error: {:.2f}\n'.format(mean_squared_error(y_train, m.predict(X_train))))
        f.write('Training Set R^2: {:.2f}\n'.format(r2_score(y_train, m.predict(X_train))))

        f.write('Testing Set Mean Squared Error: {:.2f}\n'.format(mean_squared_error(y_test, m.predict(X_test))))
Example #7
X_train, X_test, y_train, y_test = train_test_split(X, y)

# standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from pyspark.sql import SparkSession
# spark context
spark = SparkSession.builder.appName("Regression_compare_models").getOrCreate()
sc = spark.sparkContext

# initialize the models
linear_model = GridSearchCV(sc, LinearRegression(), {})
MLP_model = GridSearchCV(
    sc,
    MLPRegressor(hidden_layer_sizes=[512, 4],
                 max_iter=5000,
                 alpha=0.005,
                 random_state=42), {})
RandomForest_model = GridSearchCV(
    sc, RandomForestRegressor(n_estimators=100, random_state=0), {})
GradientBoosting_model = GridSearchCV(
    sc,
    # note: the 'mse' criterion was renamed 'squared_error' in scikit-learn 1.0
    GradientBoostingRegressor(n_estimators=100, max_depth=10, criterion='mse'),
    {})

linear_model.fit(X_train, y_train)
MLP_model.fit(X_train, y_train)
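Every model above is wrapped in GridSearchCV with an empty parameter grid, which cross-validates exactly one candidate (the estimator's own settings) and then refits it on the full training set. Reading the result back follows the usual scikit-learn search API; a brief sketch:

# the refit underlying regressor and its held-out predictions
print(linear_model.best_estimator_)
print(linear_model.predict(X_test[:5]))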
Example #8
digits = datasets.load_digits()
X, y = digits.data, digits.target

sc = createLocalSparkSession().sparkContext  # helper from spark_sklearn.util
param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_split": [0.1, 0.2, 0.3],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80]
}

gs = GridSearchCV(sc, RandomForestClassifier(), param_grid=param_grid)
gs.fit(X, y)

# find the parameter combination with the best mean training score
params = gs.cv_results_['params']
mean_train_score = gs.cv_results_['mean_train_score']
best_params_ = None
best_score_ = float('-inf')
for score, candidate in zip(mean_train_score, params):
    if score > best_score_:
        best_score_ = score
        best_params_ = candidate
print(best_params_)
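For comparison, GridSearchCV already exposes best_params_ and best_score_ after fitting; those are selected by mean cross-validated test score rather than training score, so the manual scan above is only needed if the best training score is specifically what you want:

# built-in winners, chosen on mean test score instead of train score
print(gs.best_params_)
print(gs.best_score_)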