# Grid search over an SVM text-classification pipeline, distributed via spark_sklearn.
import numpy as np
from pyspark import SparkContext
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from spark_sklearn import GridSearchCV


def grid_search_svm(X_train, y_train, X_test, ngrams, n_split, svm_choice='linear',
                    tfidf_choice=False, nums_train=None, nums_test=None):
    svm = None
    grid = None
    if svm_choice == 'linear':
        svm = LinearSVC()
        c_array = np.logspace(1., 4., num=4)
        if tfidf_choice:
            grid = {'vect__ngram_range': ngrams,
                    'tfidf__use_idf': (True, False),
                    'clf__C': c_array.tolist()}
        else:
            grid = {'vect__ngram_range': ngrams,
                    'clf__C': c_array.tolist()}
    elif svm_choice == 'svc':
        svm = SVC()
        c_array = np.logspace(-3., 6., num=10)
        g_array = np.logspace(-3., 3., num=7)
        if tfidf_choice:
            grid = {'vect__ngram_range': ngrams,
                    'tfidf__use_idf': (True, False),
                    'clf__kernel': ['rbf'],
                    'clf__C': c_array.tolist(),
                    'clf__gamma': g_array.tolist()}
        else:
            grid = {'vect__ngram_range': ngrams,
                    'clf__kernel': ['rbf'],
                    'clf__C': c_array.tolist(),
                    'clf__gamma': g_array.tolist()}

    # NumFeatureAdder is a project-specific transformer (defined elsewhere) that
    # appends the precomputed numeric features to the vectorized text features.
    if isinstance(nums_train, np.ndarray) and isinstance(nums_test, np.ndarray):
        if tfidf_choice:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('tfidf', TfidfTransformer(smooth_idf=False)),
                                     ('numfeat', NumFeatureAdder(nums_train, nums_test)),
                                     ('clf', svm)])
        else:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('numfeat', NumFeatureAdder(nums_train, nums_test)),
                                     ('clf', svm)])
    else:
        if tfidf_choice:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('tfidf', TfidfTransformer(smooth_idf=False)),
                                     ('clf', svm)])
        else:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('clf', svm)])

    print(clf_pipeline.get_params().keys())
    sc = SparkContext.getOrCreate()
    grid_search = GridSearchCV(sc, clf_pipeline, grid, n_jobs=-1, cv=n_split)
    grid_search.fit(X_train, y_train)
    grid_search_predicted = grid_search.predict(X_test)
    return grid_search_predicted
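
# A short usage sketch for grid_search_svm, assuming the function above is in
# scope; the toy corpus, labels, and grid values are made up for illustration.
# Numeric features are omitted, so NumFeatureAdder is never constructed.
docs_train = ["spark speeds up grid search", "svm with tfidf features",
              "plain counts also work", "linear kernels are fast",
              "rbf kernels need gamma", "text classification demo"]
labels_train = [1, 0, 1, 0, 1, 0]
docs_test = ["grid search with spark"]

predicted = grid_search_svm(docs_train, labels_train, docs_test,
                            ngrams=[(1, 1), (1, 2)],  # candidate n-gram ranges
                            n_split=3,                # number of CV folds
                            svm_choice='linear',
                            tfidf_choice=True)
print(predicted)
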
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


class Classifier:
    def __init__(self):
        self.model = None

    def preprocess(self, data):
        '''Optional preprocessing step.'''
        return CountVectorizer().fit_transform(data)

    def train(self, X, y, method="rf"):
        if method == "svm":
            obj = SVC()
            # Hyperparameters for the SVC search (illustrative values; the
            # original snippet only defined the forest grid).
            param_grid = {
                "C": [1, 10],
                "gamma": ["scale", "auto"],
            }
        else:
            obj = RandomForestClassifier()
            param_grid = {
                "max_depth": [6, None],
                "max_features": [5, 10, 20],
            }
        self.model = GridSearchCV(obj, param_grid=param_grid)
        self.model.fit(X, y)

    def predict(self, X):
        if self.model is None:
            return None
        return self.model.predict(X)
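
# A short usage sketch for Classifier; the toy corpus and labels are made up
# for illustration, and a recent scikit-learn is assumed (cv defaults to
# 5-fold; SVC's gamma accepts 'scale'/'auto').
clf = Classifier()
docs = ["good product", "bad service", "great support", "terrible quality",
        "love it", "hate it", "works well", "broke fast", "very happy", "very angry"]
labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]
features = clf.preprocess(docs)   # sparse bag-of-words matrix
clf.train(features, labels, method="svm")
print(clf.predict(features))
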
# In[99]:

# ensemble, metrics, KNeighborsClassifier, the SparkSession `spark`, and the
# time-split train/test arrays are defined in earlier notebook cells.
tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3],
    "learning_rate": [0.1],
}
gbc = ensemble.GradientBoostingClassifier()
clf = GridSearchCV(spark.sparkContext, gbc, tuned_parameters)
clf


# In[100]:

clf.fit(X_timetrain, Y_timetrain_arr)
clftest_pred = clf.predict(X_timetest)
print("Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clftest_pred) * 100, "%")


# In[101]:

knn1 = KNeighborsClassifier()
knn_params = {
    "n_neighbors": [31],
}
clf2 = GridSearchCV(spark.sparkContext, knn1, knn_params, n_jobs=2)
clf2


# In[102]:
import sys
import time

start_time = time.time()

# Make the PySpark libraries importable.
# SPARK_HOME is defined earlier in the original script.
sys.path.extend([
    SPARK_HOME + 'python/lib/pyspark.zip',
    SPARK_HOME + 'python/lib/py4j-0.10.1-src.zip',
])

from pyspark import SparkContext
from pyspark import SparkConf

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local[3]")  # set the specific master address and port
    # conf.setMaster("spark://jdwang-HP:7077")
    conf.setAppName("spark_test")
    # Additional properties can be set here, e.g.:
    # conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)

    # Quick test: distributed grid search on the iris dataset.
    from sklearn import svm, datasets
    from spark_sklearn import GridSearchCV

    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC()
    clf = GridSearchCV(sc, svr, parameters)
    clf.fit(iris.data, iris.target)
    print(clf.best_params_)
    print(clf.predict(iris.data))

    end_time = time.time()
    print('running time is %ds' % (end_time - start_time))
import numpy as np
from time import time
from pyspark import SparkConf, SparkContext
from sklearn import metrics, svm
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from spark_sklearn import GridSearchCV

# The SparkConf is built above this excerpt in the original script; a minimal
# stand-in is used here.
conf = SparkConf().setAppName('digits_grid_search')
sc = SparkContext(conf=conf)

digits = load_digits()
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))  # flatten 8x8 images to 64-dim vectors
X_train, X_test, y_train, y_test = train_test_split(
    data, digits.target, test_size=0.3, random_state=0)

svc = svm.SVC()
hyperparam_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': np.linspace(0.001, 0.01, num=10),
    'C': np.linspace(1, 10, num=10),
    'tol': np.linspace(0.01, 0.1, 10),
}

classifier = GridSearchCV(sc, svc, hyperparam_grid)

start = time()
classifier.fit(X_train, y_train)
elapsed = time() - start
print('elapsed: {} seconds'.format(elapsed))

print('Best Kernel:\t{}'.format(classifier.best_estimator_.kernel))
print('Best Gamma:\t{}'.format(classifier.best_estimator_.gamma))
print('Best C:\t\t{}'.format(classifier.best_estimator_.C))

y_pred = classifier.predict(X_test)
print('Accuracy:\t{:.1%}'.format(metrics.accuracy_score(y_test, y_pred)))
# python 2.7
# import pyspark library
from pyspark import SparkConf, SparkContext
# spark_sklearn provides the same API as sklearn but uses Spark under the hood
# to distribute the actual computations across the cluster (passed in via the
# SparkContext instance).
from spark_sklearn import GridSearchCV
# import ML library
from sklearn import svm, datasets

sc = SparkContext()

iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = GridSearchCV(sc, svr, parameters)
clf.fit(iris.data, iris.target)

print("==================")
print(clf.predict(iris.data))
print("==================")
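
# For comparison, the same search with stock scikit-learn differs only in the
# constructor: drop the SparkContext argument and import GridSearchCV from
# sklearn.model_selection (a sketch, assuming scikit-learn >= 0.18).
from sklearn.model_selection import GridSearchCV as SkGridSearchCV

clf_local = SkGridSearchCV(svm.SVC(), parameters)
clf_local.fit(iris.data, iris.target)
print(clf_local.best_params_)
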
# exec_config (per-executor settings), iter_list, and the train/test splits are
# defined in earlier notebook cells; `j` holds the first config here.
conf = sc._conf.setAll([('spark.executor.memory', j[0]),
                        ('spark.executor.cores', j[1]),
                        ('spark.executor.instances', j[2])])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(sc._conf.getAll())

for i in iter_list:
    print('--------------------Iterations = ', i, '-----------------------')
    param_grid = {
        "solver": ["sgd"],
        "max_iter": [i],
        "hidden_layer_sizes": [(100, 10)],
    }
    gs = GridSearchCV(sc, estimator=MLPClassifier(), param_grid=param_grid)
    print('Time info for iterations = ', i)
    get_ipython().run_line_magic('time', 'gs.fit(train, y_train)')
    preds = gs.predict(test)
    print('Accuracy=', np.sum(y_test == preds) * 100 / len(y_test), '%')

#### CONFIG 2 ########
j = exec_config[1]
print('----------------- Config = ', j, ' -------------------------')
conf = sc._conf.setAll([('spark.executor.memory', j[0]),
                        ('spark.executor.cores', j[1]),
                        ('spark.executor.instances', j[2])])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(sc._conf.getAll())

for i in iter_list:
    print('--------------------Iterations = ', i, '-----------------------')
    param_grid = {
        "solver": ["sgd"],
        "max_iter": [i],
        "hidden_layer_sizes": [(100, 10)],
    }
    gs = GridSearchCV(sc, estimator=MLPClassifier(), param_grid=param_grid)
    print('Time info for iterations = ', i)
    get_ipython().run_line_magic('time', 'gs.fit(train, y_train)')
    preds = gs.predict(test)
    print('Accuracy=', np.sum(y_test == preds) * 100 / len(y_test), '%')