def gridsearch(XX, XXpredict, yy, yypredict, clf):
    # tuned_parameters = settings.param_grid
    param_grid = settings.param_grid
    print("Gridsearch start")

    def report(grid_scores, n_top=3):
        top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
        for i, score in enumerate(top_scores):
            print("Model with rank: {0}".format(i + 1))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                score.mean_validation_score,
                numpy.std(score.cv_validation_scores)))
            print("Parameters: {0}".format(score.parameters))
            print("")

    sc = SparkContext.getOrCreate()  # spark_sklearn's GridSearchCV needs an active SparkContext
    grid_search = GridSearchCV(sc, clf, param_grid=param_grid, cv=10,
                               n_jobs=-1, verbose=1)
    start = time()
    grid_search.fit(XX, yy)
    print("GridSearchCV took {:.2f} seconds for {:d} candidate settings.".format(
        time() - start, len(grid_search.grid_scores_)))
    report(grid_search.grid_scores_)
    return grid_search
def grid_search_svm(X_train, y_train, X_test, ngrams, n_split, svm_choice='linear',
                    tfidf_choice=False, nums_train=None, nums_test=None):
    svm = None
    grid = None
    if svm_choice == 'linear':
        svm = LinearSVC()
        c_array = np.logspace(1., 4., num=4)
        if tfidf_choice:
            grid = {'vect__ngram_range': ngrams,
                    'tfidf__use_idf': (True, False),
                    'clf__C': c_array.tolist()}
        else:
            grid = {'vect__ngram_range': ngrams,
                    'clf__C': c_array.tolist()}
    elif svm_choice == 'svc':
        svm = SVC()
        c_array = np.logspace(-3., 6., num=10)
        g_array = np.logspace(-3., 3., num=7)
        if tfidf_choice:
            grid = {'vect__ngram_range': ngrams,
                    'tfidf__use_idf': (True, False),
                    'clf__kernel': ['rbf'],
                    'clf__C': c_array.tolist(),
                    'clf__gamma': g_array.tolist()}
        else:
            grid = {'vect__ngram_range': ngrams,
                    'clf__kernel': ['rbf'],
                    'clf__C': c_array.tolist(),
                    'clf__gamma': g_array.tolist()}

    if isinstance(nums_train, np.ndarray) and isinstance(nums_test, np.ndarray):
        if tfidf_choice:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('tfidf', TfidfTransformer(smooth_idf=False)),
                                     ('numfeat', NumFeatureAdder(nums_train, nums_test)),
                                     ('clf', svm)])
        else:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('numfeat', NumFeatureAdder(nums_train, nums_test)),
                                     ('clf', svm)])
    else:
        if tfidf_choice:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('tfidf', TfidfTransformer(smooth_idf=False)),
                                     ('clf', svm)])
        else:
            clf_pipeline = Pipeline([('vect', CountVectorizer(ngram_range=ngrams)),
                                     ('clf', svm)])

    print(clf_pipeline.get_params().keys())
    sc = SparkContext.getOrCreate()
    grid_search = GridSearchCV(sc, clf_pipeline, grid, n_jobs=-1, cv=n_split)
    grid_search.fit(X_train, y_train)
    grid_search_predicted = grid_search.predict(X_test)
    return grid_search_predicted
def grid_search(sc, data, label, features):
    """Use grid search to find the optimal hyperparameters."""
    # Candidate hyperparameter values: alpha is the weight of the Lasso penalty term
    parameters = {"alpha": 10**np.linspace(-4, 0, 45)}
    la = Lasso()
    gs = GridSearchCV(sc, la, parameters)
    gs.fit(data[features], data[label])
    return gs
def train(self, X, y, method="rf"):
    # Tree-specific hyperparameter grid; it does not apply to SVC
    param_grid = {
        "max_depth": [6, None],
        "max_features": [5, 10, 20],
    }
    obj = RandomForestClassifier()
    if method == "svm":
        obj = SVC()
        param_grid = {}  # fall back to SVC defaults
    self.model = GridSearchCV(obj, param_grid=param_grid)
    self.model.fit(X, y)
def gridSearch(sc, data, label, features):
    """Use grid search to find the optimal hyperparameters.

    :param sc: SparkContext used by spark_sklearn to distribute the search
    :param data: DataFrame holding the training data
    :param label: name of the target column
    :param features: list of feature column names
    :return: the fitted GridSearchCV object
    """
    parameters = {"alpha": 10**np.linspace(-4, 0, 45)}
    la = Lasso()
    gs = GridSearchCV(sc, la, parameters)
    gs.fit(data[features], data[label])
    return gs
def grid_search(sc, X, Y, K):
    # `pipe` and `param_grid` are expected to be defined at module level
    roc_auc_scorer = make_scorer(roc_auc_score)
    clf = SPGridSearchCV(sc, pipe, cv=K, n_jobs=2,
                         param_grid=param_grid, scoring=roc_auc_scorer)
    clf.fit(X, Y)
    grid_result = clf.grid_scores_
    # Each grid_scores_ entry is (parameters, mean_validation_score, cv_validation_scores)
    param_score = []
    for i in range(len(grid_result)):
        param_score.append((grid_result[i][0], grid_result[i][1]))
    print(param_score)
    return param_score
def with_spark(x_train, x_test, y_train, y_test, vents_and_files, spark_connect_str):
    from pyspark import SparkConf, SparkContext
    from spark_sklearn import GridSearchCV as SparkGridSearchCV

    conf = SparkConf().setMaster(spark_connect_str).setAppName("ecs251")
    conf.set("spark.executor.memory", "1g")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.deploy.mode", "cluster")
    sc = SparkContext(conf=conf)

    # C, GAMMA and CACHE_SIZE are module-level constants
    param_grid = {"C": C, "gamma": GAMMA}
    gs = SparkGridSearchCV(sc, SVC(cache_size=CACHE_SIZE), param_grid=param_grid)
    res = gs.fit(x_train, y_train['y'].values)

    print("Best Score: ", res.best_score_)
    print("Best params: ", res.best_params_)

    predictions = res.predict(x_test)
    print("Accuracy: " + str(accuracy_score(y_test['y'], predictions)))
    print("Precision: " + str(precision_score(y_test['y'], predictions)))
    print("Recall: " + str(recall_score(y_test['y'], predictions)))

    fpr, tpr, thresh = roc_curve(y_test['y'], predictions)
    print("False pos rate: " + str(fpr[1]))
    print("True pos rate: " + str(tpr[1]))
class Classifier:
    def __init__(self):
        self.model = None

    def preprocess(self, data):
        '''Optional preprocessing step.'''
        return CountVectorizer().fit_transform(data)

    def train(self, X, y, method="rf"):
        # Tree-specific hyperparameter grid; it does not apply to SVC
        param_grid = {
            "max_depth": [6, None],
            "max_features": [5, 10, 20],
        }
        obj = RandomForestClassifier()
        if method == "svm":
            obj = SVC()
            param_grid = {}  # fall back to SVC defaults
        self.model = GridSearchCV(obj, param_grid=param_grid)
        self.model.fit(X, y)

    def predict(self, X):
        if self.model is None:
            return None
        return self.model.predict(X)
def main():
    """Main function: trains the spark-sklearn model."""
    absolute_path = "/data/model_data/"
    train_df = np.loadtxt(absolute_path + "train.csv", delimiter=',')
    train_target_df = np.loadtxt(absolute_path + "target_train.csv", delimiter=',')
    test_df = np.loadtxt(absolute_path + "test.csv", delimiter=',')
    test_target_df = np.loadtxt(absolute_path + "target_test.csv", delimiter=',')

    regr = RandomForestRegressor(random_state=0, n_estimators=1000,
                                 min_samples_leaf=1)  # best model so far!

    # pyspark: distribute the cross-validation over the cluster
    regr_rf_cv = GridSearchCV(sc=spark.sparkContext, estimator=regr, n_jobs=20,
                              cv=5, verbose=5, param_grid={})
    regr_rf_cv.fit(train_df, train_target_df)

    y_list, y_hat_list = run_test(test_df, test_target_df, regr_rf_cv)
    print("Mean absolute error: {}".format(get_mean_absolute_error(y_list, y_hat_list)))
    print("Average relative error: {}".format(get_average_relative_error(y_list, y_hat_list)))

    save_model(regr_rf_cv.best_estimator_, "rf_uber_model", "/data/saved_model/")
    load_model("/data/saved_model/rf_uber_model.pkl",
               testExample=(test_df[0], test_target_df[0]))
j = exec_config[0]
print('----------------- Config = ', j, ' -------------------------')
conf = sc._conf.setAll([('spark.executor.memory', j[0]),
                        ('spark.executor.cores', j[1]),
                        ('spark.executor.instances', j[2])])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(sc._conf.getAll())

for i in iter_list:
    print('--------------------Iterations = ', i, '-----------------------')
    param_grid = {
        "solver": ["sgd"],
        "max_iter": [i],
        "hidden_layer_sizes": [(100, 10)],
    }
    gs = GridSearchCV(sc, estimator=MLPClassifier(), param_grid=param_grid)
    print('Time info for iterations = ', i)
    get_ipython().run_line_magic('time', 'gs.fit(train, y_train)')
    preds = gs.predict(test)
    print('Accuracy=', np.sum(y_test == preds) * 100 / len(y_test), '%')

#### CONFIG 2 ########
j = exec_config[1]
print('----------------- Config = ', j, ' -------------------------')
conf = sc._conf.setAll([('spark.executor.memory', j[0]),
                        ('spark.executor.cores', j[1]),
                        ('spark.executor.instances', j[2])])
spark = SparkSession.builder.config(conf=conf).getOrCreate()
print(sc._conf.getAll())
# Create hold-out test dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

param_grid = {
    "max_depth": [3, None],
    "max_features": [1, 3, 10],
    "min_samples_leaf": [1, 3, 10],
    "bootstrap": [True, False],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 20, 40, 80]
}

gs = GridSearchCV(sc=sc, estimator=RandomForestClassifier(), cv=4,
                  param_grid=param_grid, refit=True)
with timeit():
    gs.fit(x_train, y_train)

results = pd.DataFrame(gs.cv_results_)
print(results.sort_values(['mean_test_score'], ascending=False)[0:10])

# Validate accuracy of best model against hold-out data
best_model = gs.best_estimator_
test_accuracy = best_model.score(x_test, y_test)
print(test_accuracy)
logger.log('Best model accuracy', test_accuracy)
from pyspark import SparkContext
from sklearn import svm, datasets
from spark_sklearn import GridSearchCV

# create spark context - use all cores
sc = SparkContext("local[*]", "Simple Test")

# test data
iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [0.01, 0.1, 1, 2, 3, 4, 5, 10]}
svr = svm.SVC()
clf = GridSearchCV(sc, svr, parameters)
clf.fit(iris.data, iris.target)
print(clf.best_estimator_)
print(clf.best_score_)
y_train = targetencoder.transform(gender_age_train['group'])

######################################################
# Training
#######################################################
tuned_parameters = [{'n_estimators': [300, 400],
                     'max_depth': [3, 4],
                     'objective': ['multi:softprob'],
                     'reg_alpha': [1],
                     'reg_lambda': [1],
                     'colsample_bytree': [1],
                     'learning_rate': [0.1],
                     'colsample_bylevel': [0.01, 0.1],
                     'subsample': [0.5, 0.7]}]
clf = XGBClassifier(seed=0)
metric = 'neg_log_loss'

sc = SparkContext.getOrCreate()
clf_cv = GridSearchCV(sc=sc, param_grid=tuned_parameters, estimator=clf,
                      scoring=metric, cv=5, verbose=3)
model = clf_cv.fit(X_train, y_train)

run_logger.log(metric, float(clf_cv.best_score_))
for key in clf_cv.best_params_.keys():
    run_logger.log(key, clf_cv.best_params_[key])

if not path.exists('./outputs'):
    makedirs('./outputs')
outfile = open('./outputs/sweeping_results.txt', 'w')
print("metric = ", metric, file=outfile)
for i in range(len(model.grid_scores_)):
    print(model.grid_scores_[i], file=outfile)
outfile.close()
documentDF = session.createDataFrame([
    ("Hi I heard about Spark", "spark"),
    ("I wish Java could use case classes", "java"),
    ("Logistic regression models are neat", "mlib"),
    ("Logistic regression models are neat", "spark"),
    ("Logistic regression models are neat", "mlib"),
    ("Logistic regression models are neat", "java"),
    ("Logistic regression models are neat", "spark"),
    ("Logistic regression models are neat", "java"),
    ("Logistic regression models are neat", "mlib")
], ["text", "preds"]).select(f.split("text", "\\s+").alias("new_text"), "preds")

word2vec = Word2Vec(vectorSize=100, minCount=1, inputCol="new_text", outputCol="features")
indexer = StringIndexer(inputCol="preds", outputCol="labels")
pipline = Pipeline(stages=[word2vec, indexer])
ds = pipline.fit(documentDF).transform(documentDF)
data = ds.toPandas()

parameters = {'kernel': ('linear', 'rbf')}
svr = svm.SVC()
clf = GridSearchCV(session.sparkContext, svr, parameters)
X = [x.values for x in data.features.values]
y = [int(x) for x in data.labels.values]
model = clf.fit(X, y)

# modelB = session.sparkContext.broadcast(pickle.dumps(model))
# wow = documentDF.rdd.map(lambda row: pickle.loads(modelB.value).transform(row["features"].values)).collect()
# print(wow)
sc = SparkContext(conf=conf)

digits = load_digits()
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
X_train, X_test, y_train, y_test = train_test_split(data, digits.target,
                                                    test_size=0.3, random_state=0)

svc = svm.SVC()
hyperparam_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': np.linspace(0.001, 0.01, num=10),
    'C': np.linspace(1, 10, num=10),
    'tol': np.linspace(0.01, 0.1, 10)
}

classifier = GridSearchCV(sc, svc, hyperparam_grid)
start = time()
classifier.fit(X_train, y_train)
elapsed = time() - start
print('elapsed: {} seconds'.format(elapsed))

print('Best Kernel:\t{}'.format(classifier.best_estimator_.kernel))
print('Best Gamma:\t{}'.format(classifier.best_estimator_.gamma))
print('Best C:\t\t{}'.format(classifier.best_estimator_.C))

y_pred = classifier.predict(X_test)
print('Accuracy:\t{:.1%}'.format(metrics.accuracy_score(y_test, y_pred)))
from dl_steer import dt_handler, coordinator, custom_model, engine_interface, provenance
from keras.wrappers.scikit_learn import KerasClassifier
from spark_sklearn import GridSearchCV
from keras.models import Sequential
from pyspark import SparkContext

data = dt_handler.read_dataset('input_data.csv')
...
model = KerasClassifier(build_fn=custom_model.get_model(), verbose=0)
X, y = data['X'], data['y']

queue = coordinator.get_queue()
# spark_sklearn's GridSearchCV takes a SparkContext as its first argument
sc = SparkContext.getOrCreate()
for hyperparameter_combination in queue:
    provenance.persist(hyperparameter_combination)
    grid = GridSearchCV(sc, estimator=model, param_grid=hyperparameter_combination,
                        n_jobs=-1, scoring="accuracy")
    grid_result = grid.fit(X, y)
    provenance.persist(grid_result)
    # The method below checks whether the user steered the queue; if so, it reloads the queue accordingly.
    queue.checkSteering()
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from sklearn import svm, datasets
from spark_sklearn import GridSearchCV
from pyspark import SparkConf, SparkContext

if __name__ == "__main__":
    sc = SparkContext()
    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC()
    clf = GridSearchCV(sc, svr, parameters)
    clf.fit(iris.data, iris.target)
    print(clf.cv_results_)
SPARK_HOME + 'python/lib/pyspark.zip',
              SPARK_HOME + 'python/lib/py4j-0.10.1-src.zip'])

from pyspark import SparkContext
from pyspark import SparkConf

if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("local[3]")
    # Specify the exact master host and port
    # conf.setMaster("spark://jdwang-HP:7077")
    conf.setAppName("spark_test")
    # Other properties can be set here, e.g.
    # conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)

    # test
    from sklearn import svm, datasets
    from spark_sklearn import GridSearchCV

    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svr = svm.SVC()
    clf = GridSearchCV(sc, svr, parameters)
    clf.fit(iris.data, iris.target)
    print(clf.best_params_)
    print(clf.predict(iris.data))

    end_time = time.time()
    print('running time is %ds' % (end_time - start_time))
def test():
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import GradientBoostingClassifier
    # from sklearn.model_selection import GridSearchCV
    from spark_sklearn import GridSearchCV
    from pyspark import SparkConf, SparkContext, HiveContext
    from spark_sklearn import Converter
    import time

    start = time.time()
    conf = SparkConf().setAppName("spark-sklearn")
    sc = SparkContext(conf=conf)
    spark = HiveContext(sc)

    path = "/home/data/data_cell_lable_0521_rsrp_five3_all.csv"
    df = spark.read.csv(path, header=True, inferSchema=True)
    converter = Converter(sc)
    df_data = converter.toPandas(df)
    # A pandas DataFrame can also be used directly:
    # inputpath1 = '/home/etluser/xiexiaoxuan/data/data_cell_lable_0521_rsrp_five3_all.csv'
    # df_data = pd.read_csv(inputpath1)

    df_data = df_data.dropna(axis=0, how='any')
    x1 = df_data.drop(['label'], axis=1)
    y1 = df_data['label']

    gbm0 = GradientBoostingClassifier(n_estimators=262, max_depth=57, min_samples_split=50,
                                      random_state=10, subsample=0.7, learning_rate=0.01)
    pipeline = Pipeline([("standard", StandardScaler()), ("gbdt", gbm0)])
    params = {
        "gbdt__n_estimators": [i for i in range(10, 20)],
        "gbdt__max_depth": [i for i in range(3, 20)]
    }
    grid_search = GridSearchCV(sc, pipeline, param_grid=params, error_score=0,
                               scoring="accuracy", cv=5, n_jobs=10,
                               pre_dispatch="2*n_jobs", return_train_score=False)
    grid_search.fit(x1, y1)

    end = time.time()
    print("Total elapsed time: %.2f s" % (end - start))
    print(grid_search.best_estimator_)
    index = grid_search.best_index_
    res = grid_search.cv_results_
    best_score = res["mean_test_score"][index]
    print("===============: " + str(best_score))
print(y_train.shape)

# COMMAND ----------

# MAGIC %md Create SVC Model

# COMMAND ----------

from sklearn import svm, datasets
from spark_sklearn import GridSearchCV

parameters = {
    'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
    'C': [1, 20]
}
svr = svm.SVC()
clf = GridSearchCV(sc, svr, param_grid=parameters, scoring='accuracy')
clf.fit(x_train, y_train)
print(clf.best_params_)
bestsvc = clf.best_estimator_
print(clf.best_score_)

# COMMAND ----------

# MAGIC %md Create Random Forest Model

# COMMAND ----------

en_rf = RandomForestClassifier(n_estimators=64, max_depth=32, min_samples_split=128,
                               random_state=0)
# MAGIC %md Use GridSearchCV to get the best hyperparameters for the Lasso Regression model. The hyperparameters are:
# MAGIC - `normalize`. True or False. The regressors X will be normalized before regression by subtracting the mean and dividing by the l2-norm or by their standard deviations.
# MAGIC - `alpha`. It represents the regularization strength; regularization improves the conditioning of the problem and reduces the variance of the estimates. Here we chose the range (0.001, 1000).
# MAGIC
# MAGIC Cross validation is defined as 5 time-series splits, which means the model is trained on a combination of 4 subsets created from the training dataset and validated on the remaining subset. The scoring method is R squared, a statistical measure of how close the data are to the fitted regression line. Then fit the GridSearchCV with the feature and target datasets.

# COMMAND ----------

from spark_sklearn import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_absolute_error, r2_score

lasso_run = \
    GridSearchCV(sc,
                 estimator=get_lasso_pipeline(),
                 param_grid={'lso__normalize': [True, False],
                             'lso__alpha': [10.0**n for n in range(-3, 4)]},
                 cv=TimeSeriesSplit(n_splits=5),
                 scoring=make_scorer(r2_score),
                 return_train_score=False,
                 n_jobs=-1)
lasso_run.fit(trn_coal_cnt_fea_pdf, trn_coal_cnt_tgt_ser)
display_pdf(est_grid_results_pdf(lasso_run, est_tag='lasso'))

# COMMAND ----------

lasso_run.fit(trn_ore_tfidf_fea_pdf, trn_ore_tfidf_tgt_ser)
display_pdf(est_grid_results_pdf(lasso_run,
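# `get_lasso_pipeline()` is referenced in the snippet above but not defined there. The
# `lso__normalize` / `lso__alpha` keys in param_grid imply a scikit-learn Pipeline whose
# Lasso step is named 'lso'; the sketch below is only an assumption about its shape, not
# the original implementation.
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso

def get_lasso_pipeline():
    # The step name 'lso' is what the 'lso__<param>' grid keys refer to
    return Pipeline([('lso', Lasso())])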
def generate_model_package(training_data_path, id_cols, target_cols, fields_config_file,
                           param_grid, model_name, target_var):
    """
    training_data_path, id_cols, target_cols, fields_config_file, param_grid,
    model_name, target_var
    """
    pyspark_app_nm = "train_" + model_name + "_" + secrets.token_hex(nbytes=4)
    logging.info("Starting process: " + pyspark_app_nm)

    # create spark object and spark context for parallel learning
    logging.info("Instantiating pyspark.")
    app_pyspark_conf = SparkConf()
    app_pyspark_conf.setAppName(pyspark_app_nm)
    # app_pyspark_conf.set('spark.executor.memory', spark_executor_memory)
    # app_pyspark_conf.set('spark.executor.cores', spark_executor_cores)
    spark = SparkSession.builder.config(conf=app_pyspark_conf).getOrCreate()
    sc = spark.sparkContext

    # load data
    logging.info("Beginning data load.")
    training_df = pd.read_parquet(training_data_path, engine='pyarrow')
    # sampling down
    # training_df_1 = training_df[training_df[target_var]==1].sample(20)
    # training_df_0 = training_df[training_df[target_var]==0].sample(40)
    # training_df = pd.concat([training_df_0, training_df_1])

    # column handling
    logging.info("Creating column lists")
    all_cols = training_df.columns.tolist()
    x_cols = list(set(all_cols) - set(target_cols + id_cols))

    # dataframe setup
    X = training_df[x_cols]
    y = training_df[target_cols]

    # create holdout data
    logging.info("Creating holdout data")
    x_train, x_test, y_train, y_test = train_test_split(X, y[target_var], test_size=0.1,
                                                        stratify=y[target_var])
    wts = y_test.value_counts()
    wtrat = (wts[0] / wts[1])

    # instantiate model
    gbm = lgb.LGBMClassifier()
    fit_params = {
        "eval_set": [(x_test, y_test)],
        "eval_metric": ear_stop_eval_mtr,
        "early_stopping_rounds": ear_stop_rnds
        # ,"scale_pos_weight": wtrat
    }
    grid_search = SparkGridSearchCV(sc, estimator=gbm, param_grid=param_grid,
                                    fit_params=fit_params)
    grid_search.fit(x_train, y_train)
    best_model = grid_search.best_estimator_
    optimized_parameters = best_model.get_params()

    # create confusion dataframe
    y_true = pd.DataFrame(y_test)
    y_true = y_true.reset_index()
    y_true.columns.values[0] = "CUSTOMER_KEY"
    y_true.columns.values[1] = "Y_TRUE"
    y_pred = pd.DataFrame(best_model.predict(x_test), columns=["Y_PRED"])
    confusion_data = pd.merge(left=y_true, right=y_pred, left_index=True, right_index=True)

    # summary statistics and metrics
    fr_col_nam_map = {0: "feature_nm", 1: "feature_importance"}
    feature_ranking = pd.DataFrame([X.columns, best_model.feature_importances_]).T
    feature_ranking = feature_ranking.rename(columns=fr_col_nam_map)
    feature_ranking = feature_ranking.sort_values("feature_nm", ascending=False)

    metrics = {
        "precision_score": precision_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_auc_score": roc_auc_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "classification_report": classification_report(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "confusion_matrix": confusion_matrix(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "accuracy_score": accuracy_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "precision_recall_curve": precision_recall_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "recall_score": recall_score(confusion_data['Y_TRUE'], confusion_data['Y_PRED']),
        "roc_curve": roc_curve(confusion_data['Y_TRUE'], confusion_data['Y_PRED'])
    }

    output = {
        "model_name": model_name,                      # string with model name
        "model_class": best_model,                     # grid_search.best_estimator_
        "optimized_parameters": optimized_parameters,  # best_model.get_params()
        "feature_ranking": feature_ranking,            # best_model.feature_importances_
        "metrics": metrics,
        "confusion_data": confusion_data
    }
    return output
import pandas as pd
from pyspark import SparkContext
from sklearn.ensemble import RandomForestClassifier
from spark_sklearn import GridSearchCV

df = pd.read_csv('../data/master_RushPassOnly.csv')
y = df.pop('IsPass').values
X = df.values

param_grid = {
    "max_depth": [3, 5, 10, None],
    "max_features": [None, 'auto', 'log2'],
    "n_estimators": [100],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(verbose=2, n_jobs=-1)
# spark_sklearn's GridSearchCV takes a SparkContext as its first argument
sc = SparkContext.getOrCreate()
gs = GridSearchCV(sc, rf, param_grid=param_grid, n_jobs=-1, verbose=2,
                  scoring='neg_mean_squared_error')
gs.fit(X, y)

best_parameters = gs.best_params_
print(best_parameters)
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from spark_sklearn import GridSearchCV
import pyspark

if __name__ == '__main__':
    sc = pyspark.SparkContext('local[*]')
    boston = load_boston()

    RAMDON_FOREST_PARAMS = {
        "n_estimators": [100],
        "max_features": [1, "auto", "sqrt", None],
        "max_depth": [1, 5, 10, None],
        "min_samples_leaf": [1, 2, 4, 50]
    }
    rf = RandomForestRegressor(random_state=0, n_jobs=-1)
    clf = GridSearchCV(sc, rf, RAMDON_FOREST_PARAMS)
    clf.fit(boston.data, boston.target)
    print("parameters for random forest: {0}".format(clf.best_params_), sep="\n")
# Utility function to report best scores
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

# create spark context - use all cores
sc = SparkContext("local[*]", "Simple Test")

digits = datasets.load_digits()
X, y = digits.data, digits.target

param_grid = {"max_depth": [3, 5, 8, 10, 13, None],
              "max_features": [1, 3, 10, 20],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "criterion": ["gini", "entropy"],
              "n_estimators": [500]}
clf = RandomForestClassifier()

# gs = grid_search.GridSearchCV(clf, param_grid=param_grid)  # local
gs = GridSearchCV(sc, clf, param_grid=param_grid)  # distributed

start = time()
gs.fit(X, y)
print("GridSearchCV took {:.2f} seconds for {:d} candidate settings.".format(
    time() - start, len(gs.grid_scores_)))
report(gs.grid_scores_)
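# `grid_scores_` was removed in scikit-learn 0.20+ in favor of `cv_results_`, which recent
# spark_sklearn releases expose as well (other snippets in this collection already read
# `clf.cv_results_`). If the report above fails with an AttributeError, an equivalent
# summary can be built from `cv_results_`; a sketch, assuming the fitted search object is
# `gs` as above:
import numpy as np

def report_cv_results(cv_results, n_top=3):
    # rank_test_score is 1-based; walk the n_top best parameter settings
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(cv_results["rank_test_score"] == rank):
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                cv_results["mean_test_score"][idx],
                cv_results["std_test_score"][idx]))
            print("Parameters: {0}".format(cv_results["params"][idx]))
            print("")

# report_cv_results(gs.cv_results_)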
def call_GridSearchCV(model, param_grid):
    # `sc` is expected to be an existing SparkContext in the enclosing scope
    return GridSearchCV(sc, model, param_grid=param_grid)
X_minus_trea = X[np.where(y != 'TREA')]
y_minus_trea = y[np.where(y != 'TREA')]
X_final = X_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]
y_final = y_minus_trea[np.where(y_minus_trea != 'PORNOGRAPHY/OBSCENE MAT')]

# Separate training, dev, and test data:
test_data, test_labels = X_final[800000:], y_final[800000:]
dev_data, dev_labels = X_final[700000:800000], y_final[700000:800000]
train_data, train_labels = X_final[100000:700000], y_final[100000:700000]
calibrate_data, calibrate_labels = X_final[:100000], y_final[:100000]

# Create mini versions of the above sets
mini_train_data, mini_train_labels = X_final[:20000], y_final[:20000]
mini_calibrate_data, mini_calibrate_labels = X_final[19000:28000], y_final[19000:28000]
mini_dev_data, mini_dev_labels = X_final[49000:60000], y_final[49000:60000]

param_grid = {
    'C': [.001, .01, .01] + [i for i in range(1, 100, 5)],
    "penalty": ['l1', 'l2']
}
clf = LogisticRegression()
gs = GridSearchCV(sc, clf, param_grid)
start = time()
gs.fit(mini_train_data, mini_train_labels)
print("GridSearchCV took {:.2f} seconds for {:d} candidate settings.".format(
    time() - start, len(gs.grid_scores_)))
report(gs.grid_scores_)
xgb_params = {
    'eta': 0.05,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'silent': 1
}

import xgboost as xgb

dtrain = xgb.DMatrix(train_X, train_y, feature_names=train_X.columns.values)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=100,
                  feval=xgb_r2_score, maximize=True)

# Gradient Boosting Regressor
gbr = ensemble.GradientBoostingRegressor()
clf = GridSearchCV(gbr, cv=3, param_grid=tuned_parameters,
                   scoring='median_absolute_error')
preds = clf.fit(X_train, y_train)
best = clf.best_estimator_

# plot error for each round of boosting
# Note: best_estimator_, staged_predict
test_score = np.zeros(n_est, dtype=np.float64)
train_score = best.train_score_
for i, y_pred in enumerate(best.staged_predict(X_test)):
    test_score[i] = best.loss_(y_test, y_pred)

### Grid search
from pyspark import SparkContext, SparkConf
from spark_sklearn import GridSearchCV
max_depth=None).fit(train_data, train_labels)
RF_calibrated_and_tuned_pre_fit = CalibratedClassifierCV(RF_tuned, method='isotonic', cv='prefit')
RF_calibrated_and_tuned = RF_calibrated_and_tuned_pre_fit.fit(calibration_data, Calibration_labels)

param_grid = {
    "base_estimator": [RF_calibrated_and_tuned],
    "n_estimators": [i for i in range(1, 1001, 1)],
    "max_samples": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    "max_features": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
                     11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    "bootstrap": [True, False],
    "bootstrap_features": [True, False],
    "oob_score": [True, False]
}
clf = BaggingClassifier()

# bagging_fitted = clf.fit(train_data, train_labels)
# bagging_prediction_probabilities = bagging_fitted.predict_proba(dev_data)
# log_loss_for_RF_tuned_calibrated_bagged = log_loss(y_true=dev_labels, y_pred=bagging_prediction_probabilities, labels=crime_labels)
# print("Multi-class Log Loss with RF tuned calibrated and bagged is:", log_loss_for_RF_tuned_calibrated_bagged)

gs = GridSearchCV(sc, clf, param_grid)  # add "n_jobs?"
start = time()
gs.fit(mini_train_data, mini_train_labels)
print("GridSearchCV took {:.2f} seconds for {:d} candidate settings.".format(
    time() - start, len(gs.grid_scores_)))
report(gs.grid_scores_)
# python 2.7
# import pyspark library
from pyspark import SparkConf, SparkContext
# spark_sklearn provides the same API as sklearn but uses Spark MLLib
# under the hood to perform the actual computations in a distributed way
# (passed in via the SparkContext instance).
from spark_sklearn import GridSearchCV
# import ML library
from sklearn import svm, datasets

sc = SparkContext()
iris = datasets.load_iris()
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = GridSearchCV(sc, svr, parameters)
clf.fit(iris.data, iris.target)
print("==================")
print(clf.predict(iris.data))
print("==================")
Y_timetrain_arr = np.ravel(Y_timetrain)
X_timetest = X.loc[X.index >= 398]
Y_timetest = y.loc[y.index >= 398]
Y_timetest_arr = np.ravel(Y_timetest)
X_timetest

# In[99]:

tuned_parameters = {
    "n_estimators": [100],
    "max_depth": [3],
    "learning_rate": [0.1],
}
gbc = ensemble.GradientBoostingClassifier()
clf = GridSearchCV(spark.sparkContext, gbc, tuned_parameters)
clf

# In[100]:

clf.fit(X_timetrain, Y_timetrain_arr)
clftest_pred = clf.predict(X_timetest)
print "Accuracy is ", metrics.accuracy_score(Y_timetest_arr, clftest_pred) * 100, "%"

# In[101]:

knn1 = KNeighborsClassifier()
knn_params = {
    "n_neighbors": [31]
target_names=le.classes_)

tuned_parameters = {
    "max_depth": [3, None],
    "max_features": [1, 'auto'],
    "min_samples_split": [1, 20],
    "n_estimators": [10, 300, 500]
}
rf = RandomForestClassifier(random_state=rs)

# spark-sklearn
conf = SparkConf()
sc = SparkContext(conf=conf)
clf = GridSearchCV(sc, rf, cv=3, param_grid=tuned_parameters, scoring='accuracy')

# scikit-learn
# clf = GridSearchCV(rf, cv=2, scoring='accuracy',
#                    param_grid=tuned_parameters,
#                    verbose=True)

preds = clf.fit(X_train, y_train)
best = clf.best_estimator_
this_score = metrics.accuracy_score(y_test, best.predict(X_test))

scorestr = "RF / GridSearchCV: Accuracy Score %0.2f" % this_score
print
print scorestr
print "-" * len(scorestr)