def gscv_para(self, C_list, gamma_list, x_train, y_train):
    '''Tune parameters with grid search and cross-validation (k-fold, k=10),
    accounting for class imbalance via class_weight='balanced'.

    Args:
        C_list: candidate values for the C parameter
        gamma_list: candidate values for the gamma parameter
        x_train: training-set features
        y_train: training-set labels

    Returns:
        The best C, the best gamma, and the best (mean) cross-validation
        score. The optimization criterion is the SVC score: the higher the
        score, the better the parameters.
    '''
    clf = SVC(class_weight='balanced', cache_size=4000)
    gscv = GSCV(clf,
                param_grid={'C': C_list, 'gamma': gamma_list},
                n_jobs=-1, cv=10, pre_dispatch=4)
    gscv.fit(x_train, y_train)
    # Return the named parameters explicitly rather than best_params_.values(),
    # whose ordering is not guaranteed.
    return gscv.best_params_['C'], gscv.best_params_['gamma'], gscv.best_score_
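# Usage sketch for gscv_para (an assumption, not from the original source):
# the same class-weighted SVC setup run standalone on synthetic imbalanced
# data with a small illustrative grid.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV as GSCV
from sklearn.svm import SVC

X_demo, y_demo = make_classification(n_samples=300, weights=[0.9, 0.1],
                                     random_state=0)
demo_search = GSCV(SVC(class_weight='balanced', cache_size=4000),
                   param_grid={'C': [0.1, 1, 10], 'gamma': [0.01, 0.1, 1]},
                   n_jobs=-1, cv=10)
demo_search.fit(X_demo, y_demo)
print(demo_search.best_params_, demo_search.best_score_)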
from sklearn import svm
from sklearn.model_selection import GridSearchCV as GSCV
from sklearn.model_selection import StratifiedKFold as SKF


def SVM_gridsearch(parameters, data_train, labels_train, number_splits, num_threads):
    # num_threads: e.g. multiprocessing.cpu_count()
    svm_clf = svm.SVC(gamma="scale", probability=True)
    clf = GSCV(svm_clf,
               parameters,
               cv=SKF(n_splits=number_splits),
               verbose=2,
               n_jobs=num_threads)
    clf.fit(data_train, labels_train)
    return clf
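# Usage sketch (assumed data and parameter grid; not from the original source):
from sklearn.datasets import load_iris

X_iris, y_iris = load_iris(return_X_y=True)
svm_params = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
fitted = SVM_gridsearch(svm_params, X_iris, y_iris,
                        number_splits=5, num_threads=-1)
print(fitted.best_params_)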
def cvgridsearch(self, skip_train=5):
    self.GS = GSCV(estimator=self.estimator,
                   param_grid=self.param_dict,
                   cv=self.cv,
                   n_jobs=self.n_jobs,
                   return_train_score=True)
    # Subsample the training data: every skip_train-th row (the 0:-1 slice
    # excludes the final row)
    X_train = self.X_train[0:-1:skip_train, :]
    y_train = self.y_train[0:-1:skip_train]
    self.GS.fit(X_train, y_train)
    # self.plot_results_cvgridsearch()
    self.table_results_cvgridsearch()
def GridSearchPara():
    N = 100
    impurity = np.linspace(0, 0.2, N)
    hyperpara = {
        'criterion': ['gini', 'entropy'],
        'min_impurity_decrease': impurity,
        # max_depth must be an integer, so cast the linspace values
        'max_depth': np.linspace(1, 200, N).astype(int)
    }
    model = GSCV(DTC(), hyperpara, cv=5)
    model.fit(X, Y)
    print(model.best_params_, model.best_score_)
    return
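# The globals X, Y used above are defined elsewhere in the original source;
# a hypothetical stand-in (an assumption) so the function runs end to end:
from sklearn.datasets import load_iris
X, Y = load_iris(return_X_y=True)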
def fitlearner(X, Y, acts=activities, classifier="RFC", name="_"):
    # Default classifier: random forest with a small hyperparameter grid
    clf = GSCV(
        RFC(),
        {
            "n_estimators": np.arange(1, 5, 1) * 10,
            "max_features": ["auto", "sqrt", "log2", None],
        },
    )
    print("Training the classifier...")
    clf.fit(X, Y)
    testX, testY = gettrainData(["108"], acts, 1)
    predictions = clf.predict(testX)
    success = (predictions == testY).sum() * 1.0 / len(predictions)
    print("Success Rate", success)
    _ = joblib.dump(clf, "Classifier_" + name)
    return clf
def GridSearchSinglePara():
    N = 50  # number of grid points
    max_depth = np.linspace(1, 200, N).astype(int)  # max_depth must be an integer
    hyperpara = {'max_depth': max_depth}  # parameter dictionary
    # GridSearchCV object (return_train_score=True is required for the
    # mean/std train scores read below)
    model = GSCV(DTC(), hyperpara, cv=5, return_train_score=True)
    model.fit(X, Y)
    print(model.best_params_, model.best_score_)
    # Plotting
    R = model.cv_results_
    mtrains = R['mean_train_score']  # mean train scores
    strains = R['std_train_score']   # train score standard deviations
    mtests = R['mean_test_score']    # mean validation scores
    stests = R['std_test_score']     # validation score standard deviations
    fig = plt.figure(figsize=(10, 10))
    # Shaded bands: one standard deviation around each mean
    plt.fill_between(max_depth, mtrains - strains, mtrains + strains,
                     color='lightgray', alpha=0.3)
    plt.fill_between(max_depth, mtests - stests, mtests + stests,
                     color='lightgray', alpha=0.3)
    # Score curves
    plt.plot(max_depth, mtrains, color='r', label='train mean scores')
    plt.plot(max_depth, mtests, color='g', label='test mean scores')
    # Basic figure settings
    plt.grid()
    plt.legend()
    plt.title('max_depth gridsearch')
    plt.xlabel('max_depth')
    plt.ylabel('score')
    plt.show()
def run_experiment(arguments):
    # Load data set
    X, Y, log_tf = load_data_set(arguments['dataset'], path_to_source)
    estim = load_estimator(arguments['estimator_kwargs'], path_to_source)
    # Prepare for experiments
    n_test_sets = arguments['n_test_sets']
    test_size = arguments['test_size']
    param_grid = arguments['param_grid']  # parameter grid for the estimator to CV over
    cv_folds = arguments['cv_folds']
    n_jobs = arguments['n_jobs']
    kf = ShuffleSplit(n_splits=n_test_sets, test_size=test_size)
    test_error = np.zeros(n_test_sets)
    best_parameters = {}
    test_iter = 0
    computational_time = np.zeros(n_test_sets)
    # Extra array to store dot products if estimator is nsim
    almost_linearity_param = np.zeros(n_test_sets)
    for idx_train, idx_test in kf.split(X):
        start = time.time()
        # Note: iid=False was deprecated in scikit-learn 0.22 and removed in
        # 0.24; drop the argument on newer versions
        reg = GSCV(estimator=estim,
                   param_grid=param_grid,
                   scoring='neg_mean_squared_error',
                   iid=False,
                   cv=cv_folds,
                   verbose=0,
                   pre_dispatch=n_jobs,
                   error_score=np.nan,  # if estimator fitting raises an exception
                   refit=True)
        X_train, Y_train = X[idx_train, :], Y[idx_train]
        X_test, Y_test = X[idx_test, :], Y[idx_test]
        reg = reg.fit(X_train, Y_train)
        Y_predict = reg.best_estimator_.predict(X_test)
        end = time.time()
        best_parameters[test_iter] = reg.best_params_
        if arguments['estimator_kwargs']['estimator'] in ['isotron', 'slisotron']:
            best_parameters[test_iter] = reg.best_estimator_.n_iter_cv()
        if log_tf:
            test_error[test_iter] = np.sqrt(
                mean_squared_error(np.exp(Y_test), np.exp(Y_predict)))
        else:
            test_error[test_iter] = np.sqrt(
                mean_squared_error(Y_test, Y_predict))
        computational_time[test_iter] = end - start
        if arguments['estimator_kwargs']['estimator'] == 'nsim':
            almost_linearity_param[test_iter] = \
                reg.best_estimator_.measure_almost_linearity()
        test_iter += 1
    print(best_parameters)
    print(test_error)
    # Save results
    mean_error = np.mean(test_error)
    std_error = np.std(test_error)
    mean_computational_time = np.mean(computational_time)
    mean_almost_linearity_param = np.mean(almost_linearity_param)
    filename = arguments['filename']
    filename_mod = filename
    save_itr = 0
    # Find an unused results directory name, then create it
    while os.path.exists('../results/' + filename_mod + '/'):
        save_itr += 1
        filename_mod = filename + '_' + str(save_itr)
    else:
        os.makedirs('../results/' + filename_mod + '/')
    np.save('../results/' + filename_mod + '/test_errors.npy', test_error)
    np.savetxt('../results/' + filename_mod + '/test_errors.txt', test_error)
    np.savetxt('../results/' + filename_mod + '/computational_time.txt',
               computational_time)
    np.savetxt('../results/' + filename_mod + '/computational_time_summary.txt',
               [mean_computational_time])
    np.savetxt('../results/' + filename_mod + '/test_errors_summary.txt',
               np.array([mean_error, std_error]))
    np.save('../results/' + filename_mod + '/best_params.npy', best_parameters)
    if arguments['estimator_kwargs']['estimator'] == 'nsim':
        np.savetxt('../results/' + filename_mod + '/almost_linearity_param.txt',
                   almost_linearity_param)
        np.savetxt('../results/' + filename_mod + '/almost_linearity_summary.txt',
                   [mean_almost_linearity_param])
    with open('../results/' + filename_mod + '/best_params_json.txt', 'w') as file:
        file.write(json.dumps(best_parameters, indent=4))
    with open('../results/' + filename_mod + '/log.txt', 'w') as file:
        file.write(json.dumps(arguments, indent=4))
random_state=47)
mlpc_norm_pipe = mp(MinMaxScaler(), MLPC(random_state=47))
mlp_param_grid1 = {
    'mlpclassifier__hidden_layer_sizes': [10, 100, (10, 10), (100, 100)],
    'mlpclassifier__activation': ['identity', 'logistic', 'tanh', 'relu'],
    'mlpclassifier__solver': ['lbfgs', 'sgd', 'adam']
}
mlp_param_grid2 = {
    'hidden_layer_sizes': [10, 100, (10, 10), (100, 100)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam']
}
# Grid search over the scaled pipeline (pipeline-prefixed parameter names)
mlp_norm_grid = GSCV(mlpc_norm_pipe, mlp_param_grid1, scoring='f1', cv=5)
mlp_norm_grid.fit(X_train, y_train)
print("Test set score: {:.2f}".format(mlp_norm_grid.score(X_test, y_test)))
print("Best parameters: {}".format(mlp_norm_grid.best_params_))
# Grid search over a bare MLPClassifier (unprefixed parameter names)
mlp_norm_grid = GSCV(MLPC(), mlp_param_grid2, scoring='f1', cv=5)
mlp_norm_grid.fit(X_train, y_train)
print("Test set score: {:.2f}".format(mlp_norm_grid.score(X_test, y_test)))
print("Best parameters: {}".format(mlp_norm_grid.best_params_))
mlpc_results = pd.DataFrame(mlp_norm_grid.cv_results_)
display(mlpc_results.head())  # head is a method, so call it
y_pred = mlp_norm_grid.predict(X_test)
metrics.roc_auc_score(y_test, y_pred)
metrics.accuracy_score(y_test, y_pred)
# In[ ]:

from sklearn.preprocessing import StandardScaler as SS
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.pipeline import Pipeline

SS = SS()
clf = MLP()
#print(clf.get_params().keys())
pipe = Pipeline(steps=[('scaler', SS), ('MLP', clf)])
params = {
    'MLP__hidden_layer_sizes': list(range(1000, 30000, 1000)),
    'MLP__activation': ['logistic', 'tanh', 'relu']
}
grid_search = GSCV(pipe, params, cv=8, scoring='accuracy')
grid_search.fit(datax, datay)
best_act = grid_search.best_params_.get('MLP__activation')
# The parameter key is 'MLP__hidden_layer_sizes' (plural)
best_hl = grid_search.best_params_.get('MLP__hidden_layer_sizes')
print('Best Parameters:', grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)
nested_score = cross_val_score(grid_search, datax, datay, cv=8)
print('Nested Score:', nested_score.mean())

# In[ ]:

import pickle
final_model = grid_search
filename = 'finalized_ANN_Alzh.sav'
# Standardize the data based on the training set
scale = StandardScaler().fit(data_tr)
data_tr = scale.transform(data_tr)
data_te = scale.transform(data_te)

# KNN
# PCA retaining 80% of the variance
pca = PCA(0.8)
pca.fit(data_tr)
data_tr_pca = pca.transform(data_tr)
data_te_pca = pca.transform(data_te)
# Estimate the parameter via 3-fold grid search (cv=3 is made explicit here;
# the GridSearchCV default changed from 3 to 5 folds in scikit-learn 0.22)
grid = GSCV(KNN(), p_knn, cv=3)
grid.fit(data_tr_pca, classes_tr)
# Accuracy
knn = KNN(n_neighbors=grid.best_params_['n_neighbors'])
knn.fit(data_tr_pca, classes_tr)
acc = knn.score(data_te_pca, classes_te)
acc_mean[0] += acc / 5

# SVM
# Estimate the parameter via 3-fold grid search
grid = GSCV(SVC(), p_svm, cv=3)
grid.fit(data_tr, classes_tr)
# Accuracy
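# The parameter dictionaries p_knn and p_svm are defined elsewhere in the
# original source; a plausible shape (an assumption, for illustration only):
p_knn = {'n_neighbors': [1, 3, 5, 7, 9]}
p_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}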
# Applying k-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10)
accuracies.mean()
accuracies.std()

# Applying Grid Search to find the best model and the best parameters
from sklearn.model_selection import GridSearchCV as GSCV
# param_grid is a list of dictionaries; populate it with the candidate
# values of the parameters to optimize (here, C of the SVC)
parameters = [{'C': [1, 0.9, 0.8, 0.7]}]
# Grid Search tries every combination in the grid and returns the best one;
# cv=10 evaluates each combination with 10-fold cross-validation
grid_search = GSCV(estimator=classifier,
                   param_grid=parameters,
                   scoring='accuracy',
                   cv=10)
grid_search = grid_search.fit(X_train, y_train)
# Accuracy obtained through 10-fold cross-validation
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
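# A fuller parameter list of the same shape (an illustration, not from the
# original source): one dict per kernel, so gamma is only searched for 'rbf'.
parameters_full = [
    {'C': [1, 10, 100], 'kernel': ['linear']},
    {'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.5]},
]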
        ]
        return self

    def predict_proba(self, x):
        # Predict new data; returns an array of per-class probabilities
        # of shape [n_samples, n_classes]
        logprobs = np.vstack(
            [model.score_samples(x) for model in self.models_]).T
        result = np.exp(logprobs + self.logpriors_)
        return result / result.sum(axis=1, keepdims=True)

    def predict(self, x):
        # Probabilistic classifier: pick the most probable class
        return self.classes_[np.argmax(self.predict_proba(x), axis=1)]


# Use the custom estimator class
bandwidths = 10**np.linspace(0, 2, 100)
grid = GSCV(KDEClassifier(), {
    'bandwidth': bandwidths
}).fit(dig.data, dig.target)
# grid_scores_ and mean_validation_score were removed from scikit-learn;
# read the mean validation scores from cv_results_ instead
scores = grid.cv_results_['mean_test_score']

# Cross-validation score curve
plt.figure(figsize=(12, 8))
plt.semilogx(bandwidths, scores)
plt.xlabel('bandwidth')
plt.ylabel('accuracy')
plt.title('KDE Model Performance')
print(grid.best_params_, grid.best_score_)
print(cross_val_score(GNB(), dig.data, dig.target).mean())
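# For context, a sketch of the fit() that the truncated class above presumes
# (an assumption following the standard generative KDE-classifier pattern;
# not the original source): one KernelDensity model per class plus log priors.
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KernelDensity

class KDEClassifierSketch(BaseEstimator, ClassifierMixin):
    def __init__(self, bandwidth=1.0):
        self.bandwidth = bandwidth

    def fit(self, X, y):
        self.classes_ = np.sort(np.unique(y))
        training_sets = [X[y == yi] for yi in self.classes_]
        # One kernel-density model per class ...
        self.models_ = [KernelDensity(bandwidth=self.bandwidth).fit(Xi)
                        for Xi in training_sets]
        # ... and the log prior probability of each class
        self.logpriors_ = [np.log(Xi.shape[0] / X.shape[0])
                           for Xi in training_sets]
        return self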
mlpc_norm_pipe = mp(MinMaxScaler(), MLPC(random_state=47))
mlpc_stand_pipe = mp(StandardScaler(), MLPC(random_state=47))
mlpc_pca_pipe = mp(PCA(), MLPC())

"""
#####kNN grid#####
"""
kNN_param_grid = {
    'kneighborsclassifier__n_neighbors': [1, 2, 3, 4, 5],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__p': [1, 2, 3]
}

"""
Test set score: 0.12
Best parameters: {'kneighborsclassifier__n_neighbors': 1,
'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'uniform'}
"""
kNN_norm_grid = GSCV(knn_norm_pipe, kNN_param_grid, scoring='f1', cv=5)
kNN_norm_grid.fit(rnafolding_X_train, rnafolding_y_train)
print("Test set score: {:.2f}".format(
    kNN_norm_grid.score(rnafolding_X_test, rnafolding_y_test)))
print("Best parameters: {}".format(kNN_norm_grid.best_params_))
kNN_norm_results = pd.DataFrame(kNN_norm_grid.cv_results_)
display(kNN_norm_results.head())  # head is a method, so call it

"""
Test set score: 0.11
Best parameters: {'kneighborsclassifier__n_neighbors': 1,
'kneighborsclassifier__p': 1, 'kneighborsclassifier__weights': 'uniform'}
"""
kNN_stand_grid = GSCV(knn_stand_pipe, kNN_param_grid, scoring='f1', cv=5)
kNN_stand_grid.fit(rnafolding_X_train, rnafolding_y_train)
print("Test set score: {:.2f}".format(
    kNN_stand_grid.score(rnafolding_X_test, rnafolding_y_test)))
print("Best parameters: {}".format(kNN_stand_grid.best_params_))
def train_model(self, x_data=[], y_data=[], con_cols=[], cat_cols=[],
                model=[], imputer=[], cvsplit=4, rstate=101, misper=[]):
    import sys
    import warnings
    warnings.simplefilter('ignore', DeprecationWarning)
    import numpy as np
    import xgboost as xgb
    from sklearn.model_selection import GridSearchCV as GSCV
    from sklearn.model_selection import KFold, StratifiedKFold
    # ----model imports ----------------
    from sklearn.ensemble import AdaBoostClassifier as ABC
    from sklearn.linear_model import LogisticRegression as LR
    from sklearn.ensemble import RandomForestClassifier as RFC

    #-----------------Selecting the imputer------------------------------------------
    Imputed_Data = self.Impute_the_data(imputer=imputer,
                                        x_data=x_data,
                                        y_data=y_data,
                                        con_cols=con_cols,
                                        cat_cols=cat_cols,
                                        misper=misper)
    train_y = Imputed_Data['train_y']
    train_x = Imputed_Data['train_x']
    X_resampled = Imputed_Data['X_resampled']
    y_resampled = Imputed_Data['y_resampled']

    # Pass the splitter itself to GridSearchCV rather than get_n_splits(),
    # which only returns the number of splits; KFold requires shuffle=True
    # whenever random_state is set
    cv_splitter = KFold(n_splits=cvsplit, shuffle=True, random_state=rstate)

    #-----------------Selecting the model to run-------------------------------------
    if model == "xgboost":
        paramGrid = {
            'max_depth': [5, 10],
            'min_child_weight': np.arange(1, 9, 1),
            'gamma': np.arange(0, 1, 0.001),
            'subsample': np.arange(0.1, 0.9, 0.05),
            'colsample_bytree': np.arange(0.1, 0.9, 0.05),
            'n_estimators': [50, 100, 200],  # was misspelled 'n_estimator'
            'objective': ['binary:logistic', 'binary:logitraw'],
            'learning_rate': [0.001, 0.01, 0.1]
        }
        # eval_metric is set on the estimator; GridSearchCV's fit_params
        # argument was removed in scikit-learn 0.21
        model_run = xgb.XGBClassifier(eval_metric='auc')
        gridsearch = GSCV(model_run, paramGrid, verbose=1, cv=cv_splitter)
        gridsearch.fit(train_x, train_y)
        xgb_params = dict(gridsearch.best_params_)
        params = xgb_params
    elif model == "adaboost":
        model_run = ABC()
        paramGrid = {
            'learning_rate': [0.001, 0.01, 0.1],
            'n_estimators': [50, 100, 200]
        }
        gridsearch = GSCV(model_run, paramGrid, verbose=1, cv=cv_splitter)
        gridsearch.fit(train_x, train_y)
        ada_params = dict(gridsearch.best_params_)
        params = ada_params
    elif model == "logreg":
        model_run = LR()
        params = []
    elif model == "randomforest":
        model_run = RFC()
        params = []
    elif model == "lightgbm":
        print('lightgbm still not configured\n')
        sys.exit()

    Output = self.run_the_model(model_run, model, X_resampled, y_resampled,
                                train_x, params, rstate, cvsplit)
    return {
        'model': Output['model'],
        'Acc_vals': Output['Acc_vals'],
        'Mean_vals': Output['Mean_vals'],
        'dataset': Output['dataset'],
        'modeltype': Output['modeltype']
    }
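# The xgboost grid above spans roughly 7 x 10^7 combinations (the gamma,
# subsample and colsample ranges alone contribute 1000 x 16 x 16 values), so
# exhaustive GridSearchCV is rarely practical there. A randomized search over
# the same space is a common alternative; a self-contained sketch with assumed
# synthetic data (not the original code; assumes a recent xgboost where
# eval_metric is a constructor argument):
import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV

X_rs, y_rs = make_classification(n_samples=500, random_state=0)
rs_space = {
    'max_depth': [5, 10],
    'gamma': np.arange(0, 1, 0.001),
    'subsample': np.arange(0.1, 0.9, 0.05),
    'learning_rate': [0.001, 0.01, 0.1],
}
# Sample 20 settings from the space instead of enumerating all of them
rs = RandomizedSearchCV(xgb.XGBClassifier(eval_metric='auc'),
                        rs_space, n_iter=20, cv=4, random_state=101)
rs.fit(X_rs, y_rs)
print(rs.best_params_)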
# print("F_score:", grid_search.best_score_) # nested_score = cross_val_score(grid_search, newx, newy, cv=3) # print('Nested Score:',nested_score.mean()) # In[18]: # your code goes here from sklearn.ensemble import RandomForestClassifier as RFC clf = RFC() params = { 'max_depth': list(range(40, 80)), 'min_samples_leaf': [2, 3, 4, 5, 6, 7, 10], 'max_features': ['sqrt', 'log2'] } grid_search = GSCV(clf, params, cv=15, scoring='f1_macro') grid_search.fit(datax, datay) best_depth = grid_search.best_params_.get('max_depth') best_msl = grid_search.best_params_.get('min_samples_leaf') best_features = grid_search.best_params_.get('max_features') print('Best Parameters:', grid_search.best_params_) print("F_score:", grid_search.best_score_) # In[19]: nested_score = cross_val_score(grid_search, datax, datay, cv=15) print('Nested Score:', nested_score.mean()) # In[17]:
"kernel": ["rbf"], "gamma": [ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, ] }] grid_search = GSCV(estimator=classifier, param_grid=parameters, scoring="accuracy", cv=10, n_jobs=-1) grid_search = grid_search.fit(x_train, y_train) best_accuracy = grid_search.best_score_ best_parameters = grid_search.best_params_ #-------<Clasificar con valores optimos>---- classifier = SVC(C=1, kernel="rbf", gamma=0.7, random_state=0) classifier.fit(x_train, y_train) print(classifier) y_pred = classifier.predict(x_test) cm = confusion_matrix(y_test, y_pred) print(cm)
    if err > 0:
        y += err * rng.randn(n)
    return x, y

x, y = make_data(200)
xtest = np.linspace(0, 1, 500)[:, np.newaxis]
model = make_pipeline(PF(), LR())
tune_params = {
    'polynomialfeatures__degree': np.arange(20),
    'linearregression__fit_intercept': [True, False],
}
grid = GSCV(model, param_grid=tune_params, n_jobs=4, cv=5, verbose=1, refit=True)
grid.fit(x, y)
print(grid.best_params_, grid.best_score_, sep='\t\t')
# {'linearregression__fit_intercept': True, 'polynomialfeatures__degree': 9}
optimal = grid.best_estimator_
ypred = optimal.predict(xtest)

# RandomizedSearchCV does not try every parameter combination; it samples a
# fixed number of settings (n_iter, default 10) from param_distributions
randomized = RSCV(model, param_distributions=tune_params, n_jobs=4, cv=5, verbose=1)
randomized.fit(x, y)
y_sample = test[target]
X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled[target]
X_sample_scaled = test_scaled[features]

parameters = {
    'n_estimators': [90],
    'max_depth': [9],
    'learning_rate': [0.2],
    'min_child_weight': range(5, 21, 1),
}
model = XG_model.get_model()
GS = GSCV(estimator=model,
          param_grid=parameters,
          cv=5,
          refit=True,
          scoring='neg_mean_squared_error')
ndarray = plot_figure.plot_chart(GS, df, test, X_train_scaled,
                                 y_train_scaled, X_sample_scaled)

# Calculate time cost
end = time.time()
print('total time cost {:.2f} sec.'.format(end - start))

test_track = list(zip(range(len(test)), ndarray))
pre_y_track = list(zip(range(len(test)), test['predict_y_Value'].values))
distans = ar.frechet_distance(test_track, pre_y_track)
print('appraisal:\nfrechet_distance =', distans)