# Hyperparameter space sampled by the randomized search. The value lists
# (n_estimators, max_features, ...) are defined outside this excerpt.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap
}
print(random_grid)

# Try 100 sampled settings of a random forest on all cores; `ps` (defined
# outside this excerpt) is passed as the cross-validation strategy.
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid,
                               n_iter=100,
                               cv=ps,
                               verbose=2,
                               random_state=42,
                               n_jobs=-1)
rf_random.fit(df_train, label)
print('Best parameter setting found:')
print(rf_random.best_params_)

best_grid = rf_random.best_estimator_
# NOTE(review): the tuned estimator assigned above is replaced right away by
# a model loaded from disk, so the evaluation below scores the saved model,
# not the outcome of this search -- confirm this is intended.
best_grid = load(r'heuristic_vgg16_feature_maps_rf.joblib')
y_pred = best_grid.predict(df_test)

# Report accuracy on the held-out test set. The label used to be misspelled
# and glued to the number ("Acurracy0.93"); it now reads "Accuracy: 0.93".
print('Evaluation on test set:')
print('Accuracy: ' + str(accuracy_score(label_test, y_pred)))
# (closing argument of a call that starts outside this excerpt)
cv=5)
print(test, np.mean(scores))
if tune:
    # Random grid search for hyperparameter tuning
    from sklearn.model_selection import RandomizedSearchCV

    # Candidate values for the single decision tree.
    random_grid_decision = {
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 20, 40, 60, 80, 100, 120],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4],
    }
    # First we perform the decision tree optimisation:
    # 75 sampled settings, 5-fold CV, ranked by ROC AUC, seeded for
    # reproducibility.
    search = RandomizedSearchCV(tree_classifier,
                                param_distributions=random_grid_decision,
                                n_iter=75,
                                cv=5,
                                n_jobs=-1,
                                scoring='roc_auc',
                                random_state=20)
    search.fit(train_values, train_labels)
    # Save the best model (the refitted winner replaces the untuned tree)
    tree_classifier = search.best_estimator_
    save_model('best_tree', tree_classifier)
    # And the random forest optimisation
    # (this dict literal continues past the end of this excerpt)
    random_grid_forest = {
        'n_estimators': [100, 200, 400, 600, 800, 1000, 1200],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [None, 20, 40, 60, 80, 100, 120],
        'min_samples_split': [2, 4, 8],
        'min_samples_leaf': [1, 2, 4],
def rand_search(self):
    '''Run a randomized hyperparameter search for an ExtraTreesClassifier.

    Samples 500 parameter settings, scores each with 3-fold cross-validated
    accuracy on ``self.X_newdata_transform_train`` / ``self.y_train``, and
    appends progress, the best parameters, the best score and the feature
    importances to ``extreme_randomforest_randomsearch.txt`` inside
    ``self.newdata_minusEP``. A bar plot of the importances is saved to the
    same directory.

    Sets:
        self.best_params_transform: parameters of the best candidate.
        self.feature_importances_transform_ls: list of
            ``(importance, feature)`` pairs.

    Returns:
        None
    '''
    print('*' * 80)
    print(
        '* Running RandomizedSearch for best parameter combination for RandomForest'
    )
    print('*' * 80)

    # Every status line is appended to the same log file.
    log_path = os.path.join(self.newdata_minusEP,
                            'extreme_randomforest_randomsearch.txt')

    #create the decision forest
    # NOTE(review): max_depth=1 is fixed here and is not part of the search
    # space below (its entry is commented out), so every candidate is a
    # forest of depth-1 trees -- confirm this is intended.
    extra_clf_rand = ExtraTreesClassifier(random_state=100,
                                          max_depth=1,
                                          n_jobs=-1)
    with open(log_path, 'a') as text_file:
        text_file.write('Created random forest: extra_clf_rand \n')

    #set up randomized search
    param_rand = {
        "criterion": ["gini", "entropy"],  #metric to judge reduction of impurity
        'class_weight': ['balanced', None],
        'n_estimators': randint(100, 10000),  #number of trees in forest
        #'max_features': randint(2, 5),#max number of features when splitting
        "min_samples_split": randint(2, 20),  #min samples per node to induce split
        #"max_depth": randint(1, 10),#max number of splits to do
        "min_samples_leaf": randint(1, 20),  #min number of samples in a leaf
        "max_leaf_nodes": randint(10, 20)  #max number of leaves
    }
    with open(log_path, 'a') as text_file:
        text_file.write(
            'Running randomized search for the following parameters: %s \n' %
            param_rand)
        text_file.write('use cv=3, scoring=accuracy \n')

    #building and running the randomized search
    rand_search = RandomizedSearchCV(extra_clf_rand,
                                     param_rand,
                                     random_state=5,
                                     cv=3,
                                     n_iter=500,
                                     scoring='accuracy',
                                     n_jobs=-1)
    rand_search_transform = rand_search.fit(self.X_newdata_transform_train,
                                            self.y_train)
    with open(log_path, 'a') as text_file:
        text_file.write('Best parameters: ' +
                        str(rand_search_transform.best_params_) + '\n')
        text_file.write('Best score: ' +
                        str(rand_search_transform.best_score_) + '\n')

    # Pair each importance with whatever iterating the training data yields
    # (presumably the column names of a DataFrame -- TODO confirm), highest
    # importance first.
    feature_importances_transform = rand_search_transform.best_estimator_.feature_importances_
    feature_importances_transform_ls = sorted(zip(
        feature_importances_transform, self.X_newdata_transform_train),
                                              reverse=True)
    with open(log_path, 'a') as text_file:
        text_file.write('Feature importances: %s \n' %
                        feature_importances_transform_ls)

    self.best_params_transform = rand_search_transform.best_params_
    self.feature_importances_transform_ls = feature_importances_transform_ls

    def feature_importances_best_estimator(feature_list, name, directory):
        '''Save a bar plot of ``(importance, feature)`` pairs as a PNG.

        ``feature_list`` is sorted in place. The file name is built from
        ``name`` plus a ``YYYYmmdd_HHMM`` timestamp and written to
        ``directory``.
        '''
        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        # NOTE(review): this orders by tuple index 1 (the feature), not by
        # the importance value at index 0 -- confirm that is the intent.
        feature_list.sort(key=lambda x: x[1], reverse=True)
        score, feature = list(zip(*feature_list))[:2]
        x_pos = np.arange(len(feature))
        # Create the figure BEFORE drawing. Previously the bars were drawn
        # on an implicit figure and plt.figure() then opened a second, empty
        # one, so the saved image contained no bars and the first figure was
        # never closed.
        plt.figure(figsize=(20, 10))
        plt.bar(x_pos, score, align='center')
        plt.xticks(x_pos, feature, rotation=90, fontsize=2)
        plt.title(
            'Histogram of Feature Importances for best RandomForest using features %s '
            % name)
        plt.xlabel('Features')
        plt.tight_layout()
        plt.savefig(
            os.path.join(
                directory, 'feature_importances_best_bar_plot_rand_bag_' +
                name + datestring + '.png'))
        plt.close()

    feature_importances_best_estimator(
        self.feature_importances_transform_ls, 'newdata_minusEP',
        self.newdata_minusEP)
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000, 10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8]
# Create the random grid (n_estimators and max_features are defined outside
# this excerpt)
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}
print(random_grid)

# Sample 100 settings, 3-fold cross-validation, all cores.
rf_1 = RandomForestClassifier()
randomcv = RandomizedSearchCV(estimator=rf_1,
                              param_distributions=random_grid,
                              n_iter=100,
                              cv=3,
                              verbose=2,
                              random_state=100,
                              n_jobs=-1)
### fit the randomized model
randomcv.fit(X_train, Y_train)

# The search is created with the default refit=True, so best_estimator_ is
# already trained on all of X_train with the winning parameters. The extra
# best_grid.fit(X_train, Y_train) that used to follow only repeated that
# training and has been removed.
best_grid = randomcv.best_estimator_
pred_2 = best_grid.predict(X_test)

#validation
print(confusion_matrix(Y_test, pred_2))
print(accuracy_score(Y_test, pred_2))
print(classification_report(Y_test, pred_2))
# (last entries of a dict literal that opens outside this excerpt)
'scale_pos_weight': st.randint(1, 13),
'reg_alpha': st.randint(1, 5)
}

# Extra keyword arguments forwarded to every XGBoost fit() call made by the
# search: AUC is tracked on both listed sets and training stops after 20
# rounds without improvement.
# NOTE(review): eval_set contains (X_train, y_train) in full, including the
# rows the search holds out in each CV fold -- confirm this is acceptable.
fit_dict = {
    "eval_set": [(X_train, y_train), (X_valid, y_valid)],
    "early_stopping_rounds": 20,
    "eval_metric": "auc",
    "verbose": 100
}

alg = xgb.XGBClassifier(**params)
print("Model Parameters: ", alg.get_params().keys())

# 4 sampled settings, 2-fold CV, ranked by ROC AUC.
clf = RandomizedSearchCV(estimator=alg,
                         n_iter=4,
                         param_distributions=param_grid,
                         cv=2,
                         scoring="roc_auc")
print("Parameter Search:")
clf.fit(X_train, y_train, **fit_dict)
# NOTE(review): clf.get_params() returns the search object's own
# configuration, not the tuned values; those are in clf.best_params_ below.
print("Best All Params: ", clf.get_params())
print("Best Score: ", clf.best_score_)
print("Best Parametes: ", clf.best_params_)

# Probability of the positive class (column 1) for the test rows.
xgb_pred = clf.predict_proba(test)[:, 1]
# Bare expression: only displays a preview when run interactively.
xgb_pred[:5]

# Submit
xgb_sub = pd.DataFrame(xgb_pred, columns=["TARGET"], index=testdex)
xgb_sub.to_csv("XGB.csv", index=True, float_format='%.8f')
# (last entries of the `params` search-space dict; it opens outside this
# excerpt)
"max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
"min_child_weight": [1, 3, 5, 7],
"gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
"colsample_bytree": [0.3, 0.4, 0.5, 0.7]
}

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost

model = xgboost.XGBClassifier()
# 5 sampled settings, 5-fold CV, ranked by ROC AUC, all cores.
# No random_state is passed, so the sampled settings vary between runs.
random_search = RandomizedSearchCV(model,
                                   param_distributions=params,
                                   n_iter=5,
                                   scoring='roc_auc',
                                   n_jobs=-1,
                                   cv=5,
                                   verbose=3)
random_search.fit(x, y)

# The next three bare expressions only display values when run
# interactively.
x.head()
random_search.best_estimator_
random_search.best_params_

# (this call continues past the end of this excerpt)
model = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
def test_hyperparameter_selection(digits):
    """Smoke-test that BasicSGDClassifier can be tuned by RandomizedSearchCV.

    ``digits`` must unpack into ``(X_train, X_test, y_train, y_test)``; only
    the training half is used. The test passes when the 2-fold search over
    two ``eta`` values completes without raising.
    """
    X_train, _, y_train, _ = digits
    candidate_etas = {'eta': [0.02, 0.03]}
    estimator = BasicSGDClassifier(max_iter=5)
    searcher = RandomizedSearchCV(estimator, candidate_etas, cv=2)
    searcher.fit(X_train, y_train)
Y = pd.DataFrame(Y, columns=['target'])

#model building
param_grid = {
    'eta': [0.05, 0.1, 0.15],
    'max_depth': [6, 7, 8],
    'gamma': [0.5, 1, 1.5],
    'min_child_weight': [1, 5, 10]
}
# NOTE(review): `metric` is passed through unchanged; XGBoost's documented
# name for the evaluation metric is `eval_metric` -- confirm which is meant.
xgb_model = xgb.XGBClassifier(n_estimators=500,
                              objective='binary:logistic',
                              metric='auc',
                              scale_pos_weight=2)
# Seeded, shuffled 5-fold stratified splits drive the search.
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
rs_cv = RandomizedSearchCV(xgb_model, param_grid, cv=fold.split(X, Y))

st = datetime.now()
rs_cv.fit(X, Y)
#end=datetime.now()
print("Time taken is:", datetime.now() - st)

best_params = rs_cv.best_params_
#rs_cv.best_score_ = 0.9976542717402616

# Unpack the tuned values into the constructor. They were previously passed
# as a single `params=` keyword, which XGBClassifier does not interpret as
# hyperparameters, so the final model was trained with defaults instead of
# the values the search selected.
model_fit = xgb.XGBClassifier(n_estimators=500,
                              objective='binary:logistic',
                              metric='auc',
                              scale_pos_weight=2,
                              **best_params)
xgb_model = model_fit.fit(X, Y)
#dictionary for parameters params_space = { 'c1': scipy.stats.expon(scale=0.5), 'c2': scipy.stats.expon(scale=0.05) } # use the f1 score metric for evaluation f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels) # search rs = RandomizedSearchCV(crf, params_space, cv=10, verbose=1, n_jobs=-1, n_iter=20, scoring=f1_scorer) rs.fit(X_train, y_train) crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.02, c2=0.3, max_iterations=2000, all_possible_transitions=True, verbose=False) crf.fit(x_train, y_train) labels = ["O", "D", "T"] y_pred = crf.predict(x_test) print("F1 score (unweighted average) is %lf " %
# (last entries of the `random_grid` dict; it opens outside this excerpt)
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap
}

# In[ ]:

# 5-fold splitter with shuffling; no random_state is given, so the folds
# differ from run to run.
cv = KFold(5, shuffle=True)

# In[ ]:

# Sample 100 settings of `base_model` (defined outside this excerpt) on all
# cores; the sampling itself is seeded.
rf_random = RandomizedSearchCV(estimator=base_model,
                               param_distributions=random_grid,
                               n_iter=100,
                               cv=cv,
                               verbose=2,
                               random_state=101,
                               n_jobs=-1)

# In[ ]:

rf_random.fit(X_train, y_train)

# In[ ]:

print(rf_random.best_params_)

# In[ ]:

# y_pred=rf_random.predict(X_test)
# (final statement of a function whose header is outside this excerpt;
# create_hyperparameter() is called below and receives a dict of this shape)
return {
    'batch_size': batches,
    'optimizer': optimizers,
    'drop': dropout,
    'learning_rate': learning_rate
}


# Build the KerasClassifier model
model = KerasClassifier(build_fn=build_model, verbose=1)

# Define the hyperparameters variable
hyperparameters = create_hyperparameter()
# 3-fold randomized search with the default number of iterations.
search = RandomizedSearchCV(estimator=model,
                            param_distributions=hyperparameters,
                            cv=3)

# Train the model
search.fit(x_train, y_train)
score = search.score(x_test, y_test)
print(search.best_params_ )  # {'optimizer': 'adadelta', 'drop': 0.2, 'batch_size': 20}
print("score : ", score)  # 0.9661999940872192


def sum_of_squares(v):
    """Return the sum of the squares of the elements of *v*."""
    return sum(v_i**2 for v_i in v)
# Cost function: takes a vector of real numbers and returns the sum of the
# squares of its elements
# (last entries of a DataFrame construction that starts outside this
# excerpt; each column holds one model's score under the "accuracy" row)
"Random Forest Classifier": clf,
"Support Vector Machine": svm,
}, index=["accuracy"])
# Transpose so each model becomes one bar.
model_compare.T.plot.bar(figsize=(15, 10))

# In[14]:

# Create a hyperparameter grid for LogisticRegression
# (20 values of C spaced logarithmically from 1e-4 to 1e4)
log_reg_grid = {"C": np.logspace(-4, 4, 20), "solver": ["liblinear"]}

# Tune LogisticRegression
# Seeding NumPy's global generator makes the sampling repeatable because
# the search below is given no random_state of its own.
np.random.seed(42)

# Setup random hyperparameter search for LogisticRegression
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparameter search model for LogisticRegression
rs_log_reg.fit(X_train, y_train)
score = rs_log_reg.score(X_test, y_test)
print(score * 100)

# In[15]:

# Denser grid (30 values of C) for the exhaustive search.
log_reg_grid = {'C': np.logspace(-4, 4, 30), "solver": ["liblinear"]}

# Set up the grid search with 5-fold cross-validation
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)
def gridsearchGradientBoostingR(X, y, n_jobs=1, verbose=True):
    """Tune a GradientBoostingRegressor with a randomized search.

    Samples 100 hyperparameter settings, scores each with 10-fold
    cross-validated negative mean absolute error and refits the best
    setting on all of ``X``/``y``.

    Parameters:
        X: training features. pandas objects are converted to a float
            NumPy array; anything else is passed through unchanged.
        y: training targets, converted the same way as ``X``.
        n_jobs (int): number of parallel jobs. Values below 1 are raised
            to 1, so ``-1`` does NOT mean "all cores" here.
        verbose (bool or int): ``True`` is mapped to verbosity level 2. Any
            truthy value also prints a summary through ``report``.

    Returns:
        tuple: ``(model, random_cv)`` -- the refitted best estimator and the
        fitted ``RandomizedSearchCV`` object.
    """
    if verbose == True:
        verbose = 2
    cv = 10
    n_iter = 100
    n_jobs = max(n_jobs, 1)

    # DataFrame.as_matrix() and the np.float alias no longer exist in
    # current pandas / NumPy; np.asarray(..., dtype=float) yields the same
    # float64 array on old and new versions alike.
    if 'pandas' in str(type(X)):
        X = np.asarray(X, dtype=float)
    if 'pandas' in str(type(y)):
        y = np.asarray(y, dtype=float)

    # NOTE(review): the names 'ls', 'lad', 'mse' and max_features='auto'
    # were renamed or removed in newer scikit-learn releases -- confirm
    # these lists against the installed version.
    # Loss function to be optimized (minimize)
    loss = ['ls', 'lad', 'huber', 'quantile']
    # Number of weak learners (trees) used in the boosting process
    n_estimators = [100, 250, 300, 500, 600, 750]
    # Maximum depth of each tree
    max_depth = [2, 3, 5, 10, 15]
    # Minimum number of samples per leaf
    min_samples_leaf = [1, 2, 4, 6, 8, 10]
    # Minimum number of samples to split a node
    min_samples_split = [2, 4, 6, 10, 12]
    # Maximum number of features to consider for making splits
    max_features = ['auto', 'sqrt', 'log2', None]
    # Function used to measure the quality of a split
    criterion = ['friedman_mse', 'mse']

    #%% Make the grid.
    hyperparameter_grid = {
        'loss': loss,
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_leaf': min_samples_leaf,
        'min_samples_split': min_samples_split,
        'max_features': max_features,
        'criterion': criterion
    }

    # Create the model to use for hyperparameter tuning
    model = GradientBoostingRegressor()

    # Set up the random search (cv-fold cross validation, see `cv` above)
    random_cv = RandomizedSearchCV(
        estimator=model,
        param_distributions=hyperparameter_grid,
        cv=cv,
        n_iter=n_iter,
        scoring='neg_mean_absolute_error',
        n_jobs=n_jobs,
        verbose=verbose,
        return_train_score=True,
        refit=True,  #Refit using the best found parameters on the whole dataset.
    )

    # Fit on the training data
    random_cv.fit(X, y)

    # Show some results:
    if verbose:
        report(random_cv.cv_results_)

    # Find the best combination of settings. Further details stay available
    # on the returned search object (best_score_, best_params_, best_index_,
    # cv_results_).
    model = random_cv.best_estimator_
    return (model, random_cv)
X = scaler.fit_transform(x) ################################################################################################################################ # make scoring and kfold instance # scoring = make_scorer(balanced_accuracy_score) # also run with this scoring method scoring = make_scorer(f1_score) kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) it = 25 ################################################################################################################################ # KNN Classifier knn = KNeighborsClassifier() p_grid = {"n_neighbors": range(1, 30), "leaf_size": range(1, 50)} gknn = RandomizedSearchCV(knn, p_grid, n_iter=it, cv=kf, scoring=scoring) gknn.fit(X, y) cv_knn = gknn.cv_results_['mean_test_score'] print(cv_knn) df_knn = pd.DataFrame(cv_knn) df_knn.columns = ['KNN'] ############################################################################################################################### # Random Forest Classifier rf = ses.RandomForestClassifier() p_grid = { "max_depth": range(1, 25), "n_estimators": range(15, 75), "min_samples_leaf": range(1, 25), "min_samples_split": range(2, 25)
parameters = { 'max_features': [2, 3, 4], 'max_samples': [0.5, 0.7, 0.9], "base_estimator__C": [0.0001, 0.001, 0.01, 1, 10, 100] } # **Задание 11.** Следующая задача обучить бэггинг классификатор (`random_state`=42). В качестве базовых классификаторов возьмите 100 логистических регрессий и на этот раз используйте не `GridSearchCV`, а `RandomizedSearchCV`. Так как перебирать все 54 варианта комбинаций долго, то поставьте максимальное число итераций 20 для `RandomizedSearchCV`. Также не забудьте передать параметр валидации `cv` и `random_state=1`. Какая лучшая точность получилась? # In[23]: bg_clf = BaggingClassifier(base_estimator=lr, n_estimators=100, random_state=42) bg_clf_grid_random = RandomizedSearchCV(bg_clf, param_distributions=parameters, n_iter=20, cv=skf, random_state=1, n_jobs=-1) bg_clf_grid_random.fit(X, y) # In[24]: bg_clf_grid_random.best_score_ # **Задача 12.** Дайте интерпретацию лучших параметров для бэггинга. Почему именно такие значения оказались лучшими? # # - для бэггинга важно использовать как можно меньше признаков # - бэггинг лучше работает на небольших выборках # - меньше корреляция между одиночными моделями # - чем больше признаков, тем меньше теряется информации
def train(X,
          y,
          weight_classes=True,
          n_iter_search=500,
          score='roc_auc',
          random_state=123):
    '''
    Train a binary SGD classifier using a randomized grid search with given
    scoring metric.

    The data is split 80/20 (stratified on ``y``). The search runs on the
    training part with 5-fold cross-validation over a
    TF-IDF -> SelectKBest(chi2) -> SGDClassifier pipeline, and the refitted
    best pipeline is then evaluated on the held-out part.

    Parameters:
        X (list-like): list of normalized attachment texts
        y (list-like): list of validated targets (0 = red, 1 = green)
        weight_classes (bool): whether or not to use the "balanced" mode to
            adjust class weights.
        n_iter_search (int): number of parameter settings that are sampled.
            Trades off runtime vs quality of the solution.
        score (str): the scorer used to pick and refit the best candidate.
            `roc_auc` by default. Available options include: accuracy,
            roc_auc, precision, fbeta, recall. Note: for fbeta, beta is set
            to 1.5 to favor recall of the positive class.
        random_state (int): sets the random seed for reproducibility.

    Returns:
        results (dict): held-out predictions and metrics, under the keys
            y_pred, y_score, precisions, recall, average_precision, acc,
            roc_auc, auc, fbeta, recalls, best_score, best_estimator, y_test.
        best_score (float): mean cross-validated score of the best_estimator.
        best_estimator (sklearn estimator): estimator that was chosen by the
            search
        best_params (dict): parameter setting that gave the best results on
            the hold out data.

    Raises:
        Exception: whatever the search raises while fitting is logged and
            re-raised unchanged.
    '''
    if weight_classes:
        clf = SGDClassifier(class_weight='balanced')
    else:
        clf = SGDClassifier()

    # NOTE(review): these scorers are all built with make_scorer defaults,
    # so 'roc_auc' and 'precision' are computed from hard class predictions
    # rather than decision scores -- confirm this is intended.
    scoring = {
        'accuracy': metrics.make_scorer(metrics.accuracy_score),
        'roc_auc': metrics.make_scorer(metrics.roc_auc_score),
        'precision': metrics.make_scorer(metrics.average_precision_score),
        # beta=1.5 matches the docstring and the held-out fbeta computed
        # below; the scorer previously used beta=.5, which favours precision
        # instead of recall.
        'fbeta': metrics.make_scorer(metrics.fbeta_score, beta=1.5),
        'recall': metrics.make_scorer(metrics.recall_score)
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=random_state)

    pipe = Pipeline([('vectorizer', TfidfVectorizer(stop_words='english')),
                     ('select', SelectKBest(chi2)), ('clf', clf)])
    param_dist = get_param_distribution()
    random_search = RandomizedSearchCV(pipe,
                                       param_distributions=param_dist,
                                       scoring=scoring,
                                       refit=score,
                                       n_iter=n_iter_search,
                                       cv=5,
                                       n_jobs=-1,
                                       verbose=1,
                                       random_state=random_state)
    try:
        random_search.fit(X_train, y_train)
    except Exception as e:
        logger.error("Exception occurred training a new model: %s",
                     e,
                     exc_info=True)
        # Nothing below can work without a fitted search. Re-raise the real
        # cause instead of continuing into predict() on an unfitted object.
        raise

    y_pred = random_search.predict(X_test)
    #get the col number of the positive class (i.e. green)
    positive_class_col = list(random_search.classes_).index(1)
    try:
        y_score = random_search.predict_proba(X_test)[:, positive_class_col]
    except AttributeError:
        # The chosen loss offers no probability estimates; fall back to the
        # signed distance from the decision boundary.
        y_score = random_search.decision_function(X_test)

    average_precision = metrics.average_precision_score(y_test, y_score)
    acc = metrics.accuracy_score(y_test, y_pred)
    try:
        roc_auc = metrics.roc_auc_score(y_test, y_pred)
    except ValueError:
        roc_auc = None
    precisions, recalls, _ = metrics.precision_recall_curve(y_test, y_score)
    try:
        # Area under the precision-recall curve.
        auc = metrics.auc(recalls, precisions)
    except ValueError:
        auc = None
    fbeta = metrics.fbeta_score(y_test, y_pred, beta=1.5)
    recall = metrics.recall_score(y_test, y_pred)

    best_estimator = random_search.best_estimator_
    best_params = random_search.best_params_
    best_score = random_search.best_score_

    results = {
        'y_pred': y_pred,
        'y_score': y_score,
        'precisions': precisions,
        'recall': recall,
        'average_precision': average_precision,
        'acc': acc,
        'roc_auc': roc_auc,
        'auc': auc,
        'fbeta': fbeta,
        'recalls': recalls,
        'best_score': best_score,
        'best_estimator': best_estimator,
        'y_test': y_test
    }
    return results, best_score, best_estimator, best_params
# {"mo__C" : [1,10,100,1000],"mo__kernel":["sigmoid"],"mo__gamma":[0.001,0.0001]} # ] Parameters = [ {"svc__C" : [1,10,100,1000],"svc__kernel":["linear"] }, {"svc__C" : [1,10,100],"svc__kernel":['rbf'],"svc__gamma":[0.001,0.0001]}, {"svc__C" : [1,10,100,1000],"svc__kernel":["sigmoid"],"svc__gamma":[0.001,0.0001]} ] # 2 # pipe = Pipeline([('scaler',MinMaxScaler()),('mo',SVC())]) # 아레와 결과치는 동일 하다 이방법은 이름을 정해줄수있다 이름을 정해줘야 위에 Parameters를 조정가능하다(mo__:이름으로 지정) pipe = make_pipeline(StandardScaler(),SVC()) #이걸 사용할때는 (SVC__)로 해야 된다 # model = GridSearchCV(pipe,Parameters,cv = 5) model = RandomizedSearchCV(pipe,Parameters,cv = 5) model.fit(x_train, y_train) results = model.score(x_test,y_test) print('최적의 매개변수 : ', model.best_estimator_) # model.best_estimator_ : 어떤것이 가장 좋은것(매개변수)인지 나온다 print(results) # =================== # for문 으로 grid랑 random돌릴때 # models = [GridSearchCV(pipe,Parameters,cv = 5),RandomizedSearchCV(pipe,Parameters,cv = 5)] # for algorithm in models : # model = algorithm # model.fit(x_train, y_train)
# (closing arguments of a call that starts outside this excerpt)
y, test_size=0.2, random_state=45)

# Shuffled 5-fold splitter; no random_state is given, so folds differ
# between runs.
kfold = KFold(n_splits=5, shuffle=True)

# NOTE(review): 'n_jobs' is included in the search space, so the search
# treats the estimator's parallelism setting as one more value to sample
# alongside the model hyperparameters -- confirm this is intended.
parameters = [{
    'n_estimators': [100, 150, 200, 250],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [1, 3, 5, 7, 10],
    'min_samples_split': [2, 3, 5, 10],
    'n_jobs': [-1, 2, 4]
}]

# 2. Model
model = RandomizedSearchCV(RandomForestClassifier(), parameters, cv=kfold)

# Time the whole search.
start = time()
model.fit(x_train, y_train)
print('RandomizedSearchCV took %.2f seconds' % (time() - start))

print('최적의 매개변수 :', model.best_estimator_)

# Accuracy on the test split, computed two ways (explicit metric and the
# search object's own score()).
y_pred = model.predict(x_test)
print('최종 정답률 :', accuracy_score(y_test, y_pred))
print('최종 정답률 :', model.score(x_test, y_test))

# The string below holds recorded output of an earlier run; it continues
# past the end of this excerpt.
''' RandomizedSearchCV took 20.05 seconds 최적의 매개변수 : RandomForestClassifier(max_depth=8, min_samples_leaf=3, min_samples_split=3, n_estimators=150, n_jobs=2) 최종 정답률 : 0.9649122807017544
# Features are every column except the subject id and the target.
X_test = test.drop(['subject', 'Activity'], axis=1)
y_test = test.Activity
print('Training data size : ', X_train.shape)
print('Test data size : ', X_test.shape)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import warnings

warnings.filterwarnings("ignore")

# NOTE(review): LogisticRegression() is created with its default solver
# while 'l1' is among the penalties to try. If that solver rejects 'l1',
# those candidates fail, and the filter above hides the resulting warnings
# -- confirm, or pick a solver that supports both penalties.
parameters = {'C': np.arange(10, 61, 10), 'penalty': ['l2', 'l1']}
lr_classifier = LogisticRegression()
lr_classifier_rs = RandomizedSearchCV(lr_classifier,
                                      param_distributions=parameters,
                                      random_state=42)
lr_classifier_rs.fit(X_train, y_train)
y_pred = lr_classifier_rs.predict(X_test)

lr_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy using Logistic Regression : ", lr_accuracy)

# Take the labels from the true AND predicted classes and hand the same
# list to confusion_matrix. Using only np.unique(y_pred) left the tick
# labels shorter than, and misaligned with, the matrix whenever some class
# was never predicted.
labels = np.union1d(y_test.values, y_pred)
labels
plt.figure(figsize=(10, 10))
sns.heatmap(confusion_matrix(y_test.values, y_pred, labels=labels),
            annot=True,
            cmap='Blues',
            fmt='',
            xticklabels=labels,
            yticklabels=labels)
# One list of validation scores per named parameter grid.
scores = {}
for name, grid in param_grids:
    scores[name] = []

from validator import val
from postprocessing import MetricEqualizer, Average

# Each named grid gets 7 independent searches; every search samples 7
# parameter settings and uses 7-fold cross-validation. The exact numbers
# are arbitrary (the original author says they "don't really matter").
for _ in range(7):
    for name, param_grid in param_grids:
        # (the handler for this try lies past the end of this excerpt)
        try:
            rand_cv = RandomizedSearchCV(
                pipeline,
                param_distributions=param_grid,
                n_iter=7,
                scoring=dist_score,
                cv=7,
                return_train_score=False)
            rand_cv.fit(couples, y)
            # Score the refitted best estimator with the project's own
            # validator and keep the "score" entry of its result.
            score = val(
                [rand_cv.best_estimator_],
                [Average(), MetricEqualizer(metric="percentage")],
                swap)["score"]
            print(score)
            scores[name].append(score)
            # Lower is better here: the smallest score seen so far is kept.
            if score < best_score:
                best_score = score
def hyperparameter_tuning(X_train,
                          Y_train,
                          X_test,
                          Y_test,
                          save_name='SVM_model'):
    """Tune an SVM in two stages and persist the result.

    Stage 1 is a randomized search over C, gamma, kernel and tol. Stage 2
    fixes the winning kernel and runs a grid search in the neighbourhood of
    the winning C, gamma and tol. The best parameters are pickled to
    ``Hyperparameters/SVM.pkl`` and the best model is dumped to
    ``Models/SVM/<save_name>.pkl``.

    Parameters:
        X_train, Y_train: data used by both searches (5-fold CV each).
        X_test, Y_test: held-out data used only for the final report.
        save_name (str): file stem for the saved model.

    Returns:
        The refitted best estimator of the grid search.
    """
    # Initialise the SVM classifier (the seed is drawn at random, so runs
    # are not reproducible)
    classifier = svm.SVC(cache_size=1024,
                         class_weight='balanced',
                         random_state=random.randint(1, 10000))

    # Set possible parameter values
    C = [0.01, 0.1, 1, 10]
    gamma = [0.001, 0.01, 0.1]
    kernel = ['linear', 'rbf', 'poly', 'sigmoid']
    tol = [0.0001, 0.001, 0.01]

    # Wrap the parameter values in random_grid
    random_grid = {'C': C, 'gamma': gamma, 'kernel': kernel, 'tol': tol}

    #===Run a randomised search for the optimal parameter setting===#
    classifier_random = RandomizedSearchCV(estimator=classifier,
                                           param_distributions=random_grid,
                                           n_iter=100,
                                           cv=5,
                                           verbose=2,
                                           random_state=random.randint(
                                               1, 10000),
                                           n_jobs=-1)
    # Fit the random search model
    param_opt_rand = classifier_random.fit(X_train, Y_train)

    #=== Narrow down random optimal solutions to the best hyperparameters ===#
    # Read the winning values back from the refitted estimator
    best_random = param_opt_rand.best_estimator_
    tol = best_random.tol
    kernel = best_random.kernel
    gamma = best_random.gamma
    C = best_random.C

    # IDEA: implement a precision variable. Divide the step size and the offset
    # used in the np.arange() by the precision which should by default be 1.
    # A fixed +/-0.8 window around the best C reaches zero or below when that
    # C is 0.01 or 0.1. Such values are not valid for SVC, so they are
    # dropped here instead of being handed to the grid search.
    c_candidates = np.arange(C - 0.8, C + 0.81, 0.4)
    param_grid = {
        'C': c_candidates[c_candidates > 0],
        'gamma': [0.5 * gamma, gamma, gamma * 2],
        'tol': [0.5 * tol, tol, 2 * tol]
    }

    # Set kernel
    classifier.set_params(kernel=kernel)

    # Find the optimal hyperparameters using gridsearch
    # initialise the grid search with cross validation
    classifier_gridsearch = GridSearchCV(estimator=classifier,
                                         param_grid=param_grid,
                                         cv=5,
                                         n_jobs=-1,
                                         verbose=2)
    # Run the grid search to find the model with the optimal hyperparameters
    classifier_gridsearch.fit(X_train, Y_train)
    best_model = classifier_gridsearch.best_estimator_

    # Read out the optimal hyperparameters
    SVC_params_opt = classifier_gridsearch.best_params_

    # Evaluate the optimal model
    final_accuracy = best_model.score(X_test, Y_test)
    # NOTE(review): this ROC AUC is computed from hard class predictions,
    # not from decision scores -- confirm this is intended.
    final_roc_auc_score = metrics.roc_auc_score(Y_test,
                                                best_model.predict(X_test))

    # Locally save results; the context manager closes the file even if
    # pickling fails
    with open("Hyperparameters/SVM.pkl", "wb") as SVM_params_file:
        pickle.dump(SVC_params_opt, SVM_params_file)

    # Locally save the entire model
    joblib.dump(best_model, f'Models/SVM/{save_name}.pkl')

    # print the results
    print(
        "Support Vector Machine hyperparamter optimisation completed succesfully."
    )
    print(f"Best SVM model accuracy: {final_accuracy}")
    print(f"Best SVM model roc_auc: {final_roc_auc_score}")

    return best_model
print(random_grid) # In[23]: # Use the random grid to search for best hyperparameters # First create the base model to tune rf = RandomForestRegressor() # In[24]: # Random search of parameters, using 3 fold cross validation, # search across 100 different combinations rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, scoring='neg_mean_squared_error', n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=1) # In[25]: #fit the random forest model rf_random.fit(X_train, y_train) # In[26]: #displaying the best parameters rf_random.best_params_ # In[27]:
# model.add(Dense(12, input_dim=8, kernel_initializer='uniform', activation='linear', kernel_constraint=maxnorm(weight_constraint)))
# model.add(Dropout(dropout_rate))
# model.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
# Compile model
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# return model
#weight_constraint = [1, 2, 3, 4, 5]
#dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
#param_grid = dict(dropout_rate=dropout_rate, weight_constraint=weight_constraint)
##############################################################################################################
# Candidate values for the two regularisation settings being tuned.
weight_constraint = [1, 2, 3, 4, 5]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
param_grid = dict(dropout_rate=dropout_rate,
                  weight_constraint=weight_constraint)

# RandomizedSearchCV names its search space `param_distributions`;
# `param_grid=` is GridSearchCV's keyword and is rejected here with a
# TypeError.
grid = RandomizedSearchCV(estimator=model,
                          param_distributions=param_grid,
                          n_jobs=1,
                          random_state=3,
                          cv=3)
grid_result = grid.fit(X, Y)

# summarize results: the winner first, then mean (std) for every candidate
print("Best: %f using %s" %
      (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
def ml_tests(x_train, x_test, y_train, y_test, imputed_data):
    """Train several regressors and print the test RMSE of each.

    Runs, in order: a baseline XGBoost model, a grid search and a
    randomized search over XGBoost settings, a linear-booster XGBoost
    model, an L2-regularisation sweep with ``xgb.cv``, a stochastic
    gradient boosting model and a random forest. The baseline XGBoost,
    gradient boosting and random forest models are pickled into the
    working directory, and feature-importance plots are written under
    ``Files/Feature_Importances_Grafiken/``.

    Parameters:
        x_train, y_train: training features and targets.
        x_test, y_test: held-out features and targets used for the RMSE.
        imputed_data: DataFrame handed (without its "angebotspreis" column)
            to ``print_feature_importances``; presumably the full feature
            table with the target column -- TODO confirm.

    Returns:
        None
    """
    # XGBoost baseline model
    xg_reg = xgb.XGBRegressor(objective="reg:squarederror",
                              n_estimators=50,
                              seed=123)
    xg_reg.fit(x_train, y_train)
    _print_rmse(y_test, xg_reg.predict(x_test))
    print()

    _pickle_model(xg_reg, 'XGB_Standardmodell.pckl')

    plot_importance(xg_reg, max_num_features=10)
    fig = plt.gcf()
    fig.set_size_inches(17.5, 8)
    plt.savefig(
        'Files/Feature_Importances_Grafiken/xgb_feature_importances.jpg')

    # Grid Search parameter Tuning
    print("Grid Search Parameter Tuning:")
    gbm_param_grid = {
        'colsample_bytree': [0.3, 0.7],
        'n_estimators': [25, 50, 80, 100],
        'max_depth': [2, 5, 7]
    }
    gbm = xgb.XGBRegressor(objective="reg:squarederror")
    grid_mse = GridSearchCV(estimator=gbm,
                            param_grid=gbm_param_grid,
                            scoring="neg_mean_squared_error",
                            cv=4,
                            verbose=1)
    grid_mse.fit(x_train, y_train)
    print("Best parameters found: ", grid_mse.best_params_)
    # best_score_ is a negative MSE; abs + sqrt turns it into an RMSE.
    print("Lowest RMSE Grid Search found: ",
          np.sqrt(np.abs(grid_mse.best_score_)))
    print()

    # Randomized Search parameter tuning
    print("Randomized Search Parameter Tuning:")
    # n_estimators is pinned to 25 by the search space, which overrides the
    # 10 given to the constructor below.
    gbm_param_grid2 = {'n_estimators': [25], 'max_depth': range(2, 12)}
    gbm2 = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=10)
    randomized_mse = RandomizedSearchCV(estimator=gbm2,
                                        param_distributions=gbm_param_grid2,
                                        scoring="neg_mean_squared_error",
                                        n_iter=5,
                                        cv=4,
                                        verbose=1)
    randomized_mse.fit(x_train, y_train)
    print("Best parameters found: ", randomized_mse.best_params_)
    print("Lowest RMSE Randomized Search found: ",
          np.sqrt(np.abs(randomized_mse.best_score_)))

    # Linear booster trained through the native DMatrix API.
    dm_train = xgb.DMatrix(data=x_train, label=y_train)
    dm_test = xgb.DMatrix(data=x_test, label=y_test)
    params = {"booster": "gblinear", "objective": "reg:squarederror"}
    xg_reg2 = xgb.train(dtrain=dm_train, params=params, num_boost_round=15)
    _print_rmse(y_test, xg_reg2.predict(dm_test))

    # Sweep the L2 penalty ("lambda") and record the final-round CV RMSE.
    reg_params = [0.1, 0.3, 0.7, 1, 10, 100]
    params1 = {"objective": "reg:squarederror", "max_depth": 3}
    rmses_l2 = []
    for reg in reg_params:
        params1["lambda"] = reg
        cv_results_rmse = xgb.cv(dtrain=dm_train,
                                 params=params1,
                                 nfold=3,
                                 num_boost_round=15,
                                 metrics="rmse",
                                 as_pandas=True)
        rmses_l2.append(cv_results_rmse["test-rmse-mean"].tail(1).values[0])
    print("Best rmse as a function of l2:")
    print(pd.DataFrame(list(zip(reg_params, rmses_l2)),
                       columns=["l2", "rmse"]))
    print()
    #print_feature_importances(model=xg_reg2, data=imputed_data.drop(columns=["angebotspreis"]))

    # Stochastic Gradient Boosting
    print("Stochastic Gradient Boosting:")
    sgbr = GradientBoostingRegressor(max_depth=4,
                                     subsample=0.9,
                                     max_features=0.75,
                                     n_estimators=200,
                                     random_state=2)
    sgbr.fit(x_train, y_train)
    _print_rmse(y_test, sgbr.predict(x_test))
    print()
    _pickle_model(sgbr, 'sgbr_Standardmodell.pckl')
    print_feature_importances(
        model=sgbr,
        data=imputed_data.drop(columns=["angebotspreis"]),
        save_string=
        'Files/Feature_Importances_Grafiken/sgbr_feature_importances.jpg')

    # Random Forest
    print("Random Forrest:")
    rf = RandomForestRegressor(n_estimators=25, random_state=2)
    rf.fit(x_train, y_train)
    _print_rmse(y_test, rf.predict(x_test))
    print()
    _pickle_model(rf, 'rf_Standardmodell.pckl')
    print_feature_importances(
        model=rf,
        data=imputed_data.drop(columns=["angebotspreis"]),
        save_string=
        'Files/Feature_Importances_Grafiken/rf_feature_importances.jpg')


def _print_rmse(y_true, predictions):
    """Print the root mean squared error of *predictions* against *y_true*."""
    rmse = np.sqrt(mean_squared_error(y_true, predictions))
    print("RMSE: %f" % rmse)


def _pickle_model(model, path):
    """Serialise *model* to *path* with pickle, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(model, f)
print('Length of X (test): {} | Length of y (test): {}'.format( len(original_Xtest), len(original_ytest))) # List to append the score and then find the average accuracy_lst = [] precision_lst = [] recall_lst = [] f1_lst = [] auc_lst = [] # Classifier with optimal parameters # log_reg_sm = grid_log_reg.best_estimator_ log_reg_sm = LogisticRegression() rand_log_reg = RandomizedSearchCV(LogisticRegression(), log_reg_params, n_iter=4) # Implementing SMOTE Technique # Cross Validating the right way # Parameters log_reg_params = { "penalty": ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] } for train, test in sss.split(original_Xtrain, original_ytrain): pipeline = imbalanced_make_pipeline( SMOTE(sampling_strategy='minority'), rand_log_reg) # SMOTE happens during Cross Validation not before.. model = pipeline.fit(original_Xtrain[train], original_ytrain[train]) best_est = rand_log_reg.best_estimator_
def fineTuneClassifiers(self, X, y, classifiers):
    """Tune every requested classifier and return the best-scoring result.

    For each key in *classifiers* a hyperparameter search is fitted on
    ``(X, y)`` and its outcome is recorded. The record with the highest
    ``best_score_`` (scored with ``config.MLConf.score``) is returned.

    The search strategy is chosen by ``self.search_method``:

    * ``'grid'`` (case-insensitive) -- *GridSearchCV*: exhaustive search over
      the parameter grid stored at ``self.clf_names[clf_key][1]``. The
      estimator is built with ``random_state=config.seed_no``. The grids are
      documented as living in :func:`~src.config.MLConf`:

      * :attr:`~src.config.MLConf.MLP_hyperparameters`
      * :attr:`~src.config.MLConf.RandomForests_hyperparameters`
      * :attr:`~src.config.MLConf.XGBoost_hyperparameters`
      * :attr:`~src.config.MLConf.SVM_hyperparameters`
      * :attr:`~src.config.MLConf.DecisionTree_hyperparameters`

    * anything else -- *RandomizedSearchCV* (the default): ``self.n_iter``
      parameter settings are sampled from the distributions stored at
      ``self.clf_names[clf_key][2]``; the estimator is built with its default
      arguments. The distributions are documented as living in
      :func:`~src.config.MLConf`:

      * :attr:`~src.config.MLConf.MLP_hyperparameters_dist`
      * :attr:`~src.config.MLConf.RandomForests_hyperparameters_dist`
      * :attr:`~src.config.MLConf.XGBoost_hyperparameters_dist`
      * :attr:`~src.config.MLConf.SVM_hyperparameters_dist`
      * :attr:`~src.config.MLConf.DecisionTree_hyperparameters_dist`

    Both strategies use ``cv=self.outer_cv`` and ``n_jobs=self.n_jobs``.

    Parameters
    ----------
    X: array-like or sparse matrix, shape = [n_samples, n_features]
        The training input samples.
    y: array-like, shape = [n_samples] or [n_samples, n_outputs]
        The target values, i.e. class labels.
    classifiers: iterable of :obj:`str`
        Keys into ``self.clf_names`` naming the classifiers to tune. A key
        that raises ``KeyError`` during tuning is reported on stdout and
        skipped.

    Returns
    -------
    out: :obj:`dict`
        The record of the best search, with keys *score* (``best_score_``),
        *results* (``cv_results_``), *hyperparams* (``best_params_``),
        *estimator* (``best_estimator_``), *clf_name* (the classifier key)
        and *scorers* (``scorer_``).

    Raises
    ------
    ValueError
        If no classifier could be tuned (``classifiers`` is empty or every
        key failed), so there is nothing to choose from.
    """
    hyperparams_data = []
    for clf_key in classifiers:
        try:
            print(f'Tuning {clf_key}...')
            # Resolve the classifier spec first so that an unknown key is
            # reported before any search object is constructed.
            clf_spec = self.clf_names[clf_key]
            if self.search_method.lower() == 'grid':
                clf = GridSearchCV(clf_spec[0](random_state=config.seed_no),
                                   clf_spec[1],
                                   cv=self.outer_cv,
                                   scoring=config.MLConf.score,
                                   verbose=1,
                                   n_jobs=self.n_jobs)
            # NOTE: a Hyperband-based search (HyperbandSearchCV over
            # 'n_estimators') was sketched here but is not implemented.
            else:
                # Randomized search is used as the default.
                clf = RandomizedSearchCV(clf_spec[0](),
                                         clf_spec[2],
                                         cv=self.outer_cv,
                                         scoring=config.MLConf.score,
                                         verbose=1,
                                         n_jobs=self.n_jobs,
                                         n_iter=self.n_iter)
            clf.fit(X, y)
            hyperparams_data.append({
                'score': clf.best_score_,
                'results': clf.cv_results_,
                'hyperparams': clf.best_params_,
                'estimator': clf.best_estimator_,
                'clf_name': clf_key,
                'scorers': clf.scorer_,
            })
        except KeyError as e:
            print("key error: {} for key: {}".format(str(e), clf_key))

    if not hyperparams_data:
        # Fail with an explicit message instead of max()'s opaque
        # "arg is an empty sequence" error.
        raise ValueError(
            f'No classifier could be tuned; requested: {classifiers!r}')

    # max() returns the first maximal element, so ties resolve to the
    # classifier that was tuned first.
    return max(hyperparams_data, key=lambda found: found['score'])
# Notebook cell output: the grid-search cross-validation table as a DataFrame.
pd.DataFrame(grid_search.cv_results_)

# In[103]:

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions that the randomized search samples from.
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(
    forest_reg,
    param_distributions=param_distribs,
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
rnd_search.fit(housing_prepared, housing_labels)

# In[104]:

# The scorer is a negated MSE, so flip the sign before taking the root to
# print one RMSE per sampled parameter setting.
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

# In[105]:

# Importances are read from the earlier grid search's best estimator (not
# from rnd_search) and echoed as the cell output.
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances
"model__epochs": epochs } from keras.wrappers.scikit_learn import KerasClassifier # classifier 분류 model = KerasClassifier(build_fn=build_network, verbose=1) # 사이킥런으로 랩핑을 하다. hyperparameters = create_hyperparameters() from sklearn.model_selection import RandomizedSearchCV from sklearn.model_selection import KFold, GridSearchCV # estimator=> model을 가져온다. from sklearn.preprocessing import MinMaxScaler from sklearn.pipeline import Pipeline from sklearn.svm import SVC # estimator=> model을 가져온다. pipe = Pipeline([("scaler", MinMaxScaler()), ('model', model)]) search = RandomizedSearchCV(pipe, hyperparameters, n_iter=10, n_jobs=15, cv=5, verbose=1) search.fit(x_train, y_train) print(search.best_params_) score = search.score(x_test, y_test) print("Score : ", score)
# Candidate values for each searched hyperparameter.
intermediate_dims = 128 * np.arange(1, 8)
latent_dims = np.arange(2, 10)
latent_activations = ['relu', 'elu']
kernel_initializers = [
    'glorot_normal',
    'glorot_uniform',
    'he_normal',
    'he_uniform',
    'lecun_normal',
    'lecun_uniform',
]

# kernel_initializers is defined above but currently left out of the search.
param_grid = {
    'n_hidden_layers': n_hidden_layers,
    # 'kernel_initializer': kernel_initializers,
    'intermediate_dim': intermediate_dims,
    'latent_dim': latent_dims,
    'latent_activation': latent_activations,
}

grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    error_score=np.nan,  # a failing candidate scores NaN instead of raising
    n_iter=n_RSCV_iters,
    cv=n_cv,
)
# NOTE(review): y_train is passed as both input and target -- presumably
# because the model reconstructs its own input; confirm.
grid_result = grid.fit(y_train, y_train)

# Timestamped filename template under $HOME/SaveFiles/.
time_stamp = int(time())
data_dir = '/SaveFiles/'
save_dir = f"{os.environ['HOME']}{data_dir}"
savename_tmplt = ''.join((save_dir, 'grid_vae_{}_{}'))

if verbose:
    print("[INFO] Saving fitted model every way that I know how.")

# if verbose: print("[INFO] Saving full model")
# grid.save(savename_tmplt.format(time_stamp, 'full_model_save.hdf5'))
# Generate the candidate input parameters. Some are uniform in the
# EXPONENTS: here alpha takes 10 values spaced evenly in log10 over [-3, 3].
alpha = 10 ** np.linspace(start=-3, stop=3, num=10)

# Gather the input parameters into a single dictionary to be passed to the
# search function.
parametros = dict(alpha=alpha)

shuffle_splitter = ShuffleSplit(n_splits=5, test_size=0.3, random_state=1234)
regressor = Ridge()
cv_results = RandomizedSearchCV(
    estimator=regressor,
    cv=shuffle_splitter,
    param_distributions=parametros,
    verbose=1,
    n_jobs=4,
    scoring="neg_root_mean_squared_error",
)

# Run the search by fitting on the training data.
cv_results.fit(X_data_scaled, y_data)

print("\n---------------------LINEAR_REGRESSION_L2-------------------")
print("\nMelhor conjunto de parâmetros: \n", cv_results.best_estimator_)
# The scorer is negated, so flip the sign to report a positive error.
print("\nMelhor error score: \n", -cv_results.best_score_)

# sklearn default. A list of 10 identical parameters is used only to avoid a
# warning; performance is not critical here.
alpha = [1.0] * 10