X = df.drop(["Customer Id", "Cost"], axis=1).values y = df['Cost'].values regressor = cb.CatBoostRegressor(verbose=0) param_grid = { "n_estimators": np.arange(100, 800, 100), "max_depth": np.arange(1, 20, 1), "learning_rate": np.arange(0.01, 0.1, 0.01) } model = model_selection.RandomizedSearchCV(estimator=regressor, param_distributions=param_grid, n_iter=5, scoring=make_scorer( calc_metric.calc_score, greater_is_better=True), verbose=10, cv=5, n_jobs=4) model.fit(X, np.log(y)) print() print("Best score: ", model.best_score_) print() print("Best params: ", model.best_params_) print() print("Best estimator: ", model.best_estimator_) print() # n_estimators=700, max_depth=3, score=-0.715, total= 1.7min
###############################################################################
# Selection of RF hyper-parameters by cross-validation
print("Selecting hyper-parameters")

param_dist = {
    "n_estimators": sp_randint(100, 500),
    "max_features": ['auto', 'sqrt']
}

model = ensemble.RandomForestClassifier(class_weight='balanced_subsample',
                                        n_jobs=ncores)
n_iter_search = 100
rf_model = model_selection.RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=n_iter_search,
    scoring='accuracy',  # try both accuracy and precision
    cv=5)
rf_model.fit(X_train, y_train)

print('Model selected: "%s"' % rf_model.best_estimator_)
print('Best score: "%s"' % rf_model.best_score_)
print('Best param: "%s"' % rf_model.best_params_)

###############################################################################
# Testing model performance
print("Testing model performance")
def train_nn():
    x, y, col_names = get_cleaned_data()
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.3, random_state=0)
    x_train, x_test, col_names = lasso_fs(x_train, y_train, x_test, y_test,
                                          col_names)

    # %% Neural net + grid search
    reg = neural_network.MLPRegressor(hidden_layer_sizes=(50,),
                                      activation='relu',
                                      solver='lbfgs',
                                      alpha=0.0001,
                                      batch_size='auto',
                                      learning_rate='adaptive',
                                      learning_rate_init=0.001,
                                      power_t=0.5,
                                      max_iter=2000,
                                      shuffle=True,
                                      random_state=0,
                                      tol=0.0001,
                                      verbose=False,
                                      warm_start=False,
                                      momentum=0.9,
                                      nesterovs_momentum=True)
    param_grid = {
        'alpha': [0.001, 0.01],
        'hidden_layer_sizes': [40, 50, 60],
        'activation': ['logistic'],
        'solver': ['lbfgs']
    }
    gscv = model_selection.GridSearchCV(reg,
                                        param_grid,
                                        scoring='neg_mean_absolute_error',
                                        refit=True,
                                        cv=3,
                                        verbose=2,
                                        return_train_score=True)

    # %% Neural net + randomized search
    reg_rn = neural_network.MLPRegressor(hidden_layer_sizes=(50,),
                                         activation='relu',
                                         solver='lbfgs',
                                         alpha=0.0001,
                                         batch_size='auto',
                                         learning_rate='adaptive',
                                         learning_rate_init=0.001,
                                         power_t=0.5,
                                         max_iter=2000,
                                         shuffle=True,
                                         random_state=0,
                                         tol=0.0001,
                                         verbose=False,
                                         warm_start=False,
                                         momentum=0.9,
                                         nesterovs_momentum=True)
    param_dist = {
        "hidden_layer_sizes": range(2, 100),
        "activation": ['relu', 'logistic'],
        'alpha': [0.001, 0.01, 0.0001],
    }
    # fit_params and iid were removed from scikit-learn, so they are not
    # passed here.
    rscv = model_selection.RandomizedSearchCV(
        reg_rn,
        param_dist,
        n_iter=20,
        scoring='neg_mean_absolute_error',
        n_jobs=1,
        refit=True,
        cv=None,
        verbose=2,
        pre_dispatch='2*n_jobs',
        random_state=None,
        error_score='raise',
        return_train_score=True)

    print('Using grid search CV')
    gscv.fit(x_train, y_train)
    reg = gscv.best_estimator_
    print(reg)
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)
    mae_g = mean_absolute_error(y_test, y_pred)
    rmse_g = np.sqrt(mean_squared_error(y_test, y_pred))
    print('MAE', mae_g)
    print('RMSE', rmse_g)

    print('Using random search CV')
    rscv.fit(x_train, y_train)
    reg_rn = rscv.best_estimator_
    print(reg_rn)
    reg_rn.fit(x_train, y_train)
    y_pred = reg_rn.predict(x_test)
    mae_r = mean_absolute_error(y_test, y_pred)
    rmse_r = np.sqrt(mean_squared_error(y_test, y_pred))
    print('MAE', mae_r)
    print('RMSE', rmse_r)

    # Grid-search metrics under gcv_*, random-search metrics under rcv_*.
    nn_res = {
        'gcv_mae': mae_g,
        'gcv_rmse': rmse_g,
        'rcv_mae': mae_r,
        'rcv_rmse': rmse_r
    }
    pickle.dump(nn_res, open("results/nn_res.p", "wb"))
X = df.drop(['Selling_Price', 'id'], axis=1).values
y = df.Selling_Price.values

regressor = ensemble.RandomForestRegressor(n_jobs=-1)

param_grid = {
    "n_estimators": np.arange(100, 1500, 100),
    "max_depth": np.arange(1, 31),
    # "mse"/"mae" were renamed in scikit-learn 1.0:
    "criterion": ["squared_error", "absolute_error"]
}

model = model_selection.RandomizedSearchCV(
    estimator=regressor,
    param_distributions=param_grid,
    n_iter=30,
    verbose=10,
    n_jobs=1,
    cv=5
)
model.fit(X, y)

print(f"best score: {model.best_score_}")
print("best parameter set:")
best_param = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print(f"\t{param_name} : {best_param[param_name]}")
"rf__max_depth": np.arange(1, 20), "rf__criterion": ["gini", "entropy"] } # model = model_selection.GridSearchCV( # estimator=classifier, # param_grid=hyper_params_for_grid_search, # n_jobs=1, # cv=5, # verbose=10, # scoring="accuracy" # ) model = model_selection.RandomizedSearchCV( estimator=classifier, param_distributions=hyper_params_for_pipeline_classifier, n_iter=10, n_jobs=1, verbose=10, scoring="accuracy", cv=5) model.fit(X, y) print(model.best_score_) # gives the best score for the model print( model.best_estimator_.get_params() ) # gives the best params # the main params are criterion, n_esitamtors and max_depth """ best params found out for the model.gridsearchcv {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': 7, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': -1, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df[all_columns], df[target], test_size=0.15, random_state=42)

model = lgb.LGBMClassifier(n_jobs=-1, random_state=42, metric='auc')

params = {
    "num_leaves": [20, 50],
    "max_depth": [8, 10, 12, 15],
    "n_estimators": [100, 250, 500],
    "learning_rate": [0.01, 0.1, 0.9],
    "subsample": [0.1, 0.20, 0.5, 0.7, 1]
}

search = model_selection.RandomizedSearchCV(model,
                                            params,
                                            cv=3,
                                            scoring='roc_auc',
                                            verbose=5000,
                                            n_iter=100)
search.fit(X_train, y_train)

df_search = pd.DataFrame(search.cv_results_)

# Pick the parameter set ranked first on the test folds.
best_pars = df_search['params'][df_search.rank_test_score == 1].iloc[0]
print(best_pars)

# Refit a fresh model on the full training split with the best parameters.
model = lgb.LGBMClassifier(n_jobs=-1, random_state=42, metric='auc',
                           **best_pars)
model.fit(X_train, y_train)
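# Equivalently, the rank-1 parameter set can be read straight off the fitted
# search object instead of filtering the cv_results_ DataFrame:
best_pars = search.best_params_
# And because refit=True by default, the search has already refit the best
# model on the whole training split:
refit_model = search.best_estimator_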
    'criterion': ['gini', 'entropy']
}

'''
Initialize the random search:
- estimator is the model that we have defined
- param_distributions is the grid of parameters
- we use accuracy as our metric
- a higher value of verbose means more details are printed
- cv=5 means that we are using 5-fold CV
- n_iter is the number of parameter settings to sample
If param_distributions has all the values as lists, random search samples
without replacement; if any of the parameters come from a distribution,
random search samples with replacement.
'''
model = model_selection.RandomizedSearchCV(estimator=classifier,
                                           param_distributions=params,
                                           n_iter=20,
                                           scoring='accuracy',
                                           verbose=10,
                                           n_jobs=-1,
                                           cv=5)

# Fit the model and extract the best score.
model.fit(x, y)
print(f'Best score : {model.best_score_}')
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print(f'\t {param_name} : {best_parameters[param_name]}')
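# A minimal sketch of the with-replacement case described above, assuming the
# same `classifier`: values are drawn from scipy.stats distributions instead
# of fixed lists, so each of the n_iter samples is drawn fresh and can repeat.
import scipy.stats

params_dist = {
    'max_depth': scipy.stats.randint(1, 20),               # ints in [1, 20)
    'min_samples_split': scipy.stats.uniform(0.01, 0.2),   # fraction of samples
    'criterion': ['gini', 'entropy'],                      # still a plain list
}
model_dist = model_selection.RandomizedSearchCV(estimator=classifier,
                                                param_distributions=params_dist,
                                                n_iter=20,
                                                scoring='accuracy',
                                                cv=5)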
scoring = make_scorer(f1_score, pos_label=options.positiveClass)

if options.classifier == "SVM":
    classifier = SVC()
    if options.kernel == 'rbf':
        paramGrid = {'C': scipy.stats.expon(scale=10),
                     'gamma': scipy.stats.expon(scale=.1),
                     'kernel': ['rbf'],
                     'class_weight': ['balanced']}
    elif options.kernel == 'linear':
        paramGrid = {'C': scipy.stats.expon(scale=10),
                     'kernel': ['linear'],
                     'class_weight': ['balanced']}
    elif options.kernel == 'poly':
        paramGrid = {'C': scipy.stats.expon(scale=10),
                     'gamma': scipy.stats.expon(scale=.1),
                     'degree': [2, 3],
                     'kernel': ['poly'],
                     'class_weight': ['balanced']}
    classifier_cv = model_selection.RandomizedSearchCV(classifier,
                                                       paramGrid,
                                                       cv=10,
                                                       n_jobs=30,
                                                       verbose=3,
                                                       scoring=scoring,
                                                       random_state=42)

if options.classifier == "MLP":
    classifier = MLPClassifier()
    paramGrid = {'hidden_layer_sizes': [(x,) for x in sample(range(30, 101), 2)],
                 'max_iter': sample(range(80, 201), 2)}
    classifier_cv = model_selection.GridSearchCV(classifier,
                                                 paramGrid,
                                                 cv=10,
                                                 n_jobs=30,
                                                 verbose=3,
                                                 scoring=scoring)

t1 = time()
print(" Training and cross validation...")
classifier_cv.fit(matrixTraining, trueTrainingClasses)
best_score = classifier_cv.best_score_
best_parameters = classifier_cv.best_estimator_.get_params()
print(" Training and cross validation done in {:.2f} sec".format(time() - t1))
print("\n\n\n----------------------------------- Random Forest -----------------------------------------") # Random forest classifier = RandomForestClassifier() algorithmName = "RandomForest" paramGrid = { 'n_estimators': [100, 150,200,300], 'bootstrap': [True, False], 'criterion': ["gini", "entropy"], 'class_weight': ['balanced', None], } myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, cv=crossV, n_jobs=jobs, scoring=myScorer) myClassifier.fit(X_train, y_train) predict = classificator_score(myClassifier, X_test) print(classification_report(y_test, predict)) print("\n\n\n----------------------------------- SGDClassifier -----------------------------------------") classifier = SGDClassifier(loss = 'log') algorithmName = "SGDClassifier" paramGrid = {'alpha' : [10**(-x) for x in range(7)], 'penalty' : ['elasticnet', 'l1', 'l2'], 'l1_ratio' : [0.15, 0.25, 0.5, 0.75],
from sklearn import svm
import scipy

# Data: Iris
X = datasets.load_iris().data
y = datasets.load_iris().target
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=1)

""" 1. Grid search """
# Set up the hyperparameter grid.
hyperparams = {
    'C': [1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['auto', 'scale', 1, 2, 3]
}
# Set up the model.
clf = model_selection.GridSearchCV(svm.SVC(), hyperparams, cv=10)
clf.fit(X, y)

print(f'Best hyperparameters: {clf.best_params_}')
print(f'Top score: {clf.best_score_}')

""" 2. Randomized search (hyperparameter values are drawn at random from
continuous distributions provided by scipy.stats) """
# Set up the hyperparameter distributions.
hyperparams = {'C': scipy.stats.expon(), 'gamma': scipy.stats.uniform()}
# Set up the model.
clf = model_selection.RandomizedSearchCV(svm.SVC(), hyperparams, cv=10,
                                         random_state=1)
clf.fit(X, y)

print(f'Best hyperparameters: {clf.best_params_}')
print(f'Top score: {clf.best_score_}')
def train_cv_model(init_model, X, y, n_splits=10, training_size=0.7,
                   test_size=0.3, search="Grid"):
    cv, indices, cos_theta = data_split.imitate_split(
        y, n_splits, training_size, test_size, cos_theta_lim=0.8)
    print("cos_theta_list:", cos_theta)
    ## cv = model_selection.ShuffleSplit(n_splits, training_size, test_size, random_state=0)

    train_val_data = []
    train_val_scores = []
    train_val_params = []
    model = None

    if search == "Grid":
        optimize_parameters = {
            "alpha": [0.0001, 0.0002, 0.0003, 0.0004, 0.0005],
            "max_iter": [200, 500, 1000, 1400, 1700, 2000, 2500, 3000],
            "random_state": [0, 1, 5, 10, 20, 30, 50, 60, 80, 100]
        }
        model = model_selection.GridSearchCV(
            init_model,
            optimize_parameters,
            cv=cv,
            refit="r2",
            scoring=("r2", "neg_mean_squared_error"),
            return_train_score=True,
            n_jobs=4)
    elif search == "Random":
        optimize_parameters = {
            "alpha": scipy_expon(scale=0.0005),
            "max_iter": scipy_randint(200, 3000),
            "random_state": scipy_randint(0, 100)
        }
        model = model_selection.RandomizedSearchCV(
            init_model,
            optimize_parameters,
            refit="r2",
            scoring=("r2", "neg_mean_squared_error"),
            cv=cv,
            n_iter=100,
            return_train_score=True,
            n_jobs=4)
    else:
        raise ValueError('Bad input for parameter "search"!')

    model.fit(X, y)

    # Collect the mean train/validation scores across CV folds.
    mean_train_r2 = model.cv_results_.get("mean_train_r2")
    mean_val_r2 = model.cv_results_.get("mean_test_r2")
    mean_train_neg_mean_squared_error = model.cv_results_.get(
        "mean_train_neg_mean_squared_error")
    mean_val_neg_mean_squared_error = model.cv_results_.get(
        "mean_test_neg_mean_squared_error")
    train_val_scores.extend([
        mean_train_r2, mean_val_r2, mean_train_neg_mean_squared_error,
        mean_val_neg_mean_squared_error
    ])

    # Predictions on the train/validation split.
    X_train, X_val = X[indices[0]], X[indices[1]]
    y_train, y_val = y[indices[0]], y[indices[1]]
    prediction_train = model.predict(X_train)
    prediction_val = model.predict(X_val)
    train_val_data.extend(
        [X_train, X_val, y_train, y_val, prediction_train, prediction_val])

    # Record the parameter space and the best parameters found.
    if search == "Grid":
        train_val_params.extend([optimize_parameters, model.best_params_])
    elif search == "Random":
        tmp_params = {}
        params = model.cv_results_.get("params")
        keys = list(params[0].keys())
        for key in keys:
            tmp_params.update({key: []})
        for params_dict in params:
            for key in keys:
                tmp_params.get(key).append(params_dict.get(key))
        for key in keys:
            tmp_params.get(key).sort()
        train_val_params.extend([tmp_params, model.best_params_])

    return model, train_val_data, train_val_scores, train_val_params
def randomized_search_cv(
    X_fit,
    y_fit,
    X_train,
    y_train,
    X_val,
    y_val,
    model,
    params_dist,
    scorer,
    cv,
    n_jobs,
    random_search_params,
    log_residuals,
):
    if random_search_params[0]:
        print("\n-------------- Randomized SearchCV started....")
        pprint(f"Parameters' distributions: {params_dist}")
        model_name = type(model).__name__

        # Set up the MLflow tracking server.
        exp_id = mlflow_set_exp_id("Model:Fit")
        run_name = f"{model_name}-rand"

        ## Enable autologging
        mlflow.sklearn.autolog(log_model_signatures=False)
        print(f"Autologging {model_name} started...")

        # Define the randomized search.
        random_search = model_selection.RandomizedSearchCV(
            model,
            param_distributions=params_dist,
            n_iter=random_search_params[1],  # default 10
            scoring=scorer,
            n_jobs=n_jobs,
            cv=cv,
            refit=True,
            return_train_score=True,
            verbose=3,
            random_state=rnd_state,
        )

        ##* Fit the model with MLflow logging
        with mlflow.start_run(experiment_id=exp_id, run_name=run_name):
            tic = time.time()
            model_random_search = random_search.fit(X_fit, y_fit)
            mins, secs = divmod(time.time() - tic, 60)

            ## Disable autologging
            mlflow.sklearn.autolog(disable=True)

            # Log custom metrics and data.
            print(f"Randomized search took: {int(mins)}min {int(secs)}sec")
            print("Log custom metrics...")
            log_custom_metrics(model_random_search, X_train, y_train,
                               X_val, y_val)
            if log_residuals:
                log_model_residuals(model_random_search, X_train, y_train,
                                    X_val, y_val)

        print(f"Randomized search: Best params are:\n {model_random_search.best_params_}")
        print(f"{model_name.title()}: Random search:")
        print_custom_metrics(model_random_search, X_train, y_train, X_val,
                             y_val)
        winsound.Beep(frequency=2000, duration=300)  # Windows-only beep

        return (model, model_random_search.best_estimator_,
                model_random_search.best_params_)
    else:
        print("\nSkipping the randomized search....")
        return model, None, None
# ---- Random forest training with hyperparameter tuning
random_grid = {'n_estimators': [10, 100, 500, 1000],
               'max_features': [0.25, 0.50, 0.75],
               'max_depth': [5, 10, 20, 25],
               'min_samples_split': [10, 20],
               'min_samples_leaf': [5, 7, 10],
               'bootstrap': [True, False],
               'random_state': [random_seed]}

print('> Random Forest classifier...')
optimized_rfc = skms.RandomizedSearchCV(estimator=RandomForestClassifier(),
                                        param_distributions=random_grid,
                                        n_iter=100,
                                        cv=5,
                                        scoring=['roc_auc', 'recall'],
                                        refit='roc_auc',
                                        verbose=1,
                                        n_jobs=-1,
                                        random_state=random_seed)
optimized_rfc.fit(X_train, y_train)
print('\n')

# ---- Obtaining results of the grid run
cv_results = optimized_rfc.cv_results_
cv_results_df = pd.DataFrame(cv_results)
print('> hyperparameter tuning results')
print(cv_results_df)
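# With the multi-metric scoring above, cv_results_ carries one set of columns
# per metric; a minimal sketch of reading both from the DataFrame just built:
print(cv_results_df[['mean_test_roc_auc', 'mean_test_recall']].head())
# best_score_ refers only to the refit metric (roc_auc here).
print(optimized_rfc.best_score_)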
def hyper_parameter_optimization_example():
    from time import time
    from scipy.stats import randint as sp_randint

    # Get some data.
    digits = datasets.load_digits()
    X, y = digits.data, digits.target

    # Build a classifier.
    clf = ensemble.RandomForestClassifier(n_estimators=20)

    # Utility function to report best scores.
    def report(results, n_top=3):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            for candidate in candidates:
                print('Model with rank: {0}'.format(i))
                print('Mean validation score: {0:.3f} (std: {1:.3f})'.format(
                    results['mean_test_score'][candidate],
                    results['std_test_score'][candidate]))
                print('Parameters: {0}'.format(results['params'][candidate]))
                print('')

    # Specify parameters and distributions to sample from.
    param_dist = {
        'max_depth': [3, None],
        'max_features': sp_randint(1, 11),
        'min_samples_split': sp_randint(2, 11),
        'min_samples_leaf': sp_randint(1, 11),
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
    }

    # Run randomized search.
    n_iter_search = 20
    random_search = model_selection.RandomizedSearchCV(
        clf, param_distributions=param_dist, n_iter=n_iter_search)
    start = time()
    random_search.fit(X, y)
    print('RandomizedSearchCV took %.2f seconds for %d candidate parameter settings.'
          % ((time() - start), n_iter_search))
    report(random_search.cv_results_)

    # Use a full grid over all parameters.
    param_grid = {
        'max_depth': [3, None],
        'max_features': [1, 3, 10],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
    }

    # Run grid search.
    #os.environ["OMP_NUM_THREADS"] = "2"
    grid_search = model_selection.GridSearchCV(clf, param_grid=param_grid,
                                               verbose=1, n_jobs=2)
    start = time()
    grid_search.fit(X, y)
    print('GridSearchCV took %.2f seconds for %d candidate parameter settings.'
          % (time() - start, len(grid_search.cv_results_['params'])))
    report(grid_search.cv_results_)
classifier = pipeline.Pipeline([
    ('scaling', scl),
    ('pca', pca),
    ('rf', rf)
])

param_grid = {
    "pca__n_components": np.arange(5, 10),
    "rf__n_estimators": np.arange(100, 1500, 100),  # for grid search: [100, 200, 300, 400]
    "rf__max_depth": np.arange(1, 20),  # for grid search: [1, 3, 5, 7, 9, 11]
    "rf__criterion": ['gini', 'entropy']
}

model = model_selection.RandomizedSearchCV(
    estimator=classifier,
    param_distributions=param_grid,
    n_iter=15,
    scoring="accuracy",
    verbose=10,
    n_jobs=1,
    cv=5
)
model.fit(X, y)
print(model.best_score_)
print(model.best_estimator_.get_params())
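# The double-underscore keys above follow scikit-learn's pipeline convention
# "<step name>__<parameter>", which routes each sampled value to the right
# step. The legal names can be listed from the pipeline itself:
print(sorted(classifier.get_params().keys()))
# e.g. [..., 'pca__n_components', ..., 'rf__criterion', 'rf__max_depth', ...]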
def hyper_parameter_tuning_example():
    # Hyper-parameters are parameters that are not directly learnt within estimators.
    # In scikit-learn they are passed as arguments to the constructor of the estimator classes.
    # Typical examples include C, kernel and gamma for Support Vector Classifier, alpha for Lasso, etc.
    # It is possible and recommended to search the hyper-parameter space for the best cross-validation score.
    # Any parameter provided when constructing an estimator may be optimized in this manner.
    # Specifically, to find the names and current values of all parameters for a given estimator, call:
    #   estimator.get_params()
    """
    parameters = {
        'C': scipy.stats.expon(scale=100),
        'gamma': scipy.stats.expon(scale=0.1),
        'kernel': ['rbf'],
        'class_weight': ['balanced', None],
    }
    parameters = {
        'C': utils.fixes.loguniform(1e0, 1e3),
        'gamma': utils.fixes.loguniform(1e-4, 1e-3),
        'kernel': ['rbf'],
        'class_weight': ['balanced', None]
    }
    """

    #--------------------
    # Exhaustive grid search.
    # REF [site] >> https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    if True:
        iris = datasets.load_iris()

        #param_grid = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
        param_grid = [
            {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
            {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
        ]

        clf = svm.SVC()
        search = model_selection.GridSearchCV(clf, param_grid)
        search.fit(iris.data, iris.target)

        print('CV keys = {}.'.format(sorted(search.cv_results_.keys())))
        print(pd.DataFrame(search.cv_results_))
        print('Best params: {}.'.format(search.best_params_))
        print('Best estimator: {}.'.format(search.best_estimator_))
        print('Best score = {}.'.format(search.best_score_))

    #--------------------
    # Randomized search.
    # REF [site] >> https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
    if True:
        iris = datasets.load_iris()

        clf = linear_model.LogisticRegression(solver='saga', tol=1e-2,
                                              max_iter=200, random_state=0)
        param_distributions = {
            'C': scipy.stats.uniform(loc=0, scale=4),
            'penalty': ['l2', 'l1']
        }
        search = model_selection.RandomizedSearchCV(clf, param_distributions,
                                                    random_state=0)
        search = search.fit(iris.data, iris.target)

        print('CV keys = {}.'.format(sorted(search.cv_results_.keys())))
        print(pd.DataFrame(search.cv_results_))
        print('Best params: {}.'.format(search.best_params_))
        print('Best estimator: {}.'.format(search.best_estimator_))
        print('Best score = {}.'.format(search.best_score_))

    #--------------------
    # Randomized parameter optimization.
    # REF [site] >> https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html
    if True:
        X, y = datasets.load_digits(return_X_y=True, n_class=3)

        # Build a classifier.
        clf = linear_model.SGDClassifier(loss="hinge", penalty="elasticnet",
                                         fit_intercept=True)

        # Utility function to report best scores.
        def report(results, n_top=3):
            for i in range(1, n_top + 1):
                candidates = np.flatnonzero(results["rank_test_score"] == i)
                for candidate in candidates:
                    print("Model with rank: {0}".format(i))
                    print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                        results["mean_test_score"][candidate],
                        results["std_test_score"][candidate]))
                    print("Parameters: {0}".format(results["params"][candidate]))
                    print("")

        # Specify parameters and distributions to sample from.
        param_dist = {
            "average": [True, False],
            "l1_ratio": scipy.stats.uniform(0, 1),
            "alpha": utils.fixes.loguniform(1e-2, 1e0),
        }

        # Run randomized search.
        n_iter_search = 15
        random_search = model_selection.RandomizedSearchCV(
            clf, param_distributions=param_dist, n_iter=n_iter_search)
        start = time.time()
        random_search.fit(X, y)
        print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
              % ((time.time() - start), n_iter_search))
        report(random_search.cv_results_)
        #print('CV keys = {}.'.format(sorted(random_search.cv_results_.keys())))
        #print(pd.DataFrame(random_search.cv_results_))

        # Use a full grid over all parameters.
        param_grid = {
            "average": [True, False],
            "l1_ratio": np.linspace(0, 1, num=10),
            "alpha": np.power(10, np.arange(-2, 1, dtype=float)),
        }

        # Run grid search.
        grid_search = model_selection.GridSearchCV(clf, param_grid=param_grid)
        start = time.time()
        grid_search.fit(X, y)
        print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
              % (time.time() - start, len(grid_search.cv_results_["params"])))
        report(grid_search.cv_results_)
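# As the comment at the top of hyper_parameter_tuning_example() notes, the
# tunable names come from estimator.get_params(); a minimal sketch for an SVC:
from sklearn import svm

print(svm.SVC().get_params())
# {'C': 1.0, 'break_ties': False, ..., 'gamma': 'scale', 'kernel': 'rbf', ...}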
def get_best_model(self, model_name, binary=False, standarize=False,
                   feature_sel=False):
    """
    Train all the classifiers.

    Parameters
    ----------
    model_name : string
        Name of the model to test. The implemented ones are
        `svc`: Support Vector Machines
        `logit`: Logistic Regression
        `forest`: Random Forest
        `knn`: K-Nearest Neighbor
    binary : bool
        If True the classes are converted to 0 / 1 (no porpoise / porpoise)
        instead of noise/lq/hq clicks
    standarize : bool
        Set to True if the variables should be standardized before training
    feature_sel : bool
        Set to True if the best features should be selected instead of all of them

    Returns
    -------
    Dictionary with the name of the model as key and another dictionary as
    value with ind_vars, model, binary as keys and their respective
    representations as values (it is also added to the "models" property
    of the class)
    """
    x = self.train_data[self.ind_vars]
    y = self.train_data[self.dep_var]
    if binary:
        # Convert the classes to 0 (no porpoise) or 1 (porpoise).
        y = self.convert2binary(y)

    # If standarize is requested, append it to the pipeline steps.
    steps = []
    if standarize:
        # Standardize the data.
        scaler = preprocessing.StandardScaler()
        steps.append(('scaler', scaler))

    # Some common parameters.
    tol = 1e-3
    gamma = utils.fixes.loguniform(1e-4, 1000)
    c_values = utils.fixes.loguniform(0.1, 1000)
    class_weight = ['balanced', None]

    # Get the model.
    if model_name == 'svc':
        # List all the parameter values that should be checked.
        kernel_list = ['poly', 'rbf']
        degree = stats.randint(1, 4)
        param_distr = {
            'degree': degree,
            'C': c_values,
            'gamma': gamma,
            'kernel': kernel_list
        }
        # Classifier with fixed values.
        clf = svm.SVC(tol=tol, cache_size=500, probability=True, max_iter=500)
    elif model_name == 'logit':
        penalty = ['l1', 'l2', 'elasticnet', 'none']
        param_distr = {
            'penalty': penalty,
            'C': c_values,
            'class_weight': class_weight
        }
        clf = linear_model.LogisticRegression()
    elif model_name == 'forest':
        n_estimators = stats.randint(100, 300)
        param_distr = {'n_estimators': n_estimators}
        clf = ensemble.RandomForestClassifier()
    elif model_name == 'knn':
        n_neighbors = stats.randint(2, 9)
        algorithm = ['auto', 'ball_tree', 'kd_tree']
        param_distr = {'n_neighbors': n_neighbors, 'algorithm': algorithm}
        clf = neighbors.KNeighborsClassifier()
    else:
        raise Exception('%s is not implemented!' % model_name)

    if feature_sel:
        # selection = feature_selection.RFECV(estimator=svm.LinearSVC(), step=1, scoring='roc_auc')
        selection = feature_selection.SelectFromModel(
            ensemble.ExtraTreesClassifier(n_estimators=50))
        # selection = feature_selection.SelectFromModel(svm.LinearSVC())
        # Add the feature selection to the steps.
        steps.append(('feature_selection', selection))

    # Search for the best parameters.
    gm_cv = model_selection.RandomizedSearchCV(
        estimator=clf,
        scoring='roc_auc',
        param_distributions=param_distr,
        n_iter=100)
    steps.append(('classification', gm_cv))

    # Create the pipeline and fit it.
    model = pipeline.Pipeline(steps)
    model.fit(x, y)

    if feature_sel:
        ind_vars = model['feature_selection'].transform(
            self.test_data[self.ind_vars])
    else:
        ind_vars = self.ind_vars
    print(model['classification'].best_estimator_)

    self.models[model_name] = {
        'ind_vars': ind_vars,
        'model': model,
        'binary': binary
    }

    # Save the model as a pickle file!
    pickle.dump(model, open('pyporcc/models/%s.pkl' % model_name, 'wb'))

    return self.models[model_name]
X = data.drop(['Price', 'kfold'], axis=1).values
y = data.Price.values

forest = ensemble.RandomForestRegressor(n_jobs=-1)

params = {
    "n_estimators": np.arange(100, 1500, 100),
    # "mse"/"mae" were renamed in scikit-learn 1.0:
    "criterion": ["squared_error", "absolute_error"],
    "max_depth": np.arange(1, 31)
}

model = model_selection.RandomizedSearchCV(
    estimator=forest,
    cv=5,
    verbose=10,
    param_distributions=params,
    n_iter=20,
    n_jobs=1,
)
model.fit(X, y)

print(f"best score = {model.best_score_}")
print(f"best params = {model.best_params_}")
"""
The best parameters are not reported here: a single iteration on one fold
took about 20 minutes to train. Rerun on a faster CPU to reproduce.
"""
# Get the logic or model learned by the algorithm.
# Issue: not readable.
print(final_estimator.tree_)

# Get a readable tree structure from the tree_ object:
# visualize the decision tree.
dot_data = io.StringIO()
tree.export_graphviz(final_estimator, out_file=dot_data,
                     feature_names=X_train.columns)
graph = pydot.graph_from_dot_data(dot_data.getvalue())[0]
os.chdir("C:\\Users\\vesuraju\\OneDrive - DXC Production\\Venkat\\Personal\\Trainings\\ML\\Classes_Year 2020\\Codes_2020\\Datasets\\Submissions")
graph.write_pdf("tree_GridsearchCV.pdf")

# Random search (return_train_score=True so that mean_train_score is
# available in cv_results_ below).
dt_rand_estimator = model_selection.RandomizedSearchCV(dt_estimator, dt_grid,
                                                       cv=10, n_iter=20,
                                                       return_train_score=True)
dt_rand_estimator.fit(X_train, y_train)

# Access the results.
print(dt_rand_estimator.best_params_)
print(dt_rand_estimator.best_score_)
final_estimator_rand = dt_rand_estimator.best_estimator_

results = dt_rand_estimator.cv_results_
print(results.get("mean_test_score"))
print(results.get("mean_train_score"))
print(results.get("params"))

# Get the logic or model learned by the algorithm.
# Issue: not readable.
print(final_estimator_rand.tree_)
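# Where a rendered PDF is not needed, scikit-learn can also print the fitted
# tree as plain text, which addresses the "not readable" issue above without
# graphviz (assumes the fitted final_estimator_rand and X_train from above):
print(tree.export_text(final_estimator_rand,
                       feature_names=list(X_train.columns)))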
        paramGrid = {
            'C': scipy.stats.expon(scale=100),
            'kernel': ['linear'],
            'class_weight': ['balanced', None]
        }
    elif args.kernel == 'poly':
        paramGrid = {
            'C': scipy.stats.expon(scale=100),
            'gamma': scipy.stats.expon(scale=.1),
            'degree': [2, 3],
            'kernel': ['poly'],
            'class_weight': ['balanced', None]
        }
    myClassifier = model_selection.RandomizedSearchCV(classifier,
                                                      paramGrid,
                                                      n_iter=nIter,
                                                      cv=crossV,
                                                      n_jobs=jobs,
                                                      verbose=3)
elif args.classifier == 'BernoulliNB':
    # BernoulliNB
    classifier = BernoulliNB()
    paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
    myClassifier = model_selection.RandomizedSearchCV(classifier,
                                                      paramGrid,
                                                      n_iter=nIter,
                                                      cv=crossV,
                                                      n_jobs=jobs,
                                                      verbose=3,
                                                      scoring=myScorer)
elif args.classifier == 'MultinomialNB':
# Load data.
train = pd.read_csv("train_macro.csv")
test = pd.read_csv("test_macro.csv")

# Features which will be used.
features = [col for col in train.columns
            if col not in ['id', 'timestamp', 'price_doc', 'price_log',
                           'price_per_sq']]

""" Model 2 - Random Forest """
rf_param_distr = dict(n_estimators=scipy.stats.randint(1, 300 + 1),
                      max_features=scipy.stats.uniform(loc=0.1, scale=0.9),
                      max_depth=scipy.stats.randint(1, 20 + 1),
                      min_samples_split=scipy.stats.randint(2, 20 + 1),
                      min_samples_leaf=scipy.stats.randint(1, 30 + 1))

rf_rand_param_search = model_selection.RandomizedSearchCV(
    estimator=ensemble.RandomForestRegressor(),
    param_distributions=rf_param_distr,
    n_iter=200,
    n_jobs=2,
    cv=5,
    verbose=20)
rf_rand_param_search.fit(train[features].values, train.price_doc.values)

psr_rf = param_search_res(rf_rand_param_search.cv_results_)
pickle.dump(psr_rf, open("psr_rf", "wb"))

print(rf_rand_param_search.best_params_)  # best score = 0.67122194