def gridSearch(self, regressor, X_train, y_train):
    parameters = {
        'depth': sp_randInt(6, 10),
        'learning_rate': sp_randFloat(),
        'iterations': sp_randInt(600, 900)
    }
    randm = RandomizedSearchCV(estimator=regressor,
                               param_distributions=parameters,
                               cv=3,
                               n_iter=4,
                               n_jobs=8)
    randm.fit(X_train, y_train)

    # Results from Random Search
    print("\n========================================================")
    print(" Results from Random Search ")
    print("========================================================")
    print("\n The best estimator across ALL searched params:\n", randm.best_estimator_)
    print("\n The best score across ALL searched params:\n", randm.best_score_)
    print("\n The best parameters across ALL searched params:\n", randm.best_params_)

    # new CatBoost model using the best parameters
    regressor = CatBoostRegressor(
        iterations=randm.best_params_['iterations'],
        learning_rate=randm.best_params_['learning_rate'],
        depth=randm.best_params_['depth'],
        od_type='IncToDec')
    return regressor, randm.best_params_
# Preprocessing
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    shuffle=True,
                                                    test_size=0.2,
                                                    random_state=42)
print(x_train.shape)   # (3628, 110336)
print(x_test.shape)    # (908, 110336)
print(y_train.shape)   # (3628,)
print(y_test.shape)    # (908,)

scaler = MinMaxScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model setup
model = HistGradientBoostingClassifier(verbose=1, random_state=42)

parameters = {"learning_rate": sp_randFloat(),
              "max_iter": [1000, 1200, 1500],
              "l2_regularization": [1.5, 0.5, 0, 1],
              "max_depth": sp_randInt(4, 10)}

randm = RandomizedSearchCV(estimator=model,
                           param_distributions=parameters,
                           cv=2,
                           n_iter=10,
                           n_jobs=-1)
randm.fit(x_train, y_train)

print(" Results from Random Search ")
print("The best estimator across ALL searched params:", randm.best_estimator_)
print("The best score across ALL searched params:", randm.best_score_)
print("The best parameters across ALL searched params:", randm.best_params_)

end_now = datetime.datetime.now()
model_list.append(('RF_4', 'Random Forest Algorithm: PS-4', trained_model,
                   accuracy, conf_matrix, class_report, kappa_score))

###############################################################################
######   Automatic tuning of parameter settings using RandomizedSearchCV  #####
###############################################################################

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Model setup
model = RandomForestClassifier()

parameters = {'max_depth': sp_randInt(4, 10),
              'criterion': ['gini', 'entropy'],
              'max_features': ['auto', 'sqrt', 'log2'],
              'n_estimators': sp_randInt(100, 1000),
              'min_impurity_decrease': sp_randFloat(),
              }

random = RandomizedSearchCV(estimator=model,
                            param_distributions=parameters,
                            cv=KFold,
                            n_iter=10,
                            verbose=1,
                            n_jobs=10)
random.fit(X_train, y_train)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Randomized search results
print("\n =========================================================")
print(" Random Search Results ")
print("============================================================")
def main():
    dataset = pd.read_csv('tcd-ml-1920-group-income-train.csv')
    train = dataset.copy()
    train = train[:1044560]
    train = train.drop_duplicates(subset='Instance', keep='first', inplace=False)
    train = train.drop(columns=['Instance'])
    train = train.drop_duplicates(inplace=False)
    y = np.log(train['Total Yearly Income [EUR]'])
    train = train.drop(columns=[
        'Total Yearly Income [EUR]',
        'Hair Color',
        'Housing Situation',
        'Wears Glasses',
    ])
    train = changeSizeOfCity(train)
    train = degree(train)
    train = genderCleaning(train)
    train = bodyHeight(train)
    train = profession(train)
    train = satisfaction(train)
    train = work_experience(train)
    train = processAdditionToSalary(train)
    train = crime(train)

    # Encode categorical columns with target encoding
    print("Start Dummies")
    te = TargetEncoder()
    train[['Gender', 'Country', 'Profession', 'University Degree']] = te.fit_transform(
        train[['Gender', 'Country', 'Profession', 'University Degree']], y)
    print("End Dummies")

    # CatBoost regressor creation
    regressor = CatBoostRegressor(od_type='IncToDec')

    # split data 80/20
    X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2)

    # create an evaluation dataset to stop overfitting
    eval_dataset = Pool(X_test, y_test)

    # randomized search for optimal parameters
    print('Fitting')
    parameters = {
        'depth': sp_randInt(3, 14),
        'learning_rate': sp_randFloat(),
        'iterations': sp_randInt(800, 1200)
    }
    randm = RandomizedSearchCV(estimator=regressor,
                               param_distributions=parameters,
                               cv=4,
                               n_iter=10,
                               n_jobs=5)
    randm.fit(X_train, y_train)

    # Results from Random Search
    print("\n========================================================")
    print(" Results from Random Search ")
    print("========================================================")
    print("\n The best estimator across ALL searched params:\n", randm.best_estimator_)
    print("\n The best score across ALL searched params:\n", randm.best_score_)
    print("\n The best parameters across ALL searched params:\n", randm.best_params_)

    # new CatBoost model using the best parameters
    regressor = CatBoostRegressor(
        iterations=randm.best_params_['iterations'],
        learning_rate=randm.best_params_['learning_rate'],
        depth=randm.best_params_['depth'],
        od_type='IncToDec',
        use_best_model=True)

    test_dataset = pd.read_csv('tcd-ml-1920-group-income-test.csv')
    predict_X = test_dataset
    X_train = train
    y_train = y
    predict_y = predict_X.pop('Total Yearly Income [EUR]')
    predict_X = year(predict_X)
    predict_X = predict_X.drop(columns=[
        'Instance',
        'Hair Color',
        'Housing Situation',
        'Wears Glasses',
    ])

    # clean test data
    predict_X = changeSizeOfCity(predict_X)
    predict_X = degree(predict_X)
    predict_X = genderCleaning(predict_X)
    predict_X = bodyHeight(predict_X)
    predict_X = profession(predict_X)
    predict_X = work_experience(predict_X)
    predict_X = processAdditionToSalary(predict_X)
    predict_X = satisfaction(predict_X)
    predict_X = crime(predict_X)
    predict_X[['Gender', 'Country', 'Profession', 'University Degree']] = te.transform(
        predict_X[['Gender', 'Country', 'Profession', 'University Degree']],
        predict_y)

    X_train, predict_X = train.align(predict_X, join='outer', axis=1, fill_value=0)

    print('Fitting Test Data')
    regressor.fit(X_train, y_train, eval_set=eval_dataset)

    # predict with the trained model
    pred2 = regressor.predict(predict_X)
    output = pd.read_csv('tcd-ml-1920-group-income-submission.csv')
    instance = output['Instance']
    output.pop('Instance')
    a = pd.DataFrame.from_dict({
        'Instance': instance,
        'Total Yearly Income [EUR]': np.exp(pred2)
    })
    a.to_csv("tcd-ml-1920-group-income-submission.csv", index=False)

    y_pred = regressor.predict(X_test)
    print('MAE is: {}'.format(
        mean_absolute_error(np.exp(y_test), np.exp(y_pred))))
# parameters = {'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
#                                 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16],
#               'subsample'    : [0.9, 0.5, 0.2, 0.1],
#               'n_estimators' : [100, 500, 1000, 1500],
#               'max_depth'    : [4, 6, 8, 10]
#               }

GBR = GradientBoostingRegressor(
    random_state=1,
    verbose=1,
)
# grid_GBR = GridSearchCV(estimator=GBR, param_grid=parameters, cv=2, n_jobs=-1)

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

# repeat the randomized search until the best CV score reaches 0.8
score = 0
while score < 0.8:
    # model = GradientBoostingRegressor()
    parameters = {'learning_rate': sp_randFloat(),
                  'subsample': sp_randFloat(),
                  'n_estimators': sp_randInt(100, 1000),
                  'max_depth': sp_randInt(4, 10)
                  }

    grid_GBR = RandomizedSearchCV(estimator=GBR,
                                  param_distributions=parameters,
                                  cv=2,
                                  n_iter=10,
                                  n_jobs=-1)
    # fit features against the POWER target shifted 24 steps ahead
    grid_GBR.fit(df1_train[vars][:-24], df1_train['POWER'][24:])

    print(" Results from Grid Search ")
    print("\n The best estimator across ALL searched params:\n", grid_GBR.best_estimator_)
    print("\n The best score across ALL searched params:\n", grid_GBR.best_score_)
    print("\n The best parameters across ALL searched params:\n", grid_GBR.best_params_)
    score = grid_GBR.best_score_
                   class_report, kappa_score))

###############################################################################
######   Automatic tuning of parameter settings using RandomizedSearchCV  #####
###############################################################################

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Model setup
model = MLPClassifier()

parameters = {'activation': ['relu', 'tanh'],
              'solver': ['sgd', 'adam'],
              'learning_rate': ['constant', 'adaptive'],
              'hidden_layer_sizes': [100, 200, 300],
              'max_iter': sp_randInt(50, 300),
              'batch_size': [10, 30, 50, 70],
              'learning_rate_init': sp_randFloat()
              }

random = RandomizedSearchCV(estimator=model,
                            param_distributions=parameters,
                            cv=KFold,
                            n_iter=10,
                            verbose=1,
                            n_jobs=10)
random.fit(X_train, y_train)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Randomized search results
print("\n =========================================================")
print(" Random Search Results ")
print("============================================================")
print("\n The best estimator :\n", random.best_estimator_)
print("\n The best score :\n", random.best_score_)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Collect performance results
model_list.append(('XGBoost_4', 'XGBoost Algorithm: PS-4', trained_model,
                   accuracy, conf_matrix, class_report, kappa_score))

###############################################################################
######   Automatic tuning of parameter settings using RandomizedSearchCV  #####
###############################################################################

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Model setup
model = xgb.XGBClassifier()

parameters = {'max_depth': sp_randInt(4, 10),
              'gamma': sp_randFloat(),
              'learning_rate': sp_randFloat(),
              'n_estimators': sp_randInt(100, 1000)
              }

random = RandomizedSearchCV(estimator=model,
                            param_distributions=parameters,
                            cv=KFold,
                            verbose=1,
                            n_iter=10)
random.fit(X_train, y_train)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Randomized search results
print("\n =========================================================")
print(" Random Search Results ")
print("============================================================")
                                                   test_size=0.2,
                                                   random_state=42)
print(x_train.shape)   # (3628, 110336)
print(x_test.shape)    # (908, 110336)
print(y_train.shape)   # (3628,)
print(y_test.shape)    # (908,)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Model setup
model = GradientBoostingClassifier(verbose=1, random_state=42)

parameters = {
    "learning_rate": sp_randFloat(),
    "subsample": sp_randFloat(),
    "n_estimators": sp_randInt(100, 1000),
    "max_depth": sp_randInt(4, 10)
}

randm = RandomizedSearchCV(estimator=model,
                           param_distributions=parameters,
                           cv=2,
                           n_iter=10,
                           n_jobs=-1)
randm.fit(x_train, y_train)

print(" Results from Random Search ")
print("The best estimator across ALL searched params:", randm.best_estimator_)
print("The best score across ALL searched params:", randm.best_score_)
print("The best parameters across ALL searched params:", randm.best_params_)
def run(self, trainingDataset, plotting):
    dataset = trainingDataset
    accuracy = 0
    train = dataset.copy()
    y = train['int_rate']
    train = train.drop(columns=[
        'int_rate',
    ])
    regressor = CatBoostRegressor(od_type='IncToDec')

    # split data 80/20
    X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.2)

    # grid search for optimal parameters
    print('Fitting')
    parameters = {
        'depth': sp_randInt(6, 10),
        'learning_rate': sp_randFloat(),
        'iterations': sp_randInt(600, 1000)
    }
    regressor = CatBoostRegressor(iterations=643,
                                  learning_rate=0.9600690303599169,
                                  depth=6,
                                  od_type='IncToDec')
    bestParams = None
    # regressor, bestParams = self.gridSearch(regressor, X_train, y_train)

    if plotting == True:
        print('Fitting Test Data')
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        print("###################################CatBoost#############################")
        print('MAE is: {}'.format(
            mean_absolute_error(np.exp(y_test), np.exp(y_pred))))
        accuracy = r2_score(y_test, y_pred)
        if bestParams != None:
            print(bestParams)
        # accuracy = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
    # predict test data
    else:
        regressor.fit(train, y)
        testData = pd.read_csv(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/SiameseTrainingData.csv")
        predictions = regressor.predict(testData)
        np.savetxt(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/CatBoostPredictions.csv",
            predictions, delimiter=",")
        testData = pd.read_csv(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/CleanedData/OverallTestingData.csv")
        predictions = regressor.predict(testData)
        np.savetxt(
            "./SiameseNeuralNetworkProject/MachineLearningAlgorithmSuite/OutputFiles/CatBoostPredictionsTestData.csv",
            predictions, delimiter=",")
    return accuracy
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from util import load_data, preprocess, mk_trainset, metric
from make_var import make_variable
from clustering import clustering

# Run the parameter code for the model you are going to train (or just run them all! : >)
# For the real run, set category=False and PCA=True in mk_trainset for every model except lightgbm.
# Set the model name and parameter names in random_search. (Also check the file name used to save
# parameters inside random_search so nothing gets overwritten!)
# Load saved parameters as shown at the very bottom.

n_estimators = [int(x) for x in range(10000, 50000, 5000)]
importance_type = ['split', 'gain']
lambda_l1 = sp_randFloat()
lambda_l2 = sp_randFloat()
max_depth = sp_randInt(3, 30)
depth = sp_randInt(3, 30)
min_child_samples = sp_randInt(1, 7)
min_data_in_leaf = sp_randInt(1, 7)
min_sum_hessian_in_leaf = sp_randInt(1, 10)
num_leaves = sp_randInt(10, 50)
bagging_fraction = sp_randFloat()
feature_fraction = sp_randFloat()
learning_rate = sp_randFloat()
max_bin = sp_randInt(low=0, high=30)
min_gain_to_split = sp_randFloat()
max_leaf_nodes = sp_randInt(10, 50)
min_samples_leaf = sp_randInt(2, 30)
min_samples_split = sp_randInt(2, 30)
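# Illustrative sketch only (not part of the original snippet): one way the distributions
# defined above could be wired into RandomizedSearchCV for a LightGBM regressor. The
# project's own random_search helper is not shown here, so this is not its actual
# implementation; LGBMRegressor and the commented-out X_train/y_train are assumptions.
from lightgbm import LGBMRegressor

lgbm_param_distributions = {
    'n_estimators': n_estimators,
    'learning_rate': learning_rate,
    'max_depth': max_depth,
    'num_leaves': num_leaves,
    'min_child_samples': min_child_samples,
    'reg_alpha': lambda_l1,    # sklearn-wrapper alias of lambda_l1
    'reg_lambda': lambda_l2,   # sklearn-wrapper alias of lambda_l2
}

lgbm_search = RandomizedSearchCV(estimator=LGBMRegressor(),
                                 param_distributions=lgbm_param_distributions,
                                 n_iter=10,
                                 cv=3,
                                 n_jobs=-1)
# lgbm_search.fit(X_train, y_train)   # hypothetical data, e.g. produced by mk_trainset
# lgbm_search.best_params_ could then be saved to disk and reloaded later,
# as the notes above describe.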
# Visualize distribution of target attribute-
num, bins, patches = plt.hist(y, bins=int(np.ceil(np.sqrt(y.size))))
plt.show()

# Visualize distributions of all numeric attributes in features-
sns.boxplot(data=X_df)
plt.title("Pima diabetes: Boxplot distribution - numeric columns")
plt.show()

# Initialize a GradientBoostingRegressor model-
gbr = GradientBoostingRegressor()

# Specify parameters for hyper-parameter tuning-
parameters = {
    'learning_rate': sp_randFloat(),
    'subsample': sp_randFloat(),
    'n_estimators': sp_randInt(100, 1000),
    'max_depth': sp_randInt(4, 10)
}

'''
RandomizedSearchCV parameters-

1. estimator: here, we input the metric or the model for which we need to optimize the parameters.

2. param_distributions: here, we have to pass the dictionary of parameters that we need to optimize.

3. cv: here, we have to pass an integer value signifying the number of splits needed for cross validation. By default it's 5.
import catboost as cb
from scipy.stats import randint as sp_randInt
from scipy.stats import uniform as sp_randFloat
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Integer, Real

SCORING_LIST = ["accuracy", "roc_auc", "f1"]

XGBOOST_RANDOMSEARCH_PARAMS = {
    "silent": [False],
    "max_depth": sp_randInt(6, 20),
    # note: scipy's uniform(loc, scale) samples from [loc, loc + scale), i.e. roughly 0.01-0.31 here
    "learning_rate": sp_randFloat(0.01, 0.3),
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    "gamma": [0, 0.25, 0.5, 1.0],
    "reg_lambda": [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
    "n_estimators": [200],
}

XGBOOST_BAYESSEARCH_PARAMS = {
    "silent": [False],
    "max_depth": Integer(6, 20),
    "learning_rate": Real(0.01, 0.3),
    "subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "colsample_bylevel": [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "min_child_weight": [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
    "gamma": [0, 0.25, 0.5, 1.0],