def gb_grid_search(train_seed, cv_generator, cv_args):
    """
    GradientBoosting
    """
    _log_path = grid_search_log_path + 'gb_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)

    parameters = {
        'criterion': 'friedman_mse',
        'init': None,
        'learning_rate': 0.05,
        'loss': 'deviance',
        'max_depth': 25,
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_impurity_split': None,
        'min_samples_leaf': 50,
        'min_samples_split': 1000,
        'min_weight_fraction_leaf': 0.0,
        'n_estimators': 200,
        'presort': 'auto',
        'random_state': train_seed,
        'subsample': 0.8,
        'verbose': 2,
        'warm_start': False
    }

    GB = GradientBoosting(x_train, y_train, x_test, id_test)
    reg = GB.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        'n_estimators': (20, 50, 100),
        'learning_rate': (0.05, 0.2, 0.5),
        'max_depth': (5, 10, 15),
        'max_features': (6, 8, 10),
        'min_samples_leaf': (300, 400, 500),
        'min_samples_split': (3000, 4000, 5000),
        'subsample': (0.6, 0.8, 1)
    }

    SKLearnGridSearch.grid_search(_log_path, x_train, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('GradientBoosting', parameters, parameters_grid)
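# For reference, a minimal sketch of what a helper such as
# SKLearnGridSearch.grid_search plausibly does, assuming it wraps sklearn's
# GridSearchCV: build a CV splitter from `cv_generator`/`cv_args`, run the
# exhaustive search over `params_grid`, and log the best setting under the
# model-specific log prefix. The name `_grid_search_sketch` and its body are
# illustrative assumptions, not this repo's actual implementation.
def _grid_search_sketch(log_path, x_train, y_train, reg,
                        params=None, params_grid=None,
                        cv_generator=None, cv_args=None):
    from sklearn.model_selection import GridSearchCV
    cv = cv_generator(**cv_args)  # assumed to return an sklearn-style splitter
    grid = GridSearchCV(estimator=reg, param_grid=params_grid,
                        cv=cv, n_jobs=-1)
    grid.fit(x_train, y_train)
    # Append the best result to the model-specific log prefix.
    with open(log_path + 'result.log', 'a') as f:
        f.write('best_params: {}\nbest_score: {}\n'.format(
            grid.best_params_, grid.best_score_))
    return grid.best_params_, grid.best_score_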
def et_grid_search(train_seed, cv_generator, cv_args):
    """
    Extra Trees
    """
    _log_path = grid_search_log_path + 'et_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)

    parameters = {
        'bootstrap': True,
        'n_estimators': 50,
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': 25,
        'max_features': 'auto',
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_samples_leaf': 50,
        'min_samples_split': 1000,
        'min_weight_fraction_leaf': 0.0,
        'n_jobs': -1,
        'oob_score': True,
        'random_state': train_seed,
        'verbose': 2,
        'warm_start': False
    }

    ET = ExtraTrees(x_train, y_train, x_test, id_test)
    reg = ET.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        'n_estimators': (30, 40, 50),
        'max_depth': (5, 6),
        'max_features': (6, 7),
        'min_samples_leaf': (200, 250, 300),
        'min_samples_split': (3000, 3500, 4000)
    }

    SKLearnGridSearch.grid_search(_log_path, x_train, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('Extra Trees', parameters, parameters_grid)
def rf_grid_search(train_seed, cv_generator, cv_args):
    """
    Random Forest
    """
    _log_path = grid_search_log_path + 'rf_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)

    parameters = {
        'n_estimators': 32,
        'bootstrap': True,
        'class_weight': None,
        'criterion': 'gini',
        'max_depth': 6,
        'max_features': 7,
        'max_leaf_nodes': None,
        'min_impurity_decrease': 0.0,
        'min_samples_leaf': 300,
        'min_samples_split': 4000,
        'min_weight_fraction_leaf': 0.0,
        'n_jobs': -1,
        'oob_score': True,
        'random_state': train_seed,
        'verbose': 2,
        'warm_start': False
    }

    RF = RandomForest(x_train, y_train, x_test, id_test)
    reg = RF.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        # 'n_estimators': (30, 31, 32),
        'max_depth': (2, 3),
        # 'max_features': (6, 7),
        'min_samples_leaf': (286, 287),
        'min_samples_split': (3972, 3974, 3976, 3978)
    }

    SKLearnGridSearch.grid_search(_log_path, x_train, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('Random Forest', parameters, parameters_grid)
def lr_grid_search(train_seed, cv_generator, cv_args):
    """
    Logistic Regression
    """
    _log_path = grid_search_log_path + 'lr_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)

    parameters = {
        'C': 1.0,
        'class_weight': None,
        'dual': False,
        'fit_intercept': True,
        'intercept_scaling': 1,
        'max_iter': 100,
        'multi_class': 'multinomial',
        'n_jobs': -1,
        'penalty': 'l2',
        'solver': 'sag',
        'tol': 0.0001,
        'random_state': train_seed,
        'verbose': 2,
        'warm_start': False
    }

    LR = LRegression(x_train, y_train, x_test, id_test)
    reg = LR.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        'C': (0.2, 0.5, 1),
        'max_iter': (50, 100, 200),
        'tol': (0.001, 0.005, 0.01)
    }

    SKLearnGridSearch.grid_search(_log_path, x_train, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('Logistic Regression', parameters, parameters_grid)
def lgb_grid_search(train_seed, cv_generator, cv_args):
    """
    LightGBM
    """
    _log_path = grid_search_log_path + 'lgb_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)
    x_g_train, x_g_test = utils.load_preprocessed_data_g(
        preprocessed_data_path)

    parameters = {
        'learning_rate': 0.006,
        'boosting_type': 'gbdt',   # traditional Gradient Boosting Decision Tree
        # 'boosting_type': 'dart', # Dropouts meet Multiple Additive Regression Trees
        # 'boosting_type': 'goss', # Gradient-based One-Side Sampling
        # 'boosting_type': 'rf',   # Random Forest
        'num_leaves': 3,           # < 2^(max_depth)
        'max_depth': 8,            # default=-1
        'n_estimators': 79,
        'max_bin': 1005,
        'subsample_for_bin': 1981,
        'objective': 'binary',
        'min_split_gain': 0.,
        'min_child_weight': 1,
        'min_child_samples': 0,
        'subsample': 0.723,
        'subsample_freq': 3,
        'colsample_bytree': 0.11,
        'reg_alpha': 0.,
        'reg_lambda': 0.,
        'silent': False,
        'seed': train_seed
    }

    LGB = SKLearnLightGBM(x_g_train, y_train, x_g_test, id_test)
    reg = LGB.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        'learning_rate': (0.002, 0.005, 0.01),
        'n_estimators': (30, 60, 90),
        'num_leaves': (32, 64, 128),   # < 2^(max_depth)
        'colsample_bytree': (0.6, 0.8, 0.1),
        'max_depth': (6, 8, 10),       # default=-1
        # 'min_data_in_leaf': 20,      # default=20
        # 'bagging_fraction': (0.5, 0.7, 0.9),
        # 'feature_fraction': (0.5, 0.7, 0.9),
        # 'subsample_for_bin': (50000, 100000, 150000),
        # 'subsample_freq': (4, 6, 8),
        # 'subsample': (0.6, 0.8, 1.0),
        # 'max_bin': (255, 355, 455)
    }

    SKLearnGridSearch.grid_search(_log_path, x_train, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('LightGBM', parameters, parameters_grid)
def xgb_grid_search(train_seed, cv_generator, cv_args):
    """
    XGBoost
    """
    _log_path = grid_search_log_path + 'xgb_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)

    parameters = {
        'objective': 'binary:logistic',
        'learning_rate': 0.002,
        'n_estimators': 100,
        'max_depth': 9,
        'min_child_weight': 5,
        'max_delta_step': 0,
        'silent': False,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'colsample_bylevel': 1,
        'base_score': 0.5,
        'gamma': 0,
        'reg_alpha': 0,
        'reg_lambda': 0,
        'nthread': -1,
        'seed': train_seed
        # 'missing': None,
        # 'scale_pos_weight': 1,
    }

    XGB = SKLearnXGBoost(x_train, y_train, x_test, id_test)
    reg = XGB.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        'learning_rate': (0.002, 0.005, 0.01),
        'n_estimators': (20, 50, 100, 150),
        'max_depth': (5, 7, 9),
        # 'subsample': 0.8,
        # 'colsample_bytree': 0.8,
        # 'colsample_bylevel': 1,
        # 'gamma': 0,
        # 'min_child_weight': 1,
        # 'max_delta_step': 0,
        # 'base_score': 0.5,
        # 'reg_alpha': 0,
        # 'reg_lambda': 0,
    }

    SKLearnGridSearch.grid_search(_log_path, x_train, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('XGBoost', parameters, parameters_grid)
def stack_lgb_grid_search(train_seed, cv_generator, cv_args):
    """
    Stacked-Layer LightGBM
    """
    _log_path = grid_search_log_path + 'stk_lgb_'

    x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
        preprocessed_data_path)
    x_g_train, x_g_test = utils.load_preprocessed_data_g(
        preprocessed_data_path)
    blender_x_tree, blender_test_tree, blender_x_g_tree, blender_test_g_tree \
        = utils.load_stacked_data(stack_output_path + 'l2_')

    g_train = x_g_train[:, -1]
    x_train_reuse = x_train[:, :88]

    print('------------------------------------------------------')
    print('Stacking Reused Features of Train Set...')
    # Append the reused raw features and the group column to the stacked outputs.
    blender_x_tree = np.concatenate((blender_x_tree, x_train_reuse), axis=1)
    blender_x_g_tree = np.column_stack((blender_x_tree, g_train))

    parameters = {
        'learning_rate': 0.006,
        'boosting_type': 'gbdt',   # traditional Gradient Boosting Decision Tree
        # 'boosting_type': 'dart', # Dropouts meet Multiple Additive Regression Trees
        # 'boosting_type': 'goss', # Gradient-based One-Side Sampling
        # 'boosting_type': 'rf',   # Random Forest
        'num_leaves': 3,           # < 2^(max_depth)
        'max_depth': 8,            # default=-1
        'n_estimators': 79,
        'max_bin': 1005,
        'subsample_for_bin': 1981,
        'objective': 'regression',
        'min_split_gain': 0.,
        'min_child_weight': 1,
        'min_child_samples': 0,
        'subsample': 0.723,
        'subsample_freq': 3,
        'colsample_bytree': 0.11,
        'reg_alpha': 0.,
        'reg_lambda': 0.,
        'silent': False,
        'random_state': train_seed
    }

    LGB = SKLearnLightGBM(blender_x_g_tree, y_train, blender_test_g_tree, id_test)
    reg = LGB.get_reg(parameters)

    # parameters_grid = None
    parameters_grid = {
        'learning_rate': (0.002, 0.005, 0.01),
        'n_estimators': (30, 60, 90),
        'num_leaves': (32, 64, 128),   # < 2^(max_depth)
        'colsample_bytree': (0.6, 0.8, 0.1),
        'max_depth': (6, 8, 10),       # default=-1
        # 'min_data_in_leaf': 20,      # default=20
        # 'bagging_fraction': (0.5, 0.7, 0.9),
        # 'feature_fraction': (0.5, 0.7, 0.9),
        # 'subsample_for_bin': (50000, 100000, 150000),
        # 'subsample_freq': (4, 6, 8),
        # 'subsample': (0.6, 0.8, 1.0),
        # 'max_bin': (255, 355, 455)
    }

    # Search over the stacked feature matrix the regressor was built with,
    # not the raw x_train.
    SKLearnGridSearch.grid_search(_log_path, blender_x_g_tree, y_train, reg,
                                  params=parameters,
                                  params_grid=parameters_grid,
                                  cv_generator=cv_generator,
                                  cv_args=cv_args)

    utils.print_grid_info('Stack LightGBM', parameters, parameters_grid)
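# A minimal usage sketch, assuming `cv_generator` is any callable that returns
# an sklearn-style splitter when called with `cv_args` (KFold is used here as
# a stand-in; the project may supply its own generator). The seed and fold
# settings below are illustrative values, not ones taken from this repo.
if __name__ == '__main__':
    from sklearn.model_selection import KFold
    example_seed = 2017  # assumed fixed seed for reproducibility
    example_cv_args = {'n_splits': 5, 'shuffle': True,
                       'random_state': example_seed}
    # Run one search at a time; each writes under its own log prefix.
    xgb_grid_search(example_seed, KFold, example_cv_args)
    # lgb_grid_search(example_seed, KFold, example_cv_args)
    # rf_grid_search(example_seed, KFold, example_cv_args)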