Example #1
    def gb_grid_search(train_seed, cv_generator, cv_args):
        """
            GradientBoosting
        """
        _log_path = grid_search_log_path + 'gb_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)

        parameters = {
            'criterion': 'friedman_mse',
            'init': None,
            'learning_rate': 0.05,
            'loss': 'deviance',
            'max_depth': 25,
            'max_features': 'auto',
            'max_leaf_nodes': None,
            'min_impurity_decrease': 0.0,
            'min_impurity_split': None,
            'min_samples_leaf': 50,
            'min_samples_split': 1000,
            'min_weight_fraction_leaf': 0.0,
            'n_estimators': 200,
            'presort': 'auto',
            'random_state': train_seed,
            'subsample': 0.8,
            'verbose': 2,
            'warm_start': False
        }

        GB = GradientBoosting(x_train, y_train, x_test, id_test)

        reg = GB.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            'n_estimators': (20, 50, 100),
            'learning_rate': (0.05, 0.2, 0.5),
            'max_depth': (5, 10, 15),
            'max_features': (6, 8, 10),
            'min_samples_leaf': (300, 400, 500),
            'min_samples_split': (3000, 4000, 5000),
            'subsample': (0.6, 0.8, 1)
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      x_train,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('GradientBoosting', parameters, parameters_grid)
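
SKLearnGridSearch.grid_search is this repo's own wrapper, so its internals are not shown on this page. A minimal sketch of what an equivalent search could look like with scikit-learn's GridSearchCV, assuming the wrapper simply applies the fixed parameters and then searches params_grid (the helper name, the n_jobs setting, and the omission of logging are all illustrative, not taken from the repo):

    from sklearn.model_selection import GridSearchCV

    def grid_search_sketch(x_train, y_train, reg, params, params_grid, cv):
        # Fix the baseline parameters first; GridSearchCV then overrides
        # only the keys listed in params_grid for each candidate.
        reg.set_params(**params)
        searcher = GridSearchCV(reg, params_grid, cv=cv, n_jobs=-1)
        searcher.fit(x_train, y_train)
        print('Best CV score:', searcher.best_score_)
        print('Best parameters:', searcher.best_params_)
        return searcher.best_estimator_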
Example #2
    def et_grid_search(train_seed, cv_generator, cv_args):
        """
            Extra Trees
        """
        _log_path = grid_search_log_path + 'et_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)

        parameters = {
            'bootstrap': True,
            'n_estimators': 50,
            'class_weight': None,
            'criterion': 'gini',
            'max_depth': 25,
            'max_features': 'auto',
            'max_leaf_nodes': None,
            'min_impurity_decrease': 0.0,
            'min_samples_leaf': 50,
            'min_samples_split': 1000,
            'min_weight_fraction_leaf': 0.0,
            'n_jobs': -1,
            'oob_score': True,
            'random_state': train_seed,
            'verbose': 2,
            'warm_start': False
        }

        ET = ExtraTrees(x_train, y_train, x_test, id_test)

        reg = ET.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            'n_estimators': (30, 40, 50),
            'max_depth': (5, 6),
            'max_features': (6, 7),
            'min_samples_leaf': (200, 250, 300),
            'min_samples_split': (3000, 3500, 4000)
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      x_train,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('Extra Trees', parameters, parameters_grid)
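
One detail worth flagging in this parameter set: scikit-learn's ExtraTreesClassifier defaults to bootstrap=False, and out-of-bag scoring is only defined when bootstrapping is enabled, so bootstrap and oob_score have to be switched on together, as the example does. A minimal standalone illustration:

    from sklearn.ensemble import ExtraTreesClassifier

    # oob_score=True without bootstrap=True raises an error, because
    # there are no out-of-bag samples left to score on.
    et = ExtraTreesClassifier(n_estimators=50, bootstrap=True, oob_score=True)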
Example #3
    def rf_grid_search(train_seed, cv_generator, cv_args):
        """
            Random Forest
        """
        _log_path = grid_search_log_path + 'rf_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)

        parameters = {
            'n_estimators': 32,
            'bootstrap': True,
            'class_weight': None,
            'criterion': 'gini',
            'max_depth': 6,
            'max_features': 7,
            'max_leaf_nodes': None,
            'min_impurity_decrease': 0.0,
            'min_samples_leaf': 300,
            'min_samples_split': 4000,
            'min_weight_fraction_leaf': 0.0,
            'n_jobs': -1,
            'oob_score': True,
            'random_state': train_seed,
            'verbose': 2,
            'warm_start': False
        }

        RF = RandomForest(x_train, y_train, x_test, id_test)

        reg = RF.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            # 'n_estimators': (30, 31, 32),
            'max_depth': (2, 3),
            # 'max_features': (6, 7),
            'min_samples_leaf': (286, 287),
            'min_samples_split': (3972, 3974, 3976, 3978)
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      x_train,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('Random Forest', parameters, parameters_grid)
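
The very fine steps here (min_samples_split values only a few units apart) make it easy to misjudge search cost: the number of model fits is the product of the grid sizes times the number of CV folds. A quick count for the grid above (the 5-fold figure is an assumed example, since the fold count is decided by cv_generator/cv_args):

    from itertools import product

    parameters_grid = {
        'max_depth': (2, 3),
        'min_samples_leaf': (286, 287),
        'min_samples_split': (3972, 3974, 3976, 3978)
    }
    n_candidates = len(list(product(*parameters_grid.values())))
    n_folds = 5  # assumption: actual value depends on cv_args
    print(n_candidates, 'candidates ->', n_candidates * n_folds, 'fits')
    # 16 candidates -> 80 fits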
Example #4
    def lr_grid_search(train_seed, cv_generator, cv_args):
        """
            Logistic Regression
        """
        _log_path = grid_search_log_path + 'lr_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)

        parameters = {
            'C': 1.0,
            'class_weight': None,
            'dual': False,
            'fit_intercept': True,
            'intercept_scaling': 1,
            'max_iter': 100,
            'multi_class': 'multinomial',
            'n_jobs': -1,
            'penalty': 'l2',
            'solver': 'sag',
            'tol': 0.0001,
            'random_state': train_seed,
            'verbose': 2,
            'warm_start': False
        }

        LR = LRegression(x_train, y_train, x_test, id_test)

        reg = LR.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            'C': (0.2, 0.5, 1),
            'max_iter': (50, 100, 200),
            'tol': (0.001, 0.005, 0.01)
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      x_train,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('Logistic Regression', parameters,
                              parameters_grid)
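
Note that the 'sag' solver only supports the 'l2' penalty, which the fixed parameters respect. How cv_generator and cv_args are consumed is internal to SKLearnGridSearch, but the signature suggests the splitter class and its constructor keywords are passed separately; a hypothetical call using scikit-learn's StratifiedKFold (all argument values here are made up):

    from sklearn.model_selection import StratifiedKFold

    # Hypothetical invocation: the splitter class goes in as cv_generator
    # and its constructor keywords as cv_args.
    lr_grid_search(train_seed=2017,
                   cv_generator=StratifiedKFold,
                   cv_args={'n_splits': 5, 'shuffle': True, 'random_state': 2017})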
Example #5
    def lgb_grid_search(train_seed, cv_generator, cv_args):
        """
            LightGBM
        """
        _log_path = grid_search_log_path + 'lgb_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)
        x_g_train, x_g_test = utils.load_preprocessed_data_g(
            preprocessed_data_path)

        parameters = {
            'learning_rate': 0.006,
            'boosting_type': 'gbdt',  # traditional Gradient Boosting Decision Tree.
            # 'boosting_type': 'dart',        # Dropouts meet Multiple Additive Regression Trees.
            # 'boosting_type': 'goss',        # Gradient-based One-Side Sampling.
            # 'boosting_type': 'rf',          # Random Forest.
            'num_leaves': 3,  # <2^(max_depth)
            'max_depth': 8,  # default=-1
            'n_estimators': 79,
            'max_bin': 1005,
            'subsample_for_bin': 1981,
            'objective': 'binary',
            'min_split_gain': 0.,
            'min_child_weight': 1,
            'min_child_samples': 0,
            'subsample': 0.723,
            'subsample_freq': 3,
            'colsample_bytree': 0.11,
            'reg_alpha': 0.,
            'reg_lambda': 0.,
            'silent': False,
            'seed': train_seed
        }

        LGB = SKLearnLightGBM(x_g_train, y_train, x_g_test, id_test)

        reg = LGB.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            'learning_rate': (0.002, 0.005, 0.01),
            'n_estimators': (30, 60, 90),
            'num_leaves': (32, 64, 128),  # <2^(max_depth)
            'colsample_bytree': (0.6, 0.8, 1.0),
            'max_depth': (6, 8, 10),  # default=-1
            # 'min_data_in_leaf': 20,                  # default=20
            # 'bagging_fraction': (0.5, 0.7, 0.9),
            # 'feature_fraction': (0.5, 0.7, 0.9),
            # 'subsample_for_bin': (50000, 100000, 150000),
            # 'subsample_freq': (4, 6, 8),
            # 'subsample': (0.6, 0.8, 1.0),
            # 'max_bin': (255, 355, 455)
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      x_train,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('LightGBM', parameters, parameters_grid)
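
The inline comment num_leaves < 2^(max_depth) is LightGBM's rule of thumb for keeping a leaf-wise tree no larger than a depth-wise tree of the same depth, and parts of the grid above actually break it. A quick check over the searched combinations:

    for num_leaves in (32, 64, 128):
        for max_depth in (6, 8, 10):
            # For combinations such as (64, 6) and (128, 6) the leaf cap
            # stops binding; max_depth alone then limits the tree.
            print(num_leaves, max_depth, num_leaves < 2 ** max_depth)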
Example #6
    def xgb_grid_search(train_seed, cv_generator, cv_args):
        """
            XGBoost
        """
        _log_path = grid_search_log_path + 'xgb_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)

        parameters = {
            'objective': 'binary:logistic',
            'learning_rate': 0.002,
            'n_estimators': 100,
            'max_depth': 9,
            'min_child_weight': 5,
            'max_delta_step': 0,
            'silent': False,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'colsample_bylevel': 1,
            'base_score': 0.5,
            'gamma': 0,
            'reg_alpha': 0,
            'reg_lambda': 0,
            'nthread': -1,
            'seed': train_seed
            # 'missing': None,
            # 'nthread': -1,
            # 'scale_pos_weight': 1,
        }

        XGB = SKLearnXGBoost(x_train, y_train, x_test, id_test)

        reg = XGB.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            'learning_rate': (0.002, 0.005, 0.01),
            'n_estimators': (20, 50, 100, 150),
            'max_depth': (5, 7, 9),
            # 'subsample': 0.8,
            # 'colsample_bytree': 0.8,
            # 'colsample_bylevel': 1,
            # 'gamma': 0,
            # 'min_child_weight': 1,
            # 'max_delta_step': 0,
            # 'base_score': 0.5,
            # 'reg_alpha': 0,
            # 'reg_lambda': 0,
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      x_train,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('XGBoost', parameters, parameters_grid)
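
SKLearnXGBoost.get_reg is repo-internal, but the parameter names above map one-to-one onto XGBoost's scikit-learn wrapper of that era (seed and nthread were later renamed random_state and n_jobs). A guess, not the repo's confirmed implementation, at the equivalent construction:

    from xgboost import XGBClassifier

    # Assumption: get_reg builds the sklearn wrapper directly from the dict.
    reg = XGBClassifier(**parameters)
    reg.fit(x_train, y_train)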
Example #7
    def stack_lgb_grid_search(train_seed, cv_generator, cv_args):
        """
            Stacked LightGBM
        """
        _log_path = grid_search_log_path + 'stk_lgb_'

        x_train, y_train, x_test, id_test = utils.load_preprocessed_data(
            preprocessed_data_path)
        x_g_train, x_g_test = utils.load_preprocessed_data_g(
            preprocessed_data_path)
        blender_x_tree, blender_test_tree, blender_x_g_tree, blender_test_g_tree \
            = utils.load_stacked_data(stack_output_path + 'l2_')

        g_train = x_g_train[:, -1]
        x_train_reuse = x_train[:, :88]

        print('------------------------------------------------------')
        print('Stacking Reused Features of Train Set...')
        blender_x_tree = np.concatenate((blender_x_tree, x_train_reuse),
                                        axis=1)
        blender_x_g_tree = np.column_stack((blender_x_tree, g_train))

        parameters = {
            'learning_rate': 0.006,
            'boosting_type': 'gbdt',  # traditional Gradient Boosting Decision Tree.
            # 'boosting_type': 'dart',        # Dropouts meet Multiple Additive Regression Trees.
            # 'boosting_type': 'goss',        # Gradient-based One-Side Sampling.
            # 'boosting_type': 'rf',          # Random Forest.
            'num_leaves': 3,  # <2^(max_depth)
            'max_depth': 8,  # default=-1
            'n_estimators': 79,
            'max_bin': 1005,
            'subsample_for_bin': 1981,
            'objective': 'regression',
            'min_split_gain': 0.,
            'min_child_weight': 1,
            'min_child_samples': 0,
            'subsample': 0.723,
            'subsample_freq': 3,
            'colsample_bytree': 0.11,
            'reg_alpha': 0.,
            'reg_lambda': 0.,
            'silent': False,
            'random_state': train_seed
        }

        LGB = SKLearnLightGBM(blender_x_g_tree, y_train, blender_test_g_tree,
                              id_test)

        reg = LGB.get_reg(parameters)

        # parameters_grid = None

        parameters_grid = {
            'learning_rate': (0.002, 0.005, 0.01),
            'n_estimators': (30, 60, 90),
            'num_leaves': (32, 64, 128),  # <2^(max_depth)
            'colsample_bytree': (0.6, 0.8, 1.0),
            'max_depth': (6, 8, 10),  # default=-1
            # 'min_data_in_leaf': 20,                  # default=20
            # 'bagging_fraction': (0.5, 0.7, 0.9),
            # 'feature_fraction': (0.5, 0.7, 0.9),
            # 'subsample_for_bin': (50000, 100000, 150000),
            # 'subsample_freq': (4, 6, 8),
            # 'subsample': (0.6, 0.8, 1.0),
            # 'max_bin': (255, 355, 455)
        }

        SKLearnGridSearch.grid_search(_log_path,
                                      blender_x_g_tree,
                                      y_train,
                                      reg,
                                      params=parameters,
                                      params_grid=parameters_grid,
                                      cv_generator=cv_generator,
                                      cv_args=cv_args)

        utils.print_grid_info('Stacked LightGBM', parameters, parameters_grid)
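
The feature-reuse step in this example is plain array surgery: the level-2 blender outputs are widened with the first 88 raw features, then the group column is appended last. A toy illustration of the shapes involved (all sizes invented except the 88 reused columns, which come from the code):

    import numpy as np

    blender_x_tree = np.zeros((1000, 5))    # level-2 out-of-fold predictions
    x_train_reuse = np.zeros((1000, 88))    # first 88 raw training features
    g_train = np.zeros(1000)                # group column from x_g_train

    blender_x_tree = np.concatenate((blender_x_tree, x_train_reuse), axis=1)
    blender_x_g_tree = np.column_stack((blender_x_tree, g_train))
    print(blender_x_tree.shape, blender_x_g_tree.shape)  # (1000, 93) (1000, 94)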