コード例 #1
0
def lightgbm(X_train, y_train, X_test, y_test):
    """Train a LightGBM regressor and return predictions plus timings.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets. ``y_test`` is unused here
        but kept so the signature matches sibling model-benchmark helpers.

    Returns
    -------
    tuple
        (pred_train, pred_test, time_train, time_test, feature_importances_)
    """
    reg = LGBMModel(objective='regression')

    # Time only the training call.
    start = time.time()
    reg.fit(X_train, y_train)
    time_train = time.time() - start

    pred_train = reg.predict(X_train)

    # Time only the test-set inference.
    start = time.time()
    pred_test = reg.predict(X_test)
    time_test = time.time() - start

    return pred_train, pred_test, time_train, time_test, reg.feature_importances_
コード例 #2
0
        def objective(
                num_leaves,
                scale_pos_weight,
                min_child_samples,
                bin_construct_sample_cnt,
                max_bin,
                min_sum_hessian_in_leaf,
                max_depth,
                min_split_gain,
                min_child_weight,
        ):
            """Cross-validated binary log-loss for one hyper-parameter set.

            Called by the surrounding optimizer with continuous suggestions;
            integer-valued parameters are rounded before being handed to
            LightGBM. Returns the mean validation log-loss over the
            stratified folds, or a large penalty (999.99) when evaluation
            fails, steering the optimizer away from the failing region.
            """
            try:
                scores = []

                params = {
                    # The optimizer proposes floats; these parameters must
                    # be integers for LightGBM.
                    'num_leaves': int(round(num_leaves, ndigits=0)),
                    'scale_pos_weight': scale_pos_weight,
                    'min_child_samples': int(round(min_child_samples, ndigits=0)),
                    'bin_construct_sample_cnt': int(round(bin_construct_sample_cnt, ndigits=0)),
                    'max_bin': int(round(max_bin, ndigits=0)),
                    'min_sum_hessian_in_leaf': min_sum_hessian_in_leaf,

                    'max_depth': int(round(max_depth, ndigits=0)),
                    'min_split_gain': min_split_gain,
                    'min_child_weight': min_child_weight,

                    'n_jobs': self.n_jobs,
                    'silent': self.verbose < 1,
                    'random_state': self.random_state}

                # User-pinned parameters override the optimized ones.
                if isinstance(self.fixed_parameters, dict):
                    params.update(self.fixed_parameters)

                if self.use_gpu:
                    params.update({'device': 'gpu',
                                   'gpu_platform_id': 1,
                                   'gpu_device_id': 0})

                skf = StratifiedKFold(
                    self.n_folds, shuffle=self.shuffle, random_state=self.random_state)

                for train_index, valid_index in skf.split(x, y):

                    x_train, y_train = x[train_index, :], y[train_index]
                    x_valid, y_valid = x[valid_index, :], y[valid_index]

                    params['objective'] = 'binary'
                    gbm = LGBMModel(**params)

                    gbm.fit(x_train, y_train,
                            eval_set=[(x_valid, y_valid)],
                            early_stopping_rounds=self.early_stopping_rounds,
                            verbose=int(self.verbose > 0))

                    y_valid_hat = gbm.predict(x_valid, num_iteration=gbm.best_iteration_)

                    loss_valid = log_loss(y_valid, y_valid_hat)

                    scores.append(loss_valid)

                result = np.mean(scores)

                # Keep a history of every evaluated configuration.
                self.iterations.append((params, result))

                return result

            # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit, making a long optimization run impossible to
            # cancel. Catch Exception only; still return a penalty value so
            # the optimizer treats the configuration as bad.
            except Exception:
                return 999.99
コード例 #3
0
ファイル: stack.py プロジェクト: lucasvenez/mult
class Stack(object):
    """Stacking ensemble: LightGBM, MLP, KNN and SVM base learners whose
    predicted probabilities feed a LightGBM meta-model.

    Each learner's hyper-parameters are tuned by its dedicated optimizer
    before fitting. ``fit`` trains the base learners first, then trains the
    meta-model on the (n_samples, 4) matrix of their predictions.
    """

    def __init__(self,
                 random_state=None,
                 test_size=0.2,
                 verbose=None,
                 optimization_n_call=50,
                 optimization_n_folds=2,
                 optimization_early_stopping_rounds=1,
                 optimization_shuffle=True):

        # Optimizer for the LightGBM meta-model (the stacker itself).
        self.opt = LightGBMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)

        # Optimizers for the four base learners.
        self.lgb_opt = LightGBMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)

        self.mlp_opt = MLPOptimizer(n_folds=optimization_n_folds,
                                    n_calls=optimization_n_call,
                                    shuffle=optimization_shuffle,
                                    n_jobs=-1)

        self.knn_opt = KNNOptimizer(n_folds=optimization_n_folds,
                                    n_calls=optimization_n_call,
                                    shuffle=optimization_shuffle,
                                    n_jobs=-1)

        self.svm_opt = SVMOptimizer(
            n_folds=optimization_n_folds,
            n_calls=optimization_n_call,
            early_stopping_rounds=optimization_early_stopping_rounds,
            shuffle=optimization_shuffle,
            n_jobs=-1)

        # Meta-model and base models, populated by fit().
        self.model = None

        self.lgb_model = None
        self.mlp_model = None
        self.knn_model = None
        self.svm_model = None

        self.random_state = random_state
        self.test_size = test_size
        self.verbose = verbose

    def stack_predict(self, x):
        """Return the (n_samples, 4) matrix of base-learner predictions.

        Columns: LightGBM prediction, then MLP/KNN/SVM positive-class
        probabilities.
        """
        # NOTE: the original printed each prediction's shape; those debug
        # leftovers were removed.
        lgb_y_hat = self.lgb_model.predict(
            x, num_iteration=self.lgb_model.best_iteration_)

        mlp_y_hat = self.mlp_model.predict_proba(x)[:, -1]
        knn_y_hat = self.knn_model.predict_proba(x)[:, -1]
        svm_y_hat = self.svm_model.predict_proba(x)[:, -1]

        return np.array([lgb_y_hat, mlp_y_hat, knn_y_hat, svm_y_hat]).T

    def fit(self, x, y, early_stopping_rounds=None):
        """Fit all base learners, then the meta-model on their stacked
        predictions.

        When early_stopping_rounds is a positive int, the meta-model is
        trained with a stratified hold-out validation split.
        """
        self.fit_lightgbm(x, y, early_stopping_rounds)
        self.fit_knn(x, y)
        self.fit_mlp(x, y, early_stopping_rounds)
        self.fit_svm(x, y)

        x_stack = self.stack_predict(x)

        print('fit stack')

        optimized_params = self.opt.optimize(x, y)
        optimized_params['objective'] = 'binary'

        self.model = LGBMModel(**optimized_params)

        # BUG FIX: the original early-stopping branch was copy-pasted from
        # fit_lightgbm and refit self.lgb_model on the raw features; the
        # meta-model must be trained on the stacked predictions instead.
        # Also pass early_stopping_rounds, which was previously ignored.
        if early_stopping_rounds is not None and early_stopping_rounds > 0:

            x_train, x_valid, y_train, y_valid = train_test_split(
                x_stack,
                y,
                stratify=y,
                shuffle=True,
                test_size=self.test_size,
                random_state=self.random_state)

            self.model.fit(x_train,
                           y_train,
                           eval_set=[(x_valid, y_valid)],
                           early_stopping_rounds=early_stopping_rounds,
                           verbose=self.verbose)

        else:
            self.model.fit(x_stack, y)

    def fit_lightgbm(self, x, y, early_stopping_rounds):
        """Tune and fit the LightGBM base learner."""
        print('fit lightgbm')

        optimized_params = self.lgb_opt.optimize(x, y)
        optimized_params['objective'] = 'binary'

        optimized_params['random_state'] = self.random_state
        optimized_params['n_jobs'] = -1

        self.lgb_model = LGBMModel(**optimized_params)

        # BUG FIX: the original fit the model on the full data and then fit
        # it AGAIN in both branches below, doubling training time for no
        # benefit. Fit exactly once; pass early_stopping_rounds, which was
        # previously accepted but never forwarded to LightGBM.
        if early_stopping_rounds is not None and early_stopping_rounds > 0:

            x_train, x_valid, y_train, y_valid = train_test_split(
                x,
                y,
                stratify=y,
                shuffle=True,
                test_size=self.test_size,
                random_state=self.random_state)

            self.lgb_model.fit(x_train,
                               y_train,
                               eval_set=[(x_valid, y_valid)],
                               early_stopping_rounds=early_stopping_rounds,
                               verbose=self.verbose)

        else:
            self.lgb_model.fit(x, y)

    def fit_svm(self, x, y):
        """Tune and fit the SVM base learner (probability outputs enabled)."""
        print('fit svm')

        optimized_params = self.svm_opt.optimize(x, y)

        optimized_params['random_state'] = self.random_state

        # probability=True is required for predict_proba in stack_predict.
        self.svm_model = SVC(**optimized_params, probability=True)

        self.svm_model.fit(x, y)

    def fit_mlp(self, x, y, early_stopping_rounds):
        """Tune and fit the MLP base learner."""
        print('fit mlp')

        optimized_params = self.mlp_opt.optimize(x, y)

        optimized_params['random_state'] = self.random_state

        # MLPClassifier only takes a boolean early-stopping switch; the
        # hold-out fraction reuses self.test_size.
        esr = early_stopping_rounds is not None and early_stopping_rounds > 0

        self.mlp_model = MLPClassifier(**optimized_params,
                                       early_stopping=esr,
                                       validation_fraction=self.test_size)

        self.mlp_model.fit(x, y)

    def fit_knn(self, x, y):
        """Tune and fit the KNN base learner."""
        print('fit knn')

        optimized_params = self.knn_opt.optimize(x, y)
        optimized_params['n_jobs'] = -1

        self.knn_model = KNeighborsClassifier(**optimized_params)

        self.knn_model.fit(x, y)

    def predict(self, x):
        """Predict with the meta-model on stacked base-learner outputs."""
        x_stack = self.stack_predict(x)

        return self.model.predict(x_stack,
                                  num_iteration=self.model.best_iteration_)
コード例 #4
0
class LGBMPredictor:
    """Train a LightGBM revenue regressor on zipped CSV datasets and write
    per-visitor aggregated predictions back as zipped CSVs.

    Pipeline: load() -> prepare_data() -> lgbm_train() -> predict() ->
    evaluate_val_prediction() / evaluate_test_prediction() -> write_to_csv().
    The target is trained in log1p space and predictions are expm1'd back.
    """

    def __init__(self):
        # All input/output files live under this directory.
        self.data_dir = '../../datasets'

        if not path.exists(self.data_dir):
            raise Exception(
                '{} directory not found.'.format(self.data_dir)
            )

        self.train_file = '{}/{}'.format(self.data_dir, 'train.zip')
        self.val_file = '{}/{}'.format(self.data_dir, 'val.zip')
        self.pred_val_file = '{}/{}'.format(
            self.data_dir, 'lgbm_pred_val.zip'
        )
        self.test_file = '{}/{}'.format(self.data_dir, 'test.zip')
        self.pred_test_file = '{}/{}'.format(
            self.data_dir, 'lgbm_pred_test.zip'
        )

    def load_data(self, zip_path):
        """Read one zipped CSV; fullVisitorId is kept as a string so that
        long ids are not truncated by numeric parsing."""
        df = pd.read_csv(
            zip_path,
            dtype={'fullVisitorId': 'str'},
            compression='zip'
        )

        rows, columns = df.shape

        print('\nLoaded {} rows with {} columns from {}.\n'.format(
            rows, columns, zip_path
        ))

        return df

    def load(self):
        """Load the train, validation and test dataframes."""
        print('Loading train data from {}'.format(self.train_file))
        self.train_df = self.load_data(self.train_file)

        print('Loading val data from {}'.format(self.val_file))
        self.val_df = self.load_data(self.val_file)

        print('Loading test data from {}'.format(self.test_file))
        self.test_df = self.load_data(self.test_file)

    def prepare_data(self):
        """Split visitor ids, (log-)targets and feature matrices out of the
        loaded dataframes."""
        train_df = self.train_df
        val_df = self.val_df
        test_df = self.test_df

        self.train_id = train_df['fullVisitorId'].values
        self.val_id = val_df['fullVisitorId'].values
        self.test_id = test_df['fullVisitorId'].values

        # Train in log space to tame the heavy-tailed revenue target.
        self.train_y = train_df['totals.transactionRevenue'].values
        self.train_log_y = np.log1p(self.train_y)

        self.val_y = val_df['totals.transactionRevenue'].values
        self.val_log_y = np.log1p(self.val_y)

        self.train_X = train_df.drop(
            ['totals.transactionRevenue', 'fullVisitorId'],
            axis=1
        )
        self.val_X = val_df.drop(
            ['totals.transactionRevenue', 'fullVisitorId'],
            axis=1
        )
        self.test_X = test_df.drop(['fullVisitorId'], axis=1)

        print('\nShape of the train dataset: {}'.format(self.train_X.shape))
        print('\nShape of the val dataset: {}'.format(self.val_X.shape))
        print('\nShape of the test dataset: {}\n'.format(self.test_X.shape))

    def lgbm_model(self):
        """Build and fit the LightGBM regressor with early stopping on the
        validation split."""
        self.model = LGBMModel(
            objective='regression',
            metric='rmse',
            n_estimators=1000,
            learning_rate=0.01,
            min_child_samples=100,
            bagging_fraction=0.7,
            feature_fraction=0.5,
            bagging_freq=5,
            bagging_seed=2020
        )

        self.model = self.model.fit(
            self.train_X,
            self.train_log_y,
            eval_set=(self.val_X, self.val_log_y),
            early_stopping_rounds=100,
            verbose=100
        )

    def lgbm_predict(self, X):
        """Predict with the best (early-stopped) iteration.

        BUG FIX: best_iteration_ was previously passed positionally, where
        it landed in the `raw_score` parameter of LGBMModel.predict; it must
        be passed as the num_iteration keyword.
        """
        return self.model.predict(X, num_iteration=self.model.best_iteration_)

    def lgbm_train(self):
        """Train the model (thin wrapper around lgbm_model)."""
        self.lgbm_model()

    def predict(self):
        # BUG FIX: these were stored as prev_val/prev_test while the
        # evaluation methods read pred_val/pred_test, crashing with
        # AttributeError. The attribute names are now consistent.
        self.pred_val = self.lgbm_predict(self.val_X)
        self.pred_test = self.lgbm_predict(self.test_X)

    def evaluate_val_prediction(self):
        """Aggregate validation predictions per visitor and compute RMSE in
        log space; stores pred_val_df and rsme_val."""
        pred_val = self.pred_val

        # Revenue cannot be negative; clip before expm1.
        pred_val[pred_val < 0] = 0
        pred_val_data = {
            'fullVisitorId': self.val_id,
            'transactionRevenue': self.val_y,
            'predictedRevenue': np.expm1(pred_val)
        }

        pred_val_df = pd.DataFrame(pred_val_data)

        pred_val_df = pred_val_df.groupby('fullVisitorId')
        # BUG FIX: tuple column selection (df['a', 'b']) is invalid in
        # modern pandas; select with a list of column labels.
        pred_val_df = pred_val_df[['transactionRevenue', 'predictedRevenue']]\
            .sum().reset_index()

        rsme_val = np.sqrt(
            mean_squared_error(
                np.log1p(pred_val_df['transactionRevenue'].values),
                np.log1p(pred_val_df['predictedRevenue'].values)
            )
        )

        # NOTE: 'rsme' is a historical misspelling of RMSE kept for
        # backward compatibility with existing callers.
        self.rsme_val = rsme_val
        # BUG FIX: was stored as prev_val_df while write_to_csv reads
        # pred_val_df.
        self.pred_val_df = pred_val_df

    def evaluate_test_prediction(self):
        """Aggregate test predictions per visitor; stores pred_test_df."""
        pred_test = self.pred_test

        pred_test[pred_test < 0] = 0

        pred_test_data = {
            'fullVisitorId': self.test_id,
            'predictedRevenue': np.expm1(pred_test)
        }

        pred_test_df = pd.DataFrame(pred_test_data)

        pred_test_df = pred_test_df.groupby('fullVisitorId')
        pred_test_df = pred_test_df['predictedRevenue'].sum().reset_index()

        self.pred_test_df = pred_test_df

    def write_to_csv(self):
        """Write the aggregated validation and test predictions as zipped
        CSVs."""
        self.pred_val_df.to_csv(
            self.pred_val_file,
            index=False,
            compression='zip'
        )

        self.pred_test_df.to_csv(
            self.pred_test_file,
            index=False,
            compression='zip'
        )