Ejemplo n.º 1
0
def lgb_evaluate(
        min_child_sample, num_leaves, max_bin,
        min_child_weight, subsample, subsample_freq,
        colsample_bytree, reg_alpha, reg_lambda,
):
    """Hyper-parameter objective for a LightGBM regressor.

    Prepares the data, carves out a hold-out split, trains a ``LgbWrapper``
    with the supplied hyper-parameters and returns its ``best_score`` so an
    outer optimizer can score this parameter combination.
    """
    target = 'visitors'
    features, categorical = features_set_f2()
    seed = 177
    full_data, n_train, n_test = data_preparation()

    # Positional indices of the categorical columns within the feature matrix.
    categorical_idx = [
        idx for idx, col in enumerate(full_data[features].columns)
        if col in categorical
    ]

    train_part = full_data[:n_train]
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_part[features].values,
        train_part[target].values,
        test_size=0.33,
        random_state=seed,
    )

    # Drop the big frames before training to keep peak memory down.
    del full_data, train_part
    gc.collect()

    lgb_params = {
        'objective': 'regression_l2',
        'metric': 'l2_root',
        'learning_rate': 0.1,
        'random_state': seed,
        'silent': True,  # does help
        'verbose_eval': False,
        'n_estimators': 500,
        # Integer-valued hyper-parameters arrive as floats from the optimizer;
        # round them back to the nearest integer.
        'min_child_samples': int(np.round(min_child_sample)),
        'num_leaves': int(np.round(num_leaves)),
        'max_bin': int(np.round(max_bin)),
        'subsample_freq': int(np.round(subsample_freq)),
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'cat_features': categorical_idx,
    }

    model = LgbWrapper(params=lgb_params)
    model.train(x_train, y_train, x_valid, y_valid)

    return model.best_score
Ejemplo n.º 2
0
def xgb_evaluate(min_child_weight, colsample_bytree, max_depth, subsample,
                 gamma, reg_alpha, reg_lambda, rate_drop):
    """Hyper-parameter objective for an XGBoost regressor.

    Prepares the data, carves out a hold-out split, trains an ``XgbWrapper``
    with the supplied hyper-parameters and returns its ``best_score`` so an
    outer optimizer can score this parameter combination.
    """
    target = 'visitors'
    features, _ = features_set_f2()
    seed = 177
    full_data, n_train, n_test = data_preparation()

    train_part = full_data[:n_train]
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_part[features].values,
        train_part[target].values,
        test_size=0.33,
        random_state=seed,
    )

    # Drop the big frames before training to keep peak memory down.
    del full_data, train_part
    gc.collect()

    xgb_params = {
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'eta': 0.1,
        'seed': seed,
        'silent': True,  # does help
        'verbose_eval': False,
        'nrounds': 500,
        # Integer-valued hyper-parameters arrive as floats from the optimizer;
        # round them back to the nearest integer.
        'max_depth': int(np.round(max_depth)),
        'min_child_weight': int(np.round(min_child_weight)),
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'gamma': gamma,
        # XGBoost spells the L1/L2 terms 'alpha' and 'lambda'.
        'alpha': reg_alpha,
        'lambda': reg_lambda,
        'rate_drop': rate_drop,
    }

    model = XgbWrapper(seed=seed, params=xgb_params)
    model.train(x_train, y_train, x_valid, y_valid)

    return model.best_score
Ejemplo n.º 3
0
import sys

# Make sibling project packages importable when running this file as a script.
sys.path.append("../")
from general.preprocess import data_preparation
from general.ClfWrappers import XgbWrapper
from general.utilities import sub_to_csv
from features.f2 import features_set_f2
from cv.cv_04 import cross_validate


# Regression target column and the feature set (with its categorical columns)
# provided by the project's feature-definition module.
TARGET = 'visitors'
FEATURES, CAT_FEATS = features_set_f2()
SEED = 177
print("Overfiting process initiating...")

# Fixed training configuration for the XGBoost run: squared-error objective
# evaluated with RMSE, a small learning rate compensated by a large round
# budget, and early stopping on the validation metric.
xgb_params = dict()
xgb_params['objective'] = 'reg:linear'
xgb_params['eval_metric'] = 'rmse'
xgb_params['eta'] = 0.02
xgb_params['seed'] = SEED
xgb_params['silent'] = True  # does help
xgb_params['verbose_eval'] = False
xgb_params['nrounds'] = 2000
xgb_params['early_stopping_rounds'] = 100

# Tree-structure / regularization hyper-parameters; the specific values were
# presumably picked by a prior tuning run — TODO confirm their provenance.
xgb_params['max_depth'] = 5
xgb_params['min_child_weight'] = 1.91
xgb_params['colsample_bytree'] = 0.920
xgb_params['subsample'] = 0.856
xgb_params['gamma'] = 0.718
xgb_params['alpha'] = 1.83
Ejemplo n.º 4
0
import pandas as pd
import xgboost as xgb

from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error

# NOTE(review): this snippet uses `sys` (below) and `glob` (further down)
# without importing them here — confirm they are imported elsewhere in the
# original module, otherwise this script fails with NameError.
sys.path.append("../")
from general.utilities import sub_to_csv_stacker
from general.preprocess import data_preparation
from general.ClfWrappers import Stacker, SklearnWrapper, XgbWrapper
from stacker.utilities import search_model, score_valid
from features.f2 import features_set_f2

# starts here
# Load the prepared data, the feature definitions, and the ground-truth
# target values for the training rows.
full_data, ntrain, ntest = data_preparation()
features, cat_features = features_set_f2()
tgt = pd.read_csv('../data/air_visit_data.csv').visitors.values
# First-level model outputs: out-of-fold validation predictions and the
# matching test-set submissions, restricted to files from the 'cv4' scheme.
trn_list = [x for x in glob.glob('../valid/*.csv') if 'cv4' in x]
tst_list = [x for x in glob.glob('../submission/*.csv') if 'cv4' in x]

trn_series = pd.DataFrame()
tst_series = pd.DataFrame()

# Concatenate each model's validation predictions column-wise, renaming the
# 'visitors' column so every model gets a distinct column (visitors_0, ...).
for i, trn in enumerate(trn_list):
    temp = pd.read_csv(
        trn,
        index_col=['id']).rename(columns={'visitors': ('visitors_' + str(i))})
    trn_series = pd.concat([trn_series, temp], axis=1)

for i, tst in enumerate(tst_list):
    temp = pd.read_csv(