def lgb_evaluate(
    min_child_sample,
    num_leaves,
    max_bin,
    min_child_weight,
    subsample,
    subsample_freq,
    colsample_bytree,
    reg_alpha,
    reg_lambda,
):
    """Fit an ``LgbWrapper`` on a fixed hold-out split and return its ``best_score``.

    ``min_child_sample``, ``num_leaves``, ``max_bin`` and ``subsample_freq``
    are rounded to the nearest integer before they go into the parameter
    dict; every other argument is forwarded unchanged.

    The data comes from ``data_preparation()``; only its first ``ntrain``
    rows are used, split 67/33 with a fixed random state.

    NOTE(review): presumably the objective callback of a hyper-parameter
    search (hence the float-valued integer parameters) -- confirm against
    the caller.
    """
    label = 'visitors'
    feature_names, categorical_names = features_set_f2()
    holdout_fraction = 0.33
    rng_seed = 177

    frame, n_train_rows, _ = data_preparation()

    # Positional indices of the categorical columns within the feature list.
    categorical_positions = [
        position
        for position, column in enumerate(frame[feature_names].columns)
        if column in categorical_names
    ]

    train_rows = frame[:n_train_rows]
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_rows[feature_names].values,
        train_rows[label].values,
        test_size=holdout_fraction,
        random_state=rng_seed,
    )

    # Only the split pieces are needed from here on; drop the frames.
    del frame, train_rows
    gc.collect()

    lgb_params = {
        'objective': 'regression_l2',
        'metric': 'l2_root',
        'learning_rate': 0.1,
        'random_state': rng_seed,
        'silent': True,  # does help
        'verbose_eval': False,
        'n_estimators': 500,
        'min_child_samples': int(np.round(min_child_sample)),
        'num_leaves': int(np.round(num_leaves)),
        'max_bin': int(np.round(max_bin)),
        'subsample_freq': int(np.round(subsample_freq)),
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
        'min_child_weight': min_child_weight,
        'subsample': subsample,
        'cat_features': categorical_positions,
    }

    model = LgbWrapper(params=lgb_params)
    model.train(x_train, y_train, x_valid, y_valid)
    return model.best_score
def xgb_evaluate(
    min_child_weight,
    colsample_bytree,
    max_depth,
    subsample,
    gamma,
    reg_alpha,
    reg_lambda,
    rate_drop,
):
    """Fit an ``XgbWrapper`` on a fixed hold-out split and return its ``best_score``.

    ``max_depth`` and ``min_child_weight`` are rounded to the nearest
    integer before they go into the parameter dict; every other argument is
    forwarded unchanged (``reg_alpha`` / ``reg_lambda`` are stored under the
    keys ``'alpha'`` / ``'lambda'``).

    The data comes from ``data_preparation()``; only its first ``ntrain``
    rows are used, split 67/33 with a fixed random state.

    NOTE(review): presumably the objective callback of a hyper-parameter
    search (hence the float-valued integer parameters) -- confirm against
    the caller.
    """
    label = 'visitors'
    feature_names, _ = features_set_f2()
    holdout_fraction = 0.33
    rng_seed = 177

    frame, n_train_rows, _ = data_preparation()
    train_rows = frame[:n_train_rows]
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_rows[feature_names].values,
        train_rows[label].values,
        test_size=holdout_fraction,
        random_state=rng_seed,
    )

    # Only the split pieces are needed from here on; drop the frames.
    del frame, train_rows
    gc.collect()

    xgb_params = {
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'eta': 0.1,
        'seed': rng_seed,
        'silent': True,  # does help
        'verbose_eval': False,
        'nrounds': 500,
        'max_depth': int(np.round(max_depth)),
        'min_child_weight': int(np.round(min_child_weight)),
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'gamma': gamma,
        'alpha': reg_alpha,
        'lambda': reg_lambda,
        'rate_drop': rate_drop,
    }

    model = XgbWrapper(seed=rng_seed, params=xgb_params)
    model.train(x_train, y_train, x_valid, y_valid)
    return model.best_score
import sys

# The project packages live one directory up; this must run before the
# project-local imports below.
sys.path.append("../")

from general.preprocess import data_preparation
from general.ClfWrappers import XgbWrapper
from general.utilities import sub_to_csv
from features.f2 import features_set_f2
from cv.cv_04 import cross_validate

# Target column, feature lists and the seed used by this script.
TARGET = 'visitors'
FEATURES, CAT_FEATS = features_set_f2()
SEED = 177

print("Overfiting process initiating...")

# Fixed XGBoost settings for this run.
xgb_params = {
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'eta': 0.02,
    'seed': SEED,
    'silent': True,  # does help
    'verbose_eval': False,
    'nrounds': 2000,
    'early_stopping_rounds': 100,
    'max_depth': 5,
    'min_child_weight': 1.91,
    'colsample_bytree': 0.920,
    'subsample': 0.856,
    'gamma': 0.718,
    'alpha': 1.83,
}
import pandas as pd import xgboost as xgb from sklearn.linear_model import Ridge, LinearRegression from sklearn.metrics import mean_squared_error sys.path.append("../") from general.utilities import sub_to_csv_stacker from general.preprocess import data_preparation from general.ClfWrappers import Stacker, SklearnWrapper, XgbWrapper from stacker.utilities import search_model, score_valid from features.f2 import features_set_f2 # starts here full_data, ntrain, ntest = data_preparation() features, cat_features = features_set_f2() tgt = pd.read_csv('../data/air_visit_data.csv').visitors.values trn_list = [x for x in glob.glob('../valid/*.csv') if 'cv4' in x] tst_list = [x for x in glob.glob('../submission/*.csv') if 'cv4' in x] trn_series = pd.DataFrame() tst_series = pd.DataFrame() for i, trn in enumerate(trn_list): temp = pd.read_csv( trn, index_col=['id']).rename(columns={'visitors': ('visitors_' + str(i))}) trn_series = pd.concat([trn_series, temp], axis=1) for i, tst in enumerate(tst_list): temp = pd.read_csv(