def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
    """Tune the number of boosting rounds with early stopping.

    Trains once with up to 1000 rounds under a log-decayed learning-rate
    schedule, then stores the best iteration back into
    self.hyperparams['num_boost_round'] and truncates self.learning_rates
    to that length. Pops 'learning_rate' from self.hyperparams (the
    per-round schedule replaces it).
    """
    # Multi-label mode trains on the first label column only; otherwise
    # convert one-hot labels to class indices.
    if self.is_multi_label:
        train_label, eval_label = y_train[:, 0], y_eval[:, 0]
    else:
        train_label, eval_label = ohe2cat(y_train), ohe2cat(y_eval)
    lgb_train = lgb.Dataset(X_train, train_label)
    lgb_eval = lgb.Dataset(X_eval, eval_label, reference=lgb_train)

    self.hyperparams['num_boost_round'] = 1000
    base_lr = self.hyperparams.pop('learning_rate')
    # Per-round learning-rate schedule decaying from base_lr to 0.6 * base_lr.
    self.learning_rates = self.get_log_lr(1000, base_lr, base_lr * 0.6)

    self._model = lgb.train({**self.params, **self.hyperparams},
                            verbose_eval=20,
                            train_set=lgb_train,
                            valid_sets=lgb_eval,
                            valid_names='eval',
                            early_stopping_rounds=20,
                            learning_rates=self.learning_rates)

    best_round = self._model.best_iteration
    self.hyperparams['num_boost_round'] = best_round
    self.learning_rates = self.learning_rates[:best_round]
    log('best boost round: {}'.format(best_round))
def get_top_preds(self):
    """Select the top validation models and compute blending weights.

    Models are ranked by validation AUC (self.hist_info maps name ->
    (auc, preds)). The top group is the largest suffix-trimmed prefix whose
    AUC standard deviation falls below self.ensemble_std_threshold. Weights
    sharpen the AUC differences (15x spread around the mean, floored at
    0.01) and are normalized to sum to 1.

    Returns a list of (name, rank, auc, weight, preds) tuples.
    """
    ranked = sorted(self.hist_info.items(),
                    key=lambda kv: kv[1][0], reverse=True)
    sorted_names = [name for name, _ in ranked]
    sorted_aucs = [info[0] for _, info in ranked]

    # Shrink the group from the full list downward until its AUC std is
    # below the threshold (a single model always qualifies: std == 0).
    for count in range(len(sorted_aucs), 0, -1):
        std = np.std(sorted_aucs[:count])
        top_num = count
        if std < self.ensemble_std_threshold:
            break
    log('top {} model auc std is {}'.format(top_num, std))

    top_auc = np.array(sorted_aucs[:top_num])
    # Exaggerate AUC differences so better models dominate the blend,
    # then floor at 0.01 to keep every weight positive.
    top_auc = top_auc + 15 * (top_auc - top_auc.mean())
    top_auc = np.maximum(top_auc, 0.01)
    weights = top_auc / top_auc.sum()
    print(weights)

    top_preds = []
    for idx in range(top_num):
        model_name = sorted_names[idx]
        top_preds.append((model_name, idx + 1, sorted_aucs[idx],
                          weights[idx], self.hist_info[model_name][1]))
    return top_preds
def explore_model_space(self, train_loop_num):
    """Choose which model to train in this loop.

    On the first loop, takes the first model from self.model_prior. On
    later loops, advances to the next model in the prior when the current
    one has stalled or exhausted its budget, or when bagging mode is on.
    """
    if train_loop_num == 1:
        self.model = self.model_space.get_model(
            self.model_prior[self.model_idx], self.round_num)
        self.last_model_type = self.model.type
    else:
        # Switch models when: validation AUC has not risen for `patience`
        # evaluations, cumulative gain has stalled, the per-model run
        # budget is used up, or we are cycling models in bagging mode.
        # (`not_gain_threhlod` is a pre-existing attribute-name typo kept
        # for compatibility with wherever it is defined.)
        if self.model.not_rise_num == self.model.patience \
                or (self.model.not_gain_num > self.model.not_gain_threhlod) \
                or self.model.run_num >= self.model.max_run \
                or self.info['mode'] == 'bagging':
            log('model {}'.format(self.model.name))
            log('not rise num {}'.format(self.model.not_rise_num))
            log('not gain num {}'.format(self.model.not_gain_num))
            log('run num {}'.format(self.model.run_num))
            log('last auc gain {}'.format(self.model.auc_gain))
            self.model_idx += 1
            self.reset_model_cache()
            # Every model tried once: re-rank the prior (sort_model_prior
            # resets model_idx to 0) and enter bagging mode.
            if self.model_idx == len(self.model_prior):
                self.sort_model_prior()
                self.info['mode'] = 'bagging'
                self.data_space.update = True
            self.model = self.model_space.get_model(
                self.model_prior[self.model_idx], self.round_num)
            self.use_all_data = False
            # A different model family needs a freshly built dataloader.
            # NOTE(review): last_model_type is not refreshed after the
            # switch — confirm against the caller whether that is intended.
            if self.model.type != self.last_model_type:
                self.dataloader = None
                gc.collect()
def log_feat_importances(self, return_info=False):
    """Log gain-based feature importances and cache the important columns.

    Uses self.models[0] in multi-label mode, otherwise self._model. Keeps
    the top 80% of positively-important features in self.import_cols and
    the top 30 in self.imp_nums.
    """
    booster = self.models[0] if self.is_multi_label else self._model
    importances = pd.DataFrame({
        'features': list(booster.feature_name()),
        'importances': booster.feature_importance("gain"),
    })
    importances.sort_values('importances', ascending=False, inplace=True)
    log('feat importance:{}'.format(importances.head(100)))

    # Drop features with zero gain, then keep the top 80% of the rest.
    importances = importances[importances['importances'] > 0]
    size = int(len(importances) * 0.8)
    log('imp cols size {}'.format(size))
    self.import_cols = importances['features'][:size].values
    self.imp_nums = list(importances['features'][:30].values)
def blending_predict(self):
    """Blend the top models' predictions into one weighted ensemble.

    Each model's predictions are normalized by their mean before being
    weighted, so models with different output scales blend fairly.
    """
    ensemble_preds = 0
    for name, rank, auc, weight, preds in self.get_top_preds():
        mean_pred = np.mean(preds)
        log('blending: {}, rank: {}, mean {}, val auc: {} weight {}'.
            format(name, rank, mean_pred, auc, weight))
        ensemble_preds += weight * preds / mean_pred
    return ensemble_preds
def get_categories(self, df):
    """Return the known categorical columns present in df with low cardinality.

    A column qualifies when it is in self.cat_cols, exists in df, and has
    at most 10 distinct values.
    """
    present = set(df.columns)
    categories = [col for col in self.cat_cols
                  if col in present and df[col].nunique() <= 10]
    log('get categories: {}'.format(categories))
    return categories
def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
    """Tune XGBoost's boosting-round count with early stopping.

    Trains up to 1200 rounds with early stopping on the eval set and
    records the best iteration in self.params['num_boost_round'].
    """
    # Multi-label mode trains on the second label column only; otherwise
    # convert one-hot labels to class indices.
    if self.is_multi_label:
        train_label, eval_label = y_train[:, 1], y_eval[:, 1]
    else:
        train_label, eval_label = ohe2cat(y_train), ohe2cat(y_eval)
    dtrain = xgb.DMatrix(X_train, train_label)
    dvalid = xgb.DMatrix(X_eval, eval_label)

    booster = xgb.train({**self.params, **self.hyperparams}, dtrain,
                        evals=[(dvalid, 'eval')],
                        num_boost_round=1200,
                        early_stopping_rounds=10)
    self.params['num_boost_round'] = booster.best_iteration
    log('best boost round: {}'.format(booster.best_iteration))
def sort_model_prior(self):
    """Re-rank self.model_prior by each model family's mean historical AUC.

    Groups entries of self.hist_info (name -> (auc, ...)) by the name's
    prefix before the first '_', sorts the prior best-first by mean AUC,
    then resets the model cursor and advances the round counter.
    """
    log('old models prior is {}'.format(self.model_prior))
    model_perform = collections.defaultdict(list)
    for name, info in self.hist_info.items():
        first_name = name.split('_')[0]
        # BUG FIX: the old guard `if first_name in model_perform:` was
        # always False for a key never inserted (membership tests do not
        # trigger defaultdict's factory), so no AUC was ever recorded and
        # np.mean() ran on empty lists. defaultdict makes the guard
        # unnecessary — append unconditionally.
        model_perform[first_name].append(info[0])
    # Families with no recorded AUC sort last instead of producing
    # np.mean([]) == nan, which makes the ordering undefined.
    self.model_prior = sorted(
        self.model_prior,
        key=lambda x: np.mean(model_perform[x]) if model_perform[x] else -np.inf,
        reverse=True)
    log('new models prior is {}'.format(self.model_prior))
    self.model_idx = 0
    self.round_num += 1
def epoch_train(self, dataloader, run_num, is_multi_label=None, info=None, time_remain=None):
    """Train the XGBoost model for one epoch/round.

    Depending on run_num this either explores hyperparameters (bayes_opt +
    early_stop_opt on a subsample), trains on all data, or does a normal
    train. Bagging mode reseeds and forces the exploration path.
    """
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = dataloader['X'], dataloader['y'], \
        dataloader['train_idxs'], dataloader['cat_cols']
    train_x, train_y = X.loc[train_idxs], y[train_idxs]
    if info['mode'] == 'bagging':
        # Bagging: restart from the shared xgb hyperparameters with a
        # fresh seed and rerun parameter exploration.
        self.hyperparams = info['xgb'].copy()
        self.hyperparams['random_seed'] = np.random.randint(0, 2020)
        run_num = self.explore_params_round
    if run_num == self.explore_params_round:
        print('xgb explore_params_round')
        train_x, train_y, val_x, val_y, = self.split_data(train_x, train_y)
        self.import_cols = info['imp_cols']
        # Cap the exploration workload: at most 300 (important) columns
        # and 10k sampled rows.
        if train_x.shape[1] > 300 and train_x.shape[0] > 10000:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=10000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 1w samples')
        elif train_x.shape[0] > 10000:
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=10000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 1w samples')
        elif train_x.shape[1] > 300:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
        print('shape: ', train_x.shape)
        self.bayes_opt(train_x, val_x, train_y, val_y, cat)
        self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        info['xgb'] = self.hyperparams.copy()
        # Restore the full (unsampled) training slice for the final fit.
        train_x, train_y = X.loc[train_idxs], y[train_idxs]
    if run_num == self.all_data_round:
        print('xgb all data round')
        all_train_idxs = dataloader['all_train_idxs']
        train_x = X.loc[all_train_idxs]
        train_y = y[all_train_idxs]
    if not self.is_multi_label:
        xgb_train = xgb.DMatrix(train_x, ohe2cat(train_y))
        self._model = xgb.train({**self.params, **self.hyperparams}, xgb_train)
    else:
        # One binary booster per class for multi-label.
        for cls in range(self.num_class):
            cls_y = train_y[:, cls]
            xgb_train = xgb.DMatrix(train_x, cls_y)
            # NOTE(review): reconstructed from a wrapped line — this reads
            # as a chained assignment, which leaves self._model pointing at
            # the last class's booster; confirm against the original file.
            self.models[cls] = self._model = \
                xgb.train({**self.params, **self.hyperparams}, xgb_train)
def nn_process(self, X, cat_cols):
    """Build the numeric feature matrix fed to the NN models.

    Fills NaNs, fit-transforms categorical and numerical columns
    separately, and concatenates whatever is non-empty.

    Raises:
        ValueError: if neither categorical nor numerical features were
            produced (previously this path raised an opaque
            UnboundLocalError on `feats`).
    """
    num_cols = [col for col in X.columns if col not in cat_cols]
    log('nn_process num col: {}'.format(num_cols))
    log('nn_process cat col:{}'.format(cat_cols))
    X = fill_na(X)
    cat_feats = self.cat_fit_transform(X, cat_cols)
    num_feats = self.num_fit_transform(X, num_cols)
    if len(cat_feats) > 0 and len(num_feats) > 0:
        feats = np.concatenate([cat_feats, num_feats], axis=1)
    elif len(cat_feats) > 0:
        feats = cat_feats
    elif len(num_feats) > 0:
        feats = num_feats
    else:
        # BUG FIX: `feats` was unbound on this path, producing an
        # UnboundLocalError at `return feats`; fail explicitly instead.
        raise ValueError('nn_process: no categorical or numerical features produced')
    return feats
def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
    """Tune CatBoost's iteration count with early stopping.

    Fits with use_best_model on the eval set and stores the best iteration
    in self.params['iterations'].
    """
    # Multi-label mode trains on the second label column only; otherwise
    # convert one-hot labels to class indices.
    if self.is_multi_label:
        train_label, eval_label = y_train[:, 1], y_eval[:, 1]
    else:
        train_label, eval_label = ohe2cat(y_train), ohe2cat(y_eval)

    clf = CatBoostClassifier(**{**self.params, **self.hyperparams})
    clf.fit(X_train, train_label,
            eval_set=[(X_eval, eval_label)],
            use_best_model=True,
            verbose=10,
            early_stopping_rounds=20)
    self.params['iterations'] = clf.best_iteration_
    log('best iterations: {}'.format(clf.best_iteration_))
def objective(hyperparams):
    """Hyperopt objective: train with the sampled params, return the eval loss.

    Mutates the enclosing `tmp_hyperparams` in place (the accumulated
    settings persist across trials), trains against the closure's
    train/valid datasets, and returns the early-stopped best score.
    """
    tmp_hyperparams.update(hyperparams)
    log('hyper {}'.format(tmp_hyperparams))
    booster = lgb.train({**params, **tmp_hyperparams},
                        train_set=train_data,
                        valid_sets=valid_data,
                        early_stopping_rounds=18,
                        verbose_eval=0)
    best = booster.best_score["valid_0"][params["metric"]]
    # The configured metric is minimized, so the raw score is the loss.
    return {'loss': best, 'status': STATUS_OK}
def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
    """Search CatBoost hyperparameters with hyperopt TPE (15 trials).

    Each trial trains a 300-iteration model on the sampled candidate and
    scores validation AUC; the best candidate is merged into
    self.hyperparams.
    """
    if self.is_multi_label:
        y_train = y_train[:, 1]
        y_eval = y_eval[:, 1]
    else:
        y_train = ohe2cat(y_train)
        # NOTE(review): y_eval is deliberately left one-hot here and scored
        # with predict_proba below — confirm this matches the original
        # intent for the multiclass AUC.
    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
        "depth": hp.choice("depth", [4, 6, 8, 10, 12]),
        "l2_leaf_reg": hp.uniform('l2_leaf_reg', 0.1, 2),
    }

    def objective(hyperparams):
        # BUG FIX: the previous code immediately rebound `hyperparams` to
        # self.hyperparams.copy(), discarding the hyperopt-sampled
        # candidate, so every trial evaluated the identical configuration.
        # Merge the base hyperparameters with the sampled candidate instead.
        trial_params = self.hyperparams.copy()
        trial_params.update(hyperparams)
        trial_params['iterations'] = 300
        model = CatBoostClassifier(**{**self.params, **trial_params})
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_eval)
        if self.is_multi_label:
            score = roc_auc_score(y_eval, pred[:, 1])
        else:
            score = roc_auc_score(y_eval, pred)
        # hyperopt minimizes, so negate the AUC.
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=15, verbose=1,
                         rstate=np.random.RandomState(1))
    self.hyperparams.update(space_eval(space, best))
    log("auc = {}, hyperparams: {}".format(
        -trials.best_trial['result']['loss'], self.hyperparams))
def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
    """Search XGBoost hyperparameters with hyperopt TPE (10 trials).

    Each trial trains 50 rounds on the sampled candidate and scores
    validation AUC; the best candidate is merged into self.hyperparams.
    """
    if self.is_multi_label:
        dtrain = xgb.DMatrix(X_train, y_train[:, 1])
        dvalid = xgb.DMatrix(X_eval, y_eval[:, 1])
    else:
        dtrain = xgb.DMatrix(X_train, ohe2cat(y_train))
        dvalid = xgb.DMatrix(X_eval, ohe2cat(y_eval))

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
        "max_depth": hp.choice("max_depth", [4, 6, 8, 10, 12]),
        "min_child_weight": hp.uniform('min_child_weight', 0.01, 1),
        "min_data_in_leaf": hp.choice("min_data_in_leaf", np.linspace(10, 100, 20, dtype=int)),
        "gamma": hp.uniform("gamma", 0.001, 0.1),
        "lambda": hp.uniform("lambda", 0, 1),
        "alpha": hp.uniform("alpha", 0, 1),
        "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.9]),
        "colsample_bylevel": hp.choice("colsample_bylevel", [0.7, 0.9]),
        "colsample_bynode": hp.choice("colsample_bynode", [0.7, 0.9]),
    }

    def objective(hyperparams):
        booster = xgb.train({**self.params, **hyperparams}, dtrain,
                            num_boost_round=50)
        pred = booster.predict(dvalid)
        if self.is_multi_label:
            score = roc_auc_score(y_eval[:, 1], pred[:, 1])
        else:
            score = roc_auc_score(y_eval, pred)
        # hyperopt minimizes, so negate the AUC.
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=10, verbose=1,
                         rstate=np.random.RandomState(1))
    self.hyperparams.update(space_eval(space, best))
    log("auc = {}, hyperparams: {}".format(
        -trials.best_trial['result']['loss'], self.hyperparams))
def explore_space(self, train_loop_num, time_remain=None):
    """Run one exploration loop: pick model and data, train, evaluate, record.

    When the loop trained on all data there is no held-out set, so the
    recorded AUC is the model's best AUC plus a small epsilon instead of a
    fresh validation score.
    """
    self.explore_model_space(train_loop_num)
    self.explore_data_space(train_loop_num)
    self.create_model(self.metadata.output_dim)
    # Train, then evaluate (unless all data was used for training).
    self.model.epoch_train(self.dataloader,
                           run_num=self.model.run_num,
                           is_multi_label=self.info['is_multi_label'],
                           info=self.info,
                           time_remain=time_remain)
    if self.use_all_data:
        val_auc = self.model.best_auc + 0.0001
        self.use_all_data = False
    else:
        val_auc = self.model.epoch_valid(self.dataloader)
        log('explore model {}, val auc is {}'.format(
            self.model.name, val_auc))
    self.update_model_hist(val_auc)
def get_dataloader(self, train_loop_num, round_num, run_num, use_all_data, model_type):
    """Assemble the dataloader dict for the requested model type.

    Selects the train split for this round, builds model-specific features
    ('nn_keras'/'lr': processed numeric matrix; 'emb_nn': label-encoded
    frame; 'tree': raw data), and attaches the shared index/split metadata.

    Returns:
        dict with keys 'X', 'y', plus model-specific keys ('shape' or
        'cat_cols') and the shared 'all_train_idxs', 'train_idxs',
        'val_idxs', 'test_idxs', 'splits', 'cat_cols'.
    """
    self.train_idxs, self.val_idxs = self.splits[round_num - 1]
    print('round {}'.format(round_num))
    data_loader = {}
    if use_all_data:
        log('all data')
        sample_idxs = self.all_train_idxs
    else:
        log('all train')
        sample_idxs = self.train_idxs
    log('sample ratio {}'.format(len(sample_idxs) / len(self.train_idxs)))

    cat_cols = self.get_categories(self.data)
    if model_type == 'nn_keras':
        feats = self.nn_process(self.data, cat_cols)
        feats = pd.DataFrame(feats, index=self.all_idxs)
        data_loader['X'], data_loader['y'] = feats, self.y
        data_loader['shape'] = feats.shape[1]
    elif model_type == 'emb_nn':
        feats = pd.DataFrame(self.data, index=self.all_idxs)
        self.label_encode(feats, cat_cols)
        data_loader['X'], data_loader['y'] = feats, self.y
        data_loader['shape'] = feats.shape[1]
    elif model_type == 'tree':
        # CLEANUP: the old `do_sample_col` flag was hard-coded False and
        # both branches assigned the identical value; collapsed to a
        # single assignment (no behavior change).
        data_loader['X'] = self.data
        data_loader['y'] = self.y
        data_loader['cat_cols'] = cat_cols
    elif model_type == 'lr':
        feats = self.nn_process(self.data, cat_cols)
        feats = pd.DataFrame(feats, index=self.all_idxs)
        data_loader['X'], data_loader['y'] = feats, self.y
        data_loader['cat_cols'] = cat_cols

    # Shared metadata every model type needs.
    data_loader['all_train_idxs'] = self.all_train_idxs
    data_loader['train_idxs'] = sample_idxs
    data_loader['val_idxs'] = self.val_idxs
    data_loader['test_idxs'] = self.test_idxs
    data_loader['splits'] = self.splits
    data_loader['cat_cols'] = cat_cols
    return data_loader
def epoch_train(self, dataloader, run_num, is_multi_label=False, info=None, time_remain=None):
    """Train the LightGBM model for one epoch/round.

    run_num selects the phase: hyperparameter exploration, per-split
    ensemble training, or a normal (optionally all-data) train. Bagging
    mode reseeds the shared lgb hyperparameters and jitters num_leaves.
    """
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
        'train_idxs'], dataloader['cat_cols']
    train_x, train_y = X.loc[train_idxs], y[train_idxs]
    if info['mode'] == 'bagging':
        # Bagging: fresh seed plus a +/-10% jitter on num_leaves for
        # diversity between bagged members.
        self.hyperparams = info['lgb'].copy()
        self.hyperparams['seed'] = np.random.randint(0, 2020)
        num_leaves = self.hyperparams['num_leaves']
        self.hyperparams['num_leaves'] += np.random.randint(
            -int(num_leaves / 10), int(num_leaves / 10))
        run_num = 0
    if run_num == self.explore_params_round:
        print('lgb explore_params_round')
        train_x, train_y, val_x, val_y, = self.split_data(train_x, train_y)
        self.log_feat_importances()
        # Cap the exploration workload: at most 300 (important) columns
        # and 20k sampled rows.
        if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 2w samples')
        elif train_x.shape[0] > 20000:
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 2w samples')
        elif train_x.shape[1] > 300:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
        print('shape: ', train_x.shape)
        self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=1)
        self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        info['lgb'] = self.hyperparams.copy()
        info['imp_cols'] = self.import_cols
    if run_num == self.ensemble_num:
        # Ensemble phase: train one model (or one per class) per CV split.
        print('lgb ensemble_num')
        splits = dataloader['splits']
        for i in range(len(splits)):
            train_idxs, val_idxs = splits[i]
            train_x, train_y = X.loc[train_idxs], y[train_idxs]
            hyperparams = self.hyperparams.copy()
            # num_leaves = hyperparams['num_leaves']
            # num_leaves += np.random.randint(-int(num_leaves/10),
            #                                 int(num_leaves/10))
            # hyperparams['num_leaves'] = num_leaves
            # log('model {} leaves {}'.format(i, num_leaves))
            if self.is_multi_label:
                self.en_models = defaultdict(list)
                for cls in range(self.num_class):
                    cls_y = train_y[:, cls]
                    lgb_train = lgb.Dataset(train_x, cls_y)
                    # learning_rates (a per-round schedule from
                    # early_stop_opt) is only passed when set.
                    if not self.learning_rates:
                        self.en_models[i].append(
                            lgb.train({
                                **self.params,
                                **hyperparams
                            }, train_set=lgb_train))
                    else:
                        self.en_models[i].append(
                            lgb.train({
                                **self.params,
                                **hyperparams
                            },
                                train_set=lgb_train,
                                learning_rates=self.learning_rates))
            else:
                lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                if not self.learning_rates:
                    self.en_models[i] = lgb.train(
                        {
                            **self.params,
                            **hyperparams
                        }, train_set=lgb_train)
                else:
                    self.en_models[i] = lgb.train(
                        {
                            **self.params,
                            **hyperparams
                        },
                        train_set=lgb_train,
                        learning_rates=self.learning_rates)
        self.ensemble_pred = True
    else:
        print('lgb norm train')
        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        hyperparams = self.hyperparams.copy()
        log('hyperparams {}'.format(hyperparams))
        if run_num == self.all_data_round_pre or run_num == self.all_data_round:
            # All-data rounds retrain on the full training index set.
            print('lgb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]
            print('shape: ', train_x.shape)
        if not is_multi_label:
            lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
            if not self.learning_rates:
                self._model = lgb.train({
                    **self.params,
                    **hyperparams
                }, train_set=lgb_train)
            else:
                self._model = lgb.train({
                    **self.params,
                    **hyperparams
                },
                    train_set=lgb_train,
                    learning_rates=self.learning_rates)
        else:
            # Multi-label: one binary (num_class=2) model per label column.
            self.params['num_class'] = 2
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                lgb_train = lgb.Dataset(train_x, cls_y)
                if not self.learning_rates:
                    self.models[cls] = lgb.train(
                        {
                            **self.params,
                            **self.hyperparams
                        }, train_set=lgb_train)
                else:
                    self.models[cls] = lgb.train(
                        {
                            **self.params,
                            **self.hyperparams
                        },
                        train_set=lgb_train,
                        learning_rates=self.learning_rates)
    self.log_feat_importances()
    if self.imp_nums is not None:
        info['imp_nums'] = self.imp_nums
def drop_post_drop_column(self, df):
    """Drop the columns collected in self.post_drop_set from df in place.

    No-op when the set is empty; otherwise drops the columns, forces a GC
    pass to release the freed column data, and logs what was dropped.
    """
    if not self.post_drop_set:
        return
    drop_cols = list(self.post_drop_set)
    df.drop(drop_cols, axis=1, inplace=True)
    gc.collect()
    log('post drop cols:{}'.format(drop_cols))
def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories, phase=1):
    """Search LightGBM hyperparameters with hyperopt TPE.

    phase 1 searches the full tree-structure space (20 trials, 100 boost
    rounds per trial); phase 2 refines only the learning rate (5 trials,
    500 rounds) while pinning the structure found earlier. The best
    candidate is merged into self.hyperparams.
    """
    # Multi-label mode uses the first label column; otherwise convert
    # one-hot labels to class indices.
    if self.is_multi_label:
        train_data = lgb.Dataset(X_train, label=y_train[:, 0])
        valid_data = lgb.Dataset(X_eval, label=y_eval[:, 0])
    else:
        y_train = ohe2cat(y_train)
        y_eval = ohe2cat(y_eval)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_eval, label=y_eval)
    params = self.params
    if phase == 1:
        space = {
            'max_depth': hp.choice("max_depth", [-1, 5, 7, 9]),
            "num_leaves": hp.choice("num_leaves",
                                    np.linspace(20, 61, 10, dtype=int)),
            "reg_alpha": hp.uniform("reg_alpha", 0, 1),
            "reg_lambda": hp.uniform("reg_lambda", 0, 1),
            # NOTE(review): the dict key is "min_child_samples" but the
            # hyperopt label is "min_data_in_leaf", while phase 2 below
            # reads self.hyperparams['min_data_in_leaf'] — space_eval
            # returns dict-key names, so verify that 'min_data_in_leaf'
            # exists in self.hyperparams before a phase-2 call.
            "min_child_samples": hp.choice("min_data_in_leaf",
                                           np.linspace(10, 120, 10, dtype=int)),
            "min_child_weight": hp.uniform('min_child_weight', 0.01, 1),
            "min_split_gain": hp.uniform('min_split_gain', 0.001, 0.1),
            'colsample_bytree': hp.choice("colsample_bytree", [0.7, 0.9]),
            "learning_rate": hp.loguniform("learning_rate",
                                           np.log(0.01), np.log(0.1)),
        }
        tmp_hyperparams = {}
        tmp_hyperparams['num_boost_round'] = 100
        max_evals = 20
    else:
        space = {
            "learning_rate": hp.loguniform("learning_rate",
                                           np.log(0.01), np.log(0.1)),
        }
        tmp_hyperparams = {}
        # Pin the structural parameters found in phase 1.
        update = [
            'max_depth', 'num_leaves', 'reg_alpha', 'reg_lambda',
            'min_data_in_leaf', 'min_child_weight', 'min_split_gain'
        ]
        for p in update:
            tmp_hyperparams[p] = self.hyperparams[p]
        tmp_hyperparams['num_boost_round'] = 500
        max_evals = 5

    def objective(hyperparams):
        # Accumulates sampled settings into tmp_hyperparams across trials.
        tmp_hyperparams.update(hyperparams)
        log('hyper {}'.format(tmp_hyperparams))
        model = lgb.train(
            {
                **params,
                **tmp_hyperparams
            },
            train_set=train_data,
            valid_sets=valid_data,
            #categorical_feature=categories,
            early_stopping_rounds=18,
            verbose_eval=0)
        score = model.best_score["valid_0"][params["metric"]]
        # in classification, less is better
        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective,
                         space=space,
                         trials=trials,
                         algo=tpe.suggest,
                         max_evals=max_evals,
                         verbose=1,
                         rstate=np.random.RandomState(1))
    self.hyperparams.update(space_eval(space, best))
    log("auc = {}, hyperparams: {}".format(
        -trials.best_trial['result']['loss'], self.hyperparams))
def reset_model_cache(self):
    """Release the cached model and reclaim memory."""
    log('clear model cache')
    # Unbind before the GC pass so the old model object is collectable,
    # then leave the attribute defined (as None) for later checks.
    del self.model
    self.model = None
    gc.collect()
    # Also free Keras' global graph/session state (matters for the NN models).
    K.clear_session()