def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
    if self.is_multi_label:
        lgb_train = lgb.Dataset(X_train, y_train[:, 0])
        lgb_eval = lgb.Dataset(X_eval, y_eval[:, 0], reference=lgb_train)
    else:
        y_train = ohe2cat(y_train)
        y_eval = ohe2cat(y_eval)
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    self.hyperparams['num_boost_round'] = 1000
    # Decay the learning rate on a log schedule from lr down to 0.6 * lr.
    tmp_lr = self.hyperparams.pop('learning_rate')
    self.learning_rates = self.get_log_lr(1000, tmp_lr, tmp_lr * 0.6)
    self._model = lgb.train({**self.params, **self.hyperparams},
                            train_set=lgb_train,
                            valid_sets=lgb_eval,
                            valid_names='eval',
                            verbose_eval=20,
                            early_stopping_rounds=20,
                            # categorical_feature=categories,
                            learning_rates=self.learning_rates)
    # Keep only the boosting rounds (and matching per-round learning rates)
    # up to the best iteration found by early stopping.
    self.hyperparams['num_boost_round'] = self._model.best_iteration
    self.learning_rates = self.learning_rates[:self._model.best_iteration]
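# A minimal sketch of the get_log_lr helper used above; its implementation is
# not shown in this section, so this is an assumption: it returns `num_round`
# per-round learning rates spaced log-uniformly between max_lr and min_lr,
# which lgb.train consumes via its `learning_rates` argument.
def get_log_lr(self, num_round, max_lr, min_lr):
    # np.logspace interpolates in log10 space, giving a smooth geometric decay.
    return list(np.logspace(np.log10(max_lr), np.log10(min_lr), num_round))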
def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
    if self.is_multi_label:
        dtrain = xgb.DMatrix(X_train, y_train[:, 1])
        dvalid = xgb.DMatrix(X_eval, y_eval[:, 1])
    else:
        dtrain = xgb.DMatrix(X_train, ohe2cat(y_train))
        dvalid = xgb.DMatrix(X_eval, ohe2cat(y_eval))

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
        "max_depth": hp.choice("max_depth", [4, 6, 8, 10, 12]),
        "min_child_weight": hp.uniform("min_child_weight", 0.01, 1),
        "min_data_in_leaf": hp.choice("min_data_in_leaf", np.linspace(10, 100, 20, dtype=int)),
        "gamma": hp.uniform("gamma", 0.001, 0.1),
        "lambda": hp.uniform("lambda", 0, 1),
        "alpha": hp.uniform("alpha", 0, 1),
        "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.9]),
        "colsample_bylevel": hp.choice("colsample_bylevel", [0.7, 0.9]),
        "colsample_bynode": hp.choice("colsample_bynode", [0.7, 0.9]),
    }

    def objective(hyperparams):
        model = xgb.train({**self.params, **hyperparams}, dtrain,
                          num_boost_round=50)
        pred = model.predict(dvalid)
        if self.is_multi_label:
            score = roc_auc_score(y_eval[:, 1], pred[:, 1])
        else:
            score = roc_auc_score(y_eval, pred)
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=10, verbose=1,
                         rstate=np.random.RandomState(1))
    self.hyperparams.update(space_eval(space, best))
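# Why space_eval is applied to fmin's result above: hyperopt reports hp.choice
# parameters as indices into their option lists, not as the option values.
# A toy, self-contained illustration (hypothetical space and objective, not
# the ones used above):
def _space_eval_demo():
    space = {"max_depth": hp.choice("max_depth", [4, 6, 8])}
    best = hyperopt.fmin(fn=lambda params: 0.0, space=space, algo=tpe.suggest,
                         max_evals=1, rstate=np.random.RandomState(1))
    # best is e.g. {'max_depth': 1} (an index); space_eval maps it back to
    # the actual value, e.g. {'max_depth': 6}.
    return space_eval(space, best)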
def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
    if self.is_multi_label:
        dtrain = xgb.DMatrix(X_train, y_train[:, 1])
        dvalid = xgb.DMatrix(X_eval, y_eval[:, 1])
    else:
        dtrain = xgb.DMatrix(X_train, ohe2cat(y_train))
        dvalid = xgb.DMatrix(X_eval, ohe2cat(y_eval))

    model = xgb.train({**self.params, **self.hyperparams}, dtrain,
                      evals=[(dvalid, 'eval')],
                      num_boost_round=1200,
                      # categorical_feature=categories,
                      early_stopping_rounds=10)
    self.params['num_boost_round'] = model.best_iteration
def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
    if self.is_multi_label:
        y_train = y_train[:, 1]
        y_eval = y_eval[:, 1]
    else:
        y_train = ohe2cat(y_train)
        y_eval = ohe2cat(y_eval)

    model = CatBoostClassifier(**{**self.params, **self.hyperparams})
    model.fit(X_train, y_train,
              eval_set=[(X_eval, y_eval)],
              use_best_model=True,
              verbose=10,
              early_stopping_rounds=20)
    self.params['iterations'] = model.best_iteration_
def epoch_train(self, dataloader, run_num, is_multi_label=None, info=None,
                time_remain=None):
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = (dataloader['X'], dataloader['y'],
                             dataloader['train_idxs'], dataloader['cat_cols'])
    train_x, train_y = X.loc[train_idxs], y[train_idxs]

    if info['mode'] == 'bagging':
        self.hyperparams = info['xgb'].copy()
        self.hyperparams['random_seed'] = np.random.randint(0, 2020)
        run_num = self.explore_params_round

    if run_num == self.explore_params_round:
        print('xgb explore_params_round')
        train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)
        self.import_cols = info['imp_cols']
        # Cap the exploration set at the 300 most important columns and
        # 10k rows to keep the hyperparameter search cheap.
        if train_x.shape[1] > 300 and train_x.shape[0] > 10000:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=10000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 1w samples')
        elif train_x.shape[0] > 10000:
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=10000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 1w samples')
        elif train_x.shape[1] > 300:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
        print('shape: ', train_x.shape)
        self.bayes_opt(train_x, val_x, train_y, val_y, cat)
        self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        info['xgb'] = self.hyperparams.copy()
        # Restore the full training split after exploration.
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

    if run_num == self.all_data_round:
        print('xgb all data round')
        all_train_idxs = dataloader['all_train_idxs']
        train_x = X.loc[all_train_idxs]
        train_y = y[all_train_idxs]

    if not self.is_multi_label:
        xgb_train = xgb.DMatrix(train_x, ohe2cat(train_y))
        self._model = xgb.train({**self.params, **self.hyperparams}, xgb_train)
    else:
        # Multi-label: train one binary model per class.
        for cls in range(self.num_class):
            cls_y = train_y[:, cls]
            xgb_train = xgb.DMatrix(train_x, cls_y)
            self.models[cls] = self._model = xgb.train(
                {**self.params, **self.hyperparams}, xgb_train)
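# A minimal sketch of the split_data helper used above; the real
# implementation is not shown in this section, so this is an assumption:
# a stratified holdout split returning (train_x, train_y, val_x, val_y).
def split_data(self, X, y, ratio=0.2):
    from sklearn.model_selection import StratifiedShuffleSplit
    sss = StratifiedShuffleSplit(n_splits=1, test_size=ratio, random_state=0)
    train_idx, val_idx = next(sss.split(X, ohe2cat(y)))
    return X.iloc[train_idx], y[train_idx], X.iloc[val_idx], y[val_idx]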
def train_valid_split_idxs(self, ratio=0.2):
    sss = StratifiedShuffleSplit(n_splits=5, test_size=ratio, random_state=0)
    idxs = np.arange(len(self.y))
    # Cache all five stratified splits; the first becomes the default
    # train/validation split.
    for i, (train, val) in enumerate(sss.split(idxs, ohe2cat(self.y))):
        self.splits[i] = (train, val)
    self.train_idxs, self.val_idxs = self.splits[0]
    self.m = len(self.train_idxs)
    self.auto_sample = AutoSample(self.y[self.train_idxs])
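# A minimal sketch of the ohe2cat helper used throughout this section; its
# implementation is not shown here, so this is an assumption: it converts
# one-hot labels of shape (n_samples, num_class) into integer class ids of
# shape (n_samples,), the format the stratifier and the boosters expect.
def ohe2cat(label):
    return np.argmax(label, axis=1)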
def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
    if self.is_multi_label:
        y_train = y_train[:, 1]
        y_eval = y_eval[:, 1]
    else:
        y_train = ohe2cat(y_train)

    space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
        "depth": hp.choice("depth", [4, 6, 8, 10, 12]),
        "l2_leaf_reg": hp.uniform("l2_leaf_reg", 0.1, 2),
    }

    def objective(hyperparams):
        # Overlay the sampled hyperparameters on the current ones for this
        # trial; the sampled values must not be discarded, or the search
        # space has no effect.
        trial_hyperparams = self.hyperparams.copy()
        trial_hyperparams.update(hyperparams)
        trial_hyperparams['iterations'] = 300
        model = CatBoostClassifier(**{**self.params, **trial_hyperparams})
        model.fit(X_train, y_train)
        pred = model.predict_proba(X_eval)
        if self.is_multi_label:
            score = roc_auc_score(y_eval, pred[:, 1])
        else:
            score = roc_auc_score(y_eval, pred)
        return {'loss': -score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=15, verbose=1,
                         rstate=np.random.RandomState(1))
    self.hyperparams.update(space_eval(space, best))
    log("auc = {}, hyperparams: {}".format(
        -trials.best_trial['result']['loss'], self.hyperparams))
def epoch_train(self, dataloader, run_num, **kwargs):
    X, y, train_idxs = dataloader['X'], dataloader['y'], dataloader['train_idxs']
    train_x, train_y = X.loc[train_idxs].values, y[train_idxs]
    print('epoch train shape')
    print(train_x.shape)
    epochs = 5
    if not self.is_multi_label:
        train_y = ohe2cat(train_y)
    self._model.fit(train_x, train_y,
                    epochs=epochs,
                    # callbacks=callbacks,
                    # validation_data=(val_x, ohe2cat(val_y)),
                    # validation_split=0.2,
                    verbose=1,  # Logs once per epoch.
                    batch_size=128,
                    shuffle=True,
                    # initial_epoch=self.epoch_cnt,
                    # use_multiprocessing=True
                    )
def epoch_train(self, dataloader, run_num, is_multi_label=None, info=None):
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = (dataloader['X'], dataloader['y'],
                             dataloader['train_idxs'], dataloader['cat_cols'])
    train_x, train_y = X.loc[train_idxs], y[train_idxs]

    if info['mode'] == 'bagging':
        self.hyperparams = info['tgb'].copy()
        self.hyperparams['random_seed'] = np.random.randint(0, 2020)
        run_num = self.explore_params_round

    if run_num == self.explore_params_round:
        print('tgb explore_params_round')
        X, y, val_idxs = dataloader['X'], dataloader['y'], dataloader['val_idxs']
        val_x, val_y = X.loc[val_idxs], y[val_idxs]
        self.bayes_opt(train_x, val_x, train_y, val_y, cat)
        # self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        info['tgb'] = self.hyperparams.copy()

    if run_num == self.all_data_round:
        print('tgb all data round')
        all_train_idxs = dataloader['all_train_idxs']
        train_x = X.loc[all_train_idxs]
        train_y = y[all_train_idxs]

    if self.is_multi_label:
        for cls in range(self.num_class):
            cls_y = train_y[:, cls]
            self.models[cls] = TGBMClassifier(**{**self.params, **self.hyperparams})
            self.models[cls].fit(train_x, cls_y)
    else:
        self._model = TGBMClassifier(**{**self.params, **self.hyperparams})
        self._model.fit(train_x, ohe2cat(train_y))
def epoch_train(self, dataloader, run_num, is_multi_label=False, info=None,
                time_remain=None):
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = (dataloader['X'], dataloader['y'],
                             dataloader['train_idxs'], dataloader['cat_cols'])
    train_x, train_y = X.loc[train_idxs], y[train_idxs]

    if info['mode'] == 'bagging':
        self.hyperparams = info['lgb'].copy()
        self.hyperparams['seed'] = np.random.randint(0, 2020)
        # Perturb num_leaves by up to +/-10% so bagged models differ.
        num_leaves = self.hyperparams['num_leaves']
        self.hyperparams['num_leaves'] += np.random.randint(
            -int(num_leaves / 10), int(num_leaves / 10))
        run_num = 0

    if run_num == self.explore_params_round:
        print('lgb explore_params_round')
        train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)
        self.log_feat_importances()
        # Cap the exploration set at the 300 most important columns and
        # 20k rows to keep the hyperparameter search cheap.
        if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 2w samples')
        elif train_x.shape[0] > 20000:
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
            log('explore_params_round sample 2w samples')
        elif train_x.shape[1] > 300:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            log('explore_params_round sample 300 cols')
        print('shape: ', train_x.shape)
        self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=1)
        self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        info['lgb'] = self.hyperparams.copy()
        info['imp_cols'] = self.import_cols

    if run_num == self.ensemble_num:
        print('lgb ensemble_num')
        splits = dataloader['splits']
        for i in range(len(splits)):
            train_idxs, val_idxs = splits[i]
            train_x, train_y = X.loc[train_idxs], y[train_idxs]
            hyperparams = self.hyperparams.copy()
            # num_leaves = hyperparams['num_leaves']
            # num_leaves += np.random.randint(-int(num_leaves / 10), int(num_leaves / 10))
            # hyperparams['num_leaves'] = num_leaves
            # log('model {} leaves {}'.format(i, num_leaves))
            if self.is_multi_label:
                self.en_models = defaultdict(list)
                for cls in range(self.num_class):
                    cls_y = train_y[:, cls]
                    lgb_train = lgb.Dataset(train_x, cls_y)
                    if not self.learning_rates:
                        self.en_models[i].append(
                            lgb.train({**self.params, **hyperparams},
                                      train_set=lgb_train))
                    else:
                        self.en_models[i].append(
                            lgb.train({**self.params, **hyperparams},
                                      train_set=lgb_train,
                                      learning_rates=self.learning_rates))
            else:
                lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                if not self.learning_rates:
                    self.en_models[i] = lgb.train(
                        {**self.params, **hyperparams}, train_set=lgb_train)
                else:
                    self.en_models[i] = lgb.train(
                        {**self.params, **hyperparams}, train_set=lgb_train,
                        learning_rates=self.learning_rates)
        self.ensemble_pred = True
    else:
        print('lgb norm train')
        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        hyperparams = self.hyperparams.copy()
        log('hyperparams {}'.format(hyperparams))
        if run_num == self.all_data_round_pre or run_num == self.all_data_round:
            print('lgb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]
            print('shape: ', train_x.shape)
        if not is_multi_label:
            lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
            if not self.learning_rates:
                self._model = lgb.train({**self.params, **hyperparams},
                                        train_set=lgb_train)
            else:
                self._model = lgb.train({**self.params, **hyperparams},
                                        train_set=lgb_train,
                                        learning_rates=self.learning_rates)
        else:
            # Multi-label: a binary (two-class) model per class.
            self.params['num_class'] = 2
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                lgb_train = lgb.Dataset(train_x, cls_y)
                if not self.learning_rates:
                    self.models[cls] = lgb.train(
                        {**self.params, **self.hyperparams},
                        train_set=lgb_train)
                else:
                    self.models[cls] = lgb.train(
                        {**self.params, **self.hyperparams},
                        train_set=lgb_train,
                        learning_rates=self.learning_rates)
        self.log_feat_importances()
        if self.imp_nums is not None:
            info['imp_nums'] = self.imp_nums
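# A minimal sketch of the log_feat_importances helper used above; the real
# implementation is not shown in this section, so this is an assumption: it
# ranks features by LightGBM gain importance and stores the ordering in
# self.import_cols, which the column-capping logic above consumes.
def log_feat_importances(self):
    import pandas as pd
    if self._model is None:
        return
    imp = pd.DataFrame({
        'features': self._model.feature_name(),
        'importances': self._model.feature_importance(importance_type='gain'),
    }).sort_values('importances', ascending=False)
    self.import_cols = imp['features'].values
    log('top feature importances: {}'.format(imp.head(10).to_dict('records')))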
def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories, phase=1):
    if self.is_multi_label:
        train_data = lgb.Dataset(X_train, label=y_train[:, 0])
        valid_data = lgb.Dataset(X_eval, label=y_eval[:, 0])
    else:
        y_train = ohe2cat(y_train)
        y_eval = ohe2cat(y_eval)
        train_data = lgb.Dataset(X_train, label=y_train)
        valid_data = lgb.Dataset(X_eval, label=y_eval)

    params = self.params
    if phase == 1:
        # Phase 1: broad search over tree structure and regularization with
        # a small budget of boosting rounds.
        space = {
            "max_depth": hp.choice("max_depth", [-1, 5, 7, 9]),
            "num_leaves": hp.choice("num_leaves", np.linspace(20, 61, 10, dtype=int)),
            "reg_alpha": hp.uniform("reg_alpha", 0, 1),
            "reg_lambda": hp.uniform("reg_lambda", 0, 1),
            "min_child_samples": hp.choice("min_data_in_leaf", np.linspace(10, 120, 10, dtype=int)),
            "min_child_weight": hp.uniform("min_child_weight", 0.01, 1),
            "min_split_gain": hp.uniform("min_split_gain", 0.001, 0.1),
            "colsample_bytree": hp.choice("colsample_bytree", [0.7, 0.9]),
            "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
        }
        tmp_hyperparams = {'num_boost_round': 100}
        max_evals = 20
    else:
        # Phase 2: freeze the structural parameters found in phase 1 and
        # retune only the learning rate with a larger round budget.
        space = {
            "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
        }
        tmp_hyperparams = {}
        update = [
            'max_depth', 'num_leaves', 'reg_alpha', 'reg_lambda',
            'min_data_in_leaf', 'min_child_weight', 'min_split_gain'
        ]
        for p in update:
            tmp_hyperparams[p] = self.hyperparams[p]
        tmp_hyperparams['num_boost_round'] = 500
        max_evals = 5

    def objective(hyperparams):
        tmp_hyperparams.update(hyperparams)
        model = lgb.train({**params, **tmp_hyperparams},
                          train_set=train_data,
                          valid_sets=valid_data,
                          # categorical_feature=categories,
                          early_stopping_rounds=18,
                          verbose_eval=0)
        score = model.best_score["valid_0"][params["metric"]]
        # In classification this metric is a loss, so smaller is better.
        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=max_evals, verbose=1,
                         rstate=np.random.RandomState(1))
    self.hyperparams.update(space_eval(space, best))
def epoch_train(self, dataloader, run_num, **kwargs):
    if self.train_gen is None:
        X, y, cats = dataloader['X'], dataloader['y'], dataloader['cat_cols']
        train_idxs, val_idxs, test_idxs = (dataloader['train_idxs'],
                                           dataloader['val_idxs'],
                                           dataloader['test_idxs'])
        train_x = X.loc[train_idxs]
        train_y = ohe2cat(y[train_idxs]).reshape(len(train_idxs), 1)
        val_x = X.loc[val_idxs]
        val_y = ohe2cat(y[val_idxs]).reshape(len(val_idxs), 1)
        test_x = X.loc[test_idxs]
        train_x.reset_index(drop=True, inplace=True)
        val_x.reset_index(drop=True, inplace=True)
        test_x.reset_index(drop=True, inplace=True)

        self.train_gen = DataLoader(DataGen(train_x, train_y, cats, mode='train'),
                                    batch_size=32, shuffle=True, num_workers=4)
        self.val_gen = DataLoader(DataGen(val_x, None, cats, mode='val'),
                                  batch_size=100, shuffle=False, num_workers=4)
        self.test_gen = DataLoader(DataGen(test_x, None, cats, mode='test'),
                                   batch_size=100, shuffle=False, num_workers=4)

        # One embedding of width 4 per categorical column; the remaining
        # columns are treated as continuous inputs.
        emb_szs = [[X[col].nunique(), 4] for col in cats]
        n_cont = X.shape[1] - len(cats)
        print('input len', 4 * len(emb_szs) + n_cont)
        out_sz = self.num_classes
        layers = [500, 500]
        self.model = TabularModel(emb_szs, n_cont, out_sz, layers).to(self.device)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.001,
                                   momentum=0.9)

    epochs = 10
    for epoch in range(epochs):
        running_loss = 0.0
        for i, data in enumerate(self.train_gen, 0):
            cat_feats = data[0].to(self.device)
            num_feats = data[1].to(self.device)
            labels = data[2].to(self.device)
            self.optimizer.zero_grad()
            preds = self.model(cat_feats, num_feats)
            loss = self.criterion(preds, labels.squeeze())
            loss.backward()
            self.optimizer.step()
            running_loss += loss.item()
            if i % 100 == 99:  # Print every 100 mini-batches.
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0
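# A minimal sketch of the DataGen dataset used above; the real class is not
# shown in this section, so this is an assumption: it splits each row into a
# tensor of categorical codes and a tensor of continuous values, plus the
# label when one is provided.
import torch
from torch.utils.data import Dataset

class DataGen(Dataset):
    def __init__(self, X, y, cats, mode='train'):
        self.cat_vals = X[cats].values.astype('int64')
        self.num_vals = X.drop(columns=cats).values.astype('float32')
        self.y = y
        self.mode = mode

    def __len__(self):
        return len(self.cat_vals)

    def __getitem__(self, idx):
        cat = torch.from_numpy(self.cat_vals[idx])
        num = torch.from_numpy(self.num_vals[idx])
        if self.y is None:
            return cat, num
        return cat, num, torch.from_numpy(self.y[idx].astype('int64'))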
def epoch_train(self, dataloader, run_num):
    X, y, train_idxs = dataloader['X'], dataloader['y'], dataloader['train_idxs']
    train_x, train_y = X.loc[train_idxs], y[train_idxs]
    self._model.fit(train_x, ohe2cat(train_y))
def epoch_train(self, dataloader, run_num, is_multi_label=None, info=None,
                time_remain=None):
    self.is_multi_label = is_multi_label
    X, y, train_idxs, cat = (dataloader['X'], dataloader['y'],
                             dataloader['train_idxs'], dataloader['cat_cols'])
    train_x, train_y = X.loc[train_idxs], y[train_idxs]

    if info['mode'] == 'bagging':
        self.hyperparams = info['cb'].copy()
        self.hyperparams['random_seed'] = np.random.randint(0, 2020)
        run_num = self.explore_params_round

    if run_num == self.explore_params_round:
        train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)
        self.import_cols = info['imp_cols']
        # Cap the exploration set at the 300 most important columns and
        # 20k rows to keep the hyperparameter search cheap.
        if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
        elif train_x.shape[0] > 20000:
            train_x.reset_index(drop=True, inplace=True)
            train_x = train_x.sample(n=20000)
            train_y = train_y[list(train_x.index)]
        elif train_x.shape[1] > 300:
            train_x = train_x[self.import_cols[:300]]
            val_x = val_x[self.import_cols[:300]]
        self.bayes_opt(train_x, val_x, train_y, val_y, cat)
        self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
        info['cb'] = self.hyperparams.copy()
        # Restore the full training split after exploration.
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

    if run_num == self.all_data_round:
        all_train_idxs = dataloader['all_train_idxs']
        train_x = X.loc[all_train_idxs]
        train_y = y[all_train_idxs]

    if self.is_multi_label:
        # Multi-label: train one binary classifier per class.
        for cls in range(self.num_class):
            cls_y = train_y[:, cls]
            self.models[cls] = CatBoostClassifier(**{**self.params, **self.hyperparams})
            self.models[cls].fit(train_x, cls_y)
    else:
        self._model = CatBoostClassifier(**{**self.params, **self.hyperparams})
        self._model.fit(train_x, ohe2cat(train_y))