Example 1: LightGBM early_stop_opt
    def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            lgb_train = lgb.Dataset(X_train, y_train[:, 0])
            lgb_eval = lgb.Dataset(X_eval, y_eval[:, 0], reference=lgb_train)
        else:
            y_train = ohe2cat(y_train)
            y_eval = ohe2cat(y_eval)
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

        self.hyperparams['num_boost_round'] = 1000
        tmp_lr = self.hyperparams.pop('learning_rate')
        self.learning_rates = self.get_log_lr(1000, tmp_lr, tmp_lr * 0.6)

        self._model = lgb.train({
            **self.params,
            **self.hyperparams
        },
                                verbose_eval=20,
                                train_set=lgb_train,
                                valid_sets=[lgb_eval],
                                valid_names=['eval'],
                                early_stopping_rounds=20,
                                learning_rates=self.learning_rates
                                )  # categorical_feature=categories

        self.hyperparams['num_boost_round'] = self._model.best_iteration
        self.learning_rates = self.learning_rates[:self._model.best_iteration]
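
Note: get_log_lr is not defined in any of these snippets. Below is a minimal sketch consistent with the call above (one learning rate per boosting round, decaying log-linearly from lr down to 0.6*lr over 1000 rounds); the name and the exact shape of the schedule are assumptions:

    import numpy as np

    def get_log_lr(num_rounds, start_lr, end_lr):
        # One value per boosting round, decaying log-linearly
        # from start_lr to end_lr (hypothetical reconstruction).
        return list(np.logspace(np.log10(start_lr), np.log10(end_lr),
                                num_rounds))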
Example 2: XGBoost bayes_opt (hyperopt TPE)
    def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            dtrain = xgb.DMatrix(X_train, y_train[:, 1])
            dvalid = xgb.DMatrix(X_eval, y_eval[:, 1])
        else:
            dtrain = xgb.DMatrix(X_train, ohe2cat(y_train))
            dvalid = xgb.DMatrix(X_eval, ohe2cat(y_eval))
        space = {
            "learning_rate":
            hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
            "max_depth":
            hp.choice("max_depth", [4, 6, 8, 10, 12]),
            "min_child_weight":
            hp.uniform('min_child_weight', 0.01, 1),
            "min_data_in_leaf":
            hp.choice("min_data_in_leaf", np.linspace(10, 100, 20, dtype=int)),
            "gamma":
            hp.uniform("gamma", 0.001, 0.1),
            "lambda":
            hp.uniform("lambda", 0, 1),
            "alpha":
            hp.uniform("alpha", 0, 1),
            "colsample_bytree":
            hp.choice("colsample_bytree", [0.7, 0.9]),
            "colsample_bylevel":
            hp.choice("colsample_bylevel", [0.7, 0.9]),
            "colsample_bynode":
            hp.choice("colsample_bynode", [0.7, 0.9]),
        }

        def objective(hyperparams):
            model = xgb.train({
                **self.params,
                **hyperparams
            },
                              dtrain,
                              num_boost_round=50)

            pred = model.predict(dvalid)
            if self.is_multi_label:
                score = roc_auc_score(y_eval[:, 1], pred[:, 1])
            else:
                score = roc_auc_score(y_eval, pred)

            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = hyperopt.fmin(fn=objective,
                             space=space,
                             trials=trials,
                             algo=tpe.suggest,
                             max_evals=10,
                             verbose=1,
                             rstate=np.random.RandomState(1))

        self.hyperparams.update(space_eval(space, best))
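
Note: every snippet leans on an ohe2cat helper that is never shown; from its call sites it converts one-hot targets into integer class labels. A minimal sketch under that assumption:

    import numpy as np

    def ohe2cat(y):
        # One-hot matrix (n_samples, n_classes) -> labels (n_samples,).
        return np.argmax(y, axis=1)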
Example 3: XGBoost early_stop_opt
    def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            dtrain = xgb.DMatrix(X_train, y_train[:, 1])
            dvalid = xgb.DMatrix(X_eval, y_eval[:, 1])
        else:
            dtrain = xgb.DMatrix(X_train, ohe2cat(y_train))
            dvalid = xgb.DMatrix(X_eval, ohe2cat(y_eval))

        model = xgb.train({**self.params, **self.hyperparams},
                          dtrain,
                          evals=[(dvalid, 'eval')],
                          num_boost_round=1200,
                          early_stopping_rounds=10)  # categorical_feature=categories

        self.params['num_boost_round'] = model.best_iteration
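
Caveat: xgb.train reads the round count from its num_boost_round keyword, not from the params dict, so the value stored above has to be passed explicitly when retraining. A sketch (the fallback of 100 is an assumption):

    params = {**self.params, **self.hyperparams}
    n_rounds = params.pop('num_boost_round', 100)  # fallback is assumed
    model = xgb.train(params, dtrain, num_boost_round=n_rounds)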
Example 4: CatBoost early_stop_opt
    def early_stop_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            y_train = y_train[:, 1]
            y_eval = y_eval[:, 1]
        else:
            y_train = ohe2cat(y_train)
            y_eval = ohe2cat(y_eval)

        model = CatBoostClassifier(**{**self.params, **self.hyperparams})
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_eval, y_eval)],
                  use_best_model=True,
                  verbose=10,
                  early_stopping_rounds=20)

        self.params['iterations'] = model.best_iteration_
Example 5: XGBoost epoch_train
    def epoch_train(self, dataloader, run_num, is_multi_label=None, info=None, time_remain=None):
        self.is_multi_label = is_multi_label
        X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader['train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            self.hyperparams = info['xgb'].copy()
            self.hyperparams['seed'] = np.random.randint(0, 2020)  # xgboost's seed param
            run_num = self.explore_params_round

        if run_num == self.explore_params_round:
            print('xgb explore_params_round')
            train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)

            self.import_cols = info['imp_cols']

            if train_x.shape[1] > 300 and train_x.shape[0] > 10000:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round: kept top 300 cols')
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=10000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round: sampled 10k rows')

            elif train_x.shape[0] > 10000:
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=10000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round: sampled 10k rows')

            elif train_x.shape[1] > 300:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round: kept top 300 cols')

            print('shape: ', train_x.shape)
            self.bayes_opt(train_x, val_x, train_y, val_y, cat)
            self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
            info['xgb'] = self.hyperparams.copy()

        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        if run_num == self.all_data_round:
            print('xgb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]
        if not self.is_multi_label:
            xgb_train = xgb.DMatrix(train_x, ohe2cat(train_y))
            self._model = xgb.train({**self.params, **self.hyperparams}, xgb_train)
        else:
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                xgb_train = xgb.DMatrix(train_x, cls_y)
                self.models[cls] = self._model = xgb.train({**self.params, **self.hyperparams}, xgb_train)
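
Note: split_data is another helper that never appears in these snippets; the unpacking at its call sites implies it returns (train_x, train_y, val_x, val_y). A plausible sketch built on a plain hold-out split (the ratio and random_state are assumptions):

    from sklearn.model_selection import train_test_split

    def split_data(self, X, y, ratio=0.2):
        # Hypothetical helper matching the call sites above.
        X_tr, X_val, y_tr, y_val = train_test_split(
            X, y, test_size=ratio, random_state=0)
        return X_tr, y_tr, X_val, y_val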
Example 6: stratified train/valid split indices
    def train_valid_split_idxs(self, ratio=0.2):
        sss = StratifiedShuffleSplit(n_splits=5, test_size=ratio, random_state=0)
        idxs = np.arange(len(self.y))
        i = 0
        for train, val in sss.split(idxs, ohe2cat(self.y)):
            self.splits[i] = (train, val)
            i += 1

        self.train_idxs, self.val_idxs = self.splits[0]

        self.m = len(self.train_idxs)
        self.auto_sample = AutoSample(self.y[self.train_idxs])
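
StratifiedShuffleSplit draws n_splits independent stratified resamples rather than disjoint folds, which is presumably why only splits[0] is taken as the working train/val pair while the others are kept for ensembling. A standalone illustration:

    import numpy as np
    from sklearn.model_selection import StratifiedShuffleSplit

    y = np.array([0, 0, 1, 1, 0, 1, 0, 1])
    sss = StratifiedShuffleSplit(n_splits=3, test_size=0.25, random_state=0)
    for i, (train, val) in enumerate(sss.split(np.zeros(len(y)), y)):
        print(i, train, val)  # resamples may overlap across iterations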
Example 7: CatBoost bayes_opt (hyperopt TPE)
    def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories):
        if self.is_multi_label:
            y_train = y_train[:, 1]
            y_eval = y_eval[:, 1]
        else:
            y_train = ohe2cat(y_train)
            # y_eval stays one-hot so roc_auc_score below can score the
            # full predict_proba matrix in multilabel-indicator mode

        space = {
            "learning_rate":
            hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
            "depth":
            hp.choice("depth", [4, 6, 8, 10, 12]),
            "l2_leaf_reg":
            hp.uniform('l2_leaf_reg', 0.1, 2),
        }

        def objective(hyperparams):
            # fold the sampled values into the current defaults
            trial_params = self.hyperparams.copy()
            trial_params.update(hyperparams)
            trial_params['iterations'] = 300
            model = CatBoostClassifier(**{**self.params, **trial_params})
            model.fit(X_train, y_train)
            pred = model.predict_proba(X_eval)

            if self.is_multi_label:
                score = roc_auc_score(y_eval, pred[:, 1])
            else:
                score = roc_auc_score(y_eval, pred)

            return {'loss': -score, 'status': STATUS_OK}

        trials = Trials()
        best = hyperopt.fmin(fn=objective,
                             space=space,
                             trials=trials,
                             algo=tpe.suggest,
                             max_evals=15,
                             verbose=1,
                             rstate=np.random.RandomState(1))

        self.hyperparams.update(space_eval(space, best))
        log("auc = {}, hyperparams: {}".format(
            -trials.best_trial['result']['loss'], self.hyperparams))
Example 8: Keras-style epoch_train
    def epoch_train(self, dataloader, run_num, **kwargs):
        X, y, train_idxs = dataloader['X'], dataloader['y'], dataloader['train_idxs']
        train_x, train_y = X.loc[train_idxs].values, y[train_idxs]
        print('epoch train shape')
        print(train_x.shape)

        epochs = 5

        if not self.is_multi_label:
            train_y = ohe2cat(train_y)
        self._model.fit(train_x, train_y,
                        epochs=epochs,
                        #callbacks=callbacks,
                        #validation_data=(val_x, ohe2cat(val_y)),
                        # validation_split=0.2,
                        verbose=1,  # Logs once per epoch.
                        batch_size=128,
                        shuffle=True,
                        # initial_epoch=self.epoch_cnt,
                        # use_multiprocessing=True
                        )
Example 9: TGBMClassifier epoch_train
    def epoch_train(self, dataloader, run_num, is_multi_label=None, info=None):
        self.is_multi_label = is_multi_label
        X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
            'train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            self.hyperparams = info['tgb'].copy()
            self.hyperparams['random_seed'] = np.random.randint(0, 2020)
            run_num = self.explore_params_round

        if run_num == self.explore_params_round:
            print('tgb explore_params_round')
            X, y, val_idxs = dataloader['X'], dataloader['y'], dataloader[
                'val_idxs']
            val_x, val_y = X.loc[val_idxs], y[val_idxs]

            self.bayes_opt(train_x, val_x, train_y, val_y, cat)
            #self.early_stop_opt(train_x, val_x, train_y, val_y, cat)

            info['tgb'] = self.hyperparams.copy()

        if run_num == self.all_data_round:
            print('tgb all data round')
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]

        if self.is_multi_label:
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                self.models[cls] = TGBMClassifier(**{
                    **self.params,
                    **self.hyperparams
                })
                self.models[cls].fit(train_x, cls_y)
        else:
            self._model = TGBMClassifier(**{**self.params, **self.hyperparams})
            self._model.fit(train_x, ohe2cat(train_y))
Example 10: LightGBM epoch_train
    def epoch_train(self,
                    dataloader,
                    run_num,
                    is_multi_label=False,
                    info=None,
                    time_remain=None):
        self.is_multi_label = is_multi_label
        X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
            'train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            self.hyperparams = info['lgb'].copy()
            self.hyperparams['seed'] = np.random.randint(0, 2020)
            num_leaves = self.hyperparams['num_leaves']
            self.hyperparams['num_leaves'] += np.random.randint(
                -int(num_leaves / 10), int(num_leaves / 10))
            run_num = 0

        if run_num == self.explore_params_round:
            print('lgb explore_params_round')
            train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)

            self.log_feat_importances()

            if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round: kept top 300 cols')
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round: sampled 20k rows')

            elif train_x.shape[0] > 20000:
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]
                log('explore_params_round: sampled 20k rows')

            elif train_x.shape[1] > 300:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                log('explore_params_round: kept top 300 cols')

            print('shape: ', train_x.shape)

            self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=1)
            self.early_stop_opt(train_x, val_x, train_y, val_y, cat)
            info['lgb'] = self.hyperparams.copy()
            info['imp_cols'] = self.import_cols

        if run_num == self.ensemble_num:
            print('lgb ensemble_num')
            splits = dataloader['splits']
            # initialised once, before the split loop, so models from
            # earlier splits are retained
            self.en_models = defaultdict(list)
            for i in range(len(splits)):
                train_idxs, val_idxs = splits[i]
                train_x, train_y = X.loc[train_idxs], y[train_idxs]
                hyperparams = self.hyperparams.copy()
                # num_leaves = hyperparams['num_leaves']
                # num_leaves += np.random.randint(-int(num_leaves/10), int(num_leaves/10))
                # hyperparams['num_leaves'] = num_leaves
                # log('model {} leaves {}'.format(i, num_leaves))
                if self.is_multi_label:
                    for cls in range(self.num_class):
                        cls_y = train_y[:, cls]
                        lgb_train = lgb.Dataset(train_x, cls_y)
                        if not self.learning_rates:
                            self.en_models[i].append(
                                lgb.train({
                                    **self.params,
                                    **hyperparams
                                },
                                          train_set=lgb_train))
                        else:
                            self.en_models[i].append(
                                lgb.train({
                                    **self.params,
                                    **hyperparams
                                },
                                          train_set=lgb_train,
                                          learning_rates=self.learning_rates))
                else:
                    lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                    if not self.learning_rates:
                        self.en_models[i] = lgb.train(
                            {
                                **self.params,
                                **hyperparams
                            },
                            train_set=lgb_train)
                    else:
                        self.en_models[i] = lgb.train(
                            {
                                **self.params,
                                **hyperparams
                            },
                            train_set=lgb_train,
                            learning_rates=self.learning_rates)
                self.ensemble_pred = True

        else:
            print('lgb norm train')
            train_x, train_y = X.loc[train_idxs], y[train_idxs]
            hyperparams = self.hyperparams.copy()
            log('hyperparams {}'.format(hyperparams))
            if run_num == self.all_data_round_pre or run_num == self.all_data_round:
                print('lgb all data round')
                all_train_idxs = dataloader['all_train_idxs']
                train_x = X.loc[all_train_idxs]
                train_y = y[all_train_idxs]
            print('shape: ', train_x.shape)
            if not is_multi_label:
                lgb_train = lgb.Dataset(train_x, ohe2cat(train_y))
                if not self.learning_rates:
                    self._model = lgb.train({
                        **self.params,
                        **hyperparams
                    },
                                            train_set=lgb_train)
                else:
                    self._model = lgb.train({
                        **self.params,
                        **hyperparams
                    },
                                            train_set=lgb_train,
                                            learning_rates=self.learning_rates)
            else:
                self.params['num_class'] = 2
                for cls in range(self.num_class):
                    cls_y = train_y[:, cls]
                    lgb_train = lgb.Dataset(train_x, cls_y)
                    if not self.learning_rates:
                        self.models[cls] = lgb.train(
                            {
                                **self.params,
                                **self.hyperparams
                            },
                            train_set=lgb_train)
                    else:
                        self.models[cls] = lgb.train(
                            {
                                **self.params,
                                **self.hyperparams
                            },
                            train_set=lgb_train,
                            learning_rates=self.learning_rates)
            self.log_feat_importances()
            if self.imp_nums is not None:
                info['imp_nums'] = self.imp_nums
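
The multi-label branches above keep one booster per label in self.models, so inference has to stitch the per-label scores back together. A hypothetical sketch, assuming test features X_test and that each booster was trained with num_class = 2 as in the final branch, so predict returns two columns per row:

    import numpy as np

    # Positive-class probability from each per-label booster.
    preds = np.stack([self.models[cls].predict(X_test)[:, 1]
                      for cls in range(self.num_class)], axis=1)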
Example 11: LightGBM bayes_opt (two-phase)
    def bayes_opt(self, X_train, X_eval, y_train, y_eval, categories, phase=1):
        if self.is_multi_label:
            train_data = lgb.Dataset(X_train, label=y_train[:, 0])
            valid_data = lgb.Dataset(X_eval, label=y_eval[:, 0])
        else:
            y_train = ohe2cat(y_train)
            y_eval = ohe2cat(y_eval)
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_eval, label=y_eval)

        params = self.params

        if phase == 1:
            space = {
                'max_depth':
                hp.choice("max_depth", [-1, 5, 7, 9]),
                "num_leaves":
                hp.choice("num_leaves", np.linspace(20, 61, 10, dtype=int)),
                "reg_alpha":
                hp.uniform("reg_alpha", 0, 1),
                "reg_lambda":
                hp.uniform("reg_lambda", 0, 1),
                "min_child_samples":
                hp.choice("min_data_in_leaf",
                          np.linspace(10, 120, 10, dtype=int)),
                "min_child_weight":
                hp.uniform('min_child_weight', 0.01, 1),
                "min_split_gain":
                hp.uniform('min_split_gain', 0.001, 0.1),
                'colsample_bytree':
                hp.choice("colsample_bytree", [0.7, 0.9]),
                "learning_rate":
                hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
            }
            tmp_hyperparams = {}
            tmp_hyperparams['num_boost_round'] = 100
            max_evals = 20

        else:
            space = {
                "learning_rate":
                hp.loguniform("learning_rate", np.log(0.01), np.log(0.1)),
            }
            tmp_hyperparams = {}
            update = [
                'max_depth', 'num_leaves', 'reg_alpha', 'reg_lambda',
                'min_child_samples', 'min_child_weight', 'min_split_gain'
            ]

            for p in update:
                tmp_hyperparams[p] = self.hyperparams[p]

            tmp_hyperparams['num_boost_round'] = 500
            max_evals = 5

        def objective(hyperparams):
            tmp_hyperparams.update(hyperparams)
            model = lgb.train(
                {
                    **params,
                    **tmp_hyperparams
                },
                train_set=train_data,
                valid_sets=valid_data,
                #categorical_feature=categories,
                early_stopping_rounds=18,
                verbose_eval=0)

            score = model.best_score["valid_0"][params["metric"]]

            # hyperopt minimizes, and the tracked metric here is a loss,
            # so the validation score is returned directly
            return {'loss': score, 'status': STATUS_OK}

        trials = Trials()
        best = hyperopt.fmin(fn=objective,
                             space=space,
                             trials=trials,
                             algo=tpe.suggest,
                             max_evals=max_evals,
                             verbose=1,
                             rstate=np.random.RandomState(1))
        self.hyperparams.update(space_eval(space, best))
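
The phase flag splits the search in two: phase 1 explores tree structure with short 100-round fits, then phase 2 freezes the structural winners and re-tunes only learning_rate against longer 500-round fits. A typical call sequence (sketch):

    self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=1)  # structure
    self.bayes_opt(train_x, val_x, train_y, val_y, cat, phase=2)  # refine lr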
Example 12: PyTorch tabular epoch_train
    def epoch_train(self, dataloader, run_num, **kwargs):
        if self.train_gen is None:
            X, y, cats = dataloader['X'], dataloader['y'], dataloader[
                'cat_cols']
            train_idxs, val_idxs, test_idxs = dataloader[
                'train_idxs'], dataloader['val_idxs'], dataloader['test_idxs']

            train_x, train_y = X.loc[train_idxs], ohe2cat(
                y[train_idxs]).reshape(len(train_idxs), 1)
            val_x, val_y = X.loc[val_idxs], ohe2cat(y[val_idxs]).reshape(
                len(val_idxs), 1)
            test_x = X.loc[test_idxs]

            train_x.reset_index(drop=True, inplace=True)
            val_x.reset_index(drop=True, inplace=True)
            test_x.reset_index(drop=True, inplace=True)

            self.train_gen = DataLoader(DataGen(train_x,
                                                train_y,
                                                cats,
                                                mode='train'),
                                        batch_size=32,
                                        shuffle=True,
                                        num_workers=4)

            self.val_gen = DataLoader(DataGen(val_x, None, cats, mode='val'),
                                      batch_size=100,
                                      shuffle=False,
                                      num_workers=4)

            self.test_gen = DataLoader(DataGen(test_x, None, cats,
                                               mode='test'),
                                       batch_size=100,
                                       shuffle=False,
                                       num_workers=4)

            emb_szs = [[X[col].nunique(), 4] for col in cats]
            n_cont = X.shape[1] - len(cats)
            print('input len', 4 * len(emb_szs) + n_cont)
            out_sz = self.num_classes
            layers = [500, 500]
            self.model = TabularModel(emb_szs, n_cont, out_sz,
                                      layers).to(self.device)

            self.criterion = nn.CrossEntropyLoss()
            self.optimizer = optim.SGD(self.model.parameters(),
                                       lr=0.001,
                                       momentum=0.9)

        epochs = 10
        for epoch in range(epochs):
            running_loss = 0.0
            for i, data in enumerate(self.train_gen, 0):
                cat_feats, num_feats, labels = data[0].to(
                    self.device), data[1].to(self.device), data[2].to(
                        self.device)
                self.optimizer.zero_grad()
                preds = self.model(cat_feats, num_feats)

                loss = self.criterion(preds, labels.squeeze())
                loss.backward()
                self.optimizer.step()

                running_loss += loss.item()

                if i % 100 == 99:  # print every 100 mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 100))
                    running_loss = 0.0
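
Note: DataGen is not shown; the loop above implies it yields (categorical features, numeric features, label) triples in train mode and drops the label otherwise. A minimal torch Dataset under that assumption (hypothetical reconstruction):

    import torch
    from torch.utils.data import Dataset

    class DataGen(Dataset):
        def __init__(self, X, y, cats, mode='train'):
            # Categorical columns are assumed to be integer-encoded already.
            self.cat = torch.as_tensor(X[cats].values, dtype=torch.long)
            num_cols = [c for c in X.columns if c not in cats]
            self.num = torch.as_tensor(X[num_cols].values,
                                       dtype=torch.float32)
            self.y = None if y is None else torch.as_tensor(
                y, dtype=torch.long)

        def __len__(self):
            return len(self.cat)

        def __getitem__(self, idx):
            if self.y is None:
                return self.cat[idx], self.num[idx]
            return self.cat[idx], self.num[idx], self.y[idx]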
Example 13: generic epoch_train
    def epoch_train(self, dataloader, run_num):
        X, y, train_idxs = dataloader['X'], dataloader['y'], dataloader[
            'train_idxs']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        self._model.fit(train_x, ohe2cat(train_y))
Example 14: CatBoost epoch_train
    def epoch_train(self,
                    dataloader,
                    run_num,
                    is_multi_label=None,
                    info=None,
                    time_remain=None):
        self.is_multi_label = is_multi_label
        X, y, train_idxs, cat = dataloader['X'], dataloader['y'], dataloader[
            'train_idxs'], dataloader['cat_cols']
        train_x, train_y = X.loc[train_idxs], y[train_idxs]

        if info['mode'] == 'bagging':
            self.hyperparams = info['cb'].copy()
            self.hyperparams['random_seed'] = np.random.randint(0, 2020)
            run_num = self.explore_params_round

        if run_num == self.explore_params_round:
            train_x, train_y, val_x, val_y = self.split_data(train_x, train_y)

            self.import_cols = info['imp_cols']

            if train_x.shape[1] > 300 and train_x.shape[0] > 20000:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]

            elif train_x.shape[0] > 20000:
                train_x.reset_index(drop=True, inplace=True)
                train_x = train_x.sample(n=20000)
                train_y = train_y[list(train_x.index)]

            elif train_x.shape[1] > 300:
                train_x = train_x[self.import_cols[:300]]
                val_x = val_x[self.import_cols[:300]]

            self.bayes_opt(train_x, val_x, train_y, val_y, cat)
            self.early_stop_opt(train_x, val_x, train_y, val_y, cat)

            info['cb'] = self.hyperparams.copy()

        train_x, train_y = X.loc[train_idxs], y[train_idxs]
        if run_num == self.all_data_round:
            all_train_idxs = dataloader['all_train_idxs']
            train_x = X.loc[all_train_idxs]
            train_y = y[all_train_idxs]

        if self.is_multi_label:
            for cls in range(self.num_class):
                cls_y = train_y[:, cls]
                self.models[cls] = CatBoostClassifier(**{
                    **self.params,
                    **self.hyperparams
                })
                self.models[cls].fit(train_x, cls_y)
        else:
            self._model = CatBoostClassifier(**{
                **self.params,
                **self.hyperparams
            })
            self._model.fit(train_x, ohe2cat(train_y))