Example #1
    def _xgboost(self):

        params = self.params

        if self.problem_type == 'regression':
            model = XGBRegressor(**params)

            model.fit(self.training_data.drop(["TARGET"], axis=1),
                      self.training_data['TARGET'])

            preds = model.predict(self.validation_data.drop(['TARGET'],
                                                            axis=1))

            return model, preds, self.validation_data['TARGET']

        elif self.problem_type == 'classification':
            model = XGBClassifier(**params)

            model.fit(self.training_data.drop(["TARGET"], axis=1),
                      self.training_data['TARGET'])

            preds = model.predict_proba(
                self.validation_data.drop(['TARGET'], axis=1))[:, 1]

            return model, preds, self.validation_data['TARGET']

        elif self.problem_type == 'multiclass':
            model = XGBClassifier(**params)

            model.fit(self.training_data.drop(["TARGET"], axis=1),
                      self.training_data['TARGET'])

            preds = model.predict_proba(
                self.validation_data.drop(['TARGET'], axis=1))
            preds = np.argmax(preds, axis=1)  # index of the highest-probability class
            return model, preds, self.validation_data['TARGET']

        else:
            raise ValueError("Problem type '%s' is not supported" % self.problem_type)
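A minimal, self-contained sketch of the same train/validate pattern, shown for the regression branch (the synthetic data and the `params` values are illustrative assumptions, not from the source):

import numpy as np
import pandas as pd
from xgboost import XGBRegressor

rng = np.random.default_rng(0)
train = pd.DataFrame(rng.normal(size=(200, 4)), columns=list("abcd"))
train["TARGET"] = 2 * train["a"] + rng.normal(size=200)
valid = pd.DataFrame(rng.normal(size=(50, 4)), columns=list("abcd"))
valid["TARGET"] = 2 * valid["a"] + rng.normal(size=50)

params = {"max_depth": 3, "n_estimators": 50}  # placeholder hyperparameters
model = XGBRegressor(**params)
model.fit(train.drop(["TARGET"], axis=1), train["TARGET"])
preds = model.predict(valid.drop(["TARGET"], axis=1))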
Example #2
class XGboost:
    """docstring for XGboost"""
    def __init__(self, model_configs, task_type='Regression'):
        self.model_configs = model_configs

        self.max_depth = model_configs['max_depth']
        self.learning_rate = model_configs['learning_rate']
        self.n_estimators = model_configs['n_estimators']
        self.objective = model_configs['objective']
        self.booster = model_configs['booster']
        self.subsample = model_configs['subsample']
        self.colsample_bylevel = model_configs['colsample_bylevel']
        self.colsample_bytree = model_configs['colsample_bytree']
        self.min_child_weight = model_configs['min_child_weight']
        self.reg_alpha = model_configs['reg_alpha']
        self.reg_lambda = model_configs['reg_lambda']
        self.scale_pos_weight = model_configs['scale_pos_weight']
        self.max_delta_step = model_configs['max_delta_step']

        self.random_seed = model_configs['random_seed']
        self.eval_metric = model_configs['early_stopping']['eval_metric']
        self.early_stopping_round = model_configs['early_stopping']['round']

        self.task_type = task_type
        np.random.seed(seed=self.random_seed)

        self.setup_model()

    def setup_model(self):
        if self.task_type == 'Classification':
            self.model = XGBClassifier(
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                n_estimators=self.n_estimators,
                objective=self.objective,
                booster=self.booster,
                subsample=self.subsample,
                colsample_bylevel=self.colsample_bylevel,
                colsample_bytree=self.colsample_bytree,
                min_child_weight=self.min_child_weight,
                reg_alpha=self.reg_alpha,
                reg_lambda=self.reg_lambda,
                scale_pos_weight=self.scale_pos_weight,
                max_delta_step=self.max_delta_step,
                random_state=self.random_seed,
                silent=False,  # deprecated in newer xgboost releases; use `verbosity` there
                n_jobs=8)

        elif self.task_type == 'Regression':
            self.model = XGBRegressor(max_depth=self.max_depth,
                                      learning_rate=self.learning_rate,
                                      n_estimators=self.n_estimators,
                                      objective=self.objective,
                                      booster=self.booster,
                                      subsample=self.subsample,
                                      colsample_bylevel=self.colsample_bylevel,
                                      colsample_bytree=self.colsample_bytree,
                                      min_child_weight=self.min_child_weight,
                                      reg_alpha=self.reg_alpha,
                                      reg_lambda=self.reg_lambda,
                                      scale_pos_weight=self.scale_pos_weight,
                                      random_state=self.random_seed,
                                      silent=False,  # deprecated in newer xgboost releases
                                      n_jobs=8)

        else:
            # Raising a string is a TypeError in Python 3; raise a proper exception
            raise ValueError("Unsupported task type: %s" % self.task_type)

    def fit_model(self, train_loader):
        self.model.fit(train_loader[0], train_loader[1])

    def predict(self, test_loader):
        if self.task_type == 'Classification':
            return self.model.predict_proba(test_loader[0])
        else:
            return self.model.predict(test_loader[0])
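A hedged usage sketch for the wrapper above (the config values and data are placeholders, and it assumes an xgboost version compatible with the `silent` argument):

import numpy as np

model_configs = {
    'max_depth': 4, 'learning_rate': 0.05, 'n_estimators': 100,
    'objective': 'binary:logistic', 'booster': 'gbtree',
    'subsample': 0.8, 'colsample_bylevel': 1.0, 'colsample_bytree': 0.8,
    'min_child_weight': 1, 'reg_alpha': 0.0, 'reg_lambda': 1.0,
    'scale_pos_weight': 1.0, 'max_delta_step': 0, 'random_seed': 42,
    'early_stopping': {'eval_metric': 'auc', 'round': 20},
}

X = np.random.rand(100, 5)
y = (X[:, 0] > 0.5).astype(int)

clf = XGboost(model_configs, task_type='Classification')
clf.fit_model((X, y))        # train_loader is a (features, labels) pair
proba = clf.predict((X, y))  # returns per-class probabilities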
Example #3
class Stacking(BaseEnsembleModel):
    def __init__(self,
                 stats,
                 ensemble_size: int,
                 task_type: int,
                 metric: _BaseScorer,
                 output_dir=None,
                 meta_learner='xgboost',
                 kfold=5):
        super().__init__(stats=stats,
                         ensemble_method='stacking',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)

        self.kfold = kfold
        try:
            from xgboost import XGBClassifier
        except ImportError:
            warnings.warn(
                "xgboost is not installed! Stacking will use a linear meta-learner instead!"
            )
            meta_learner = 'linear'

        # We use XGBoost as the default meta-learner
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=150)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(max_depth=4,
                                                 learning_rate=0.05,
                                                 n_estimators=70)

    def fit(self, data):
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train basic models using a part of training data
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data
                for _config in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        for j, (train, test) in enumerate(kf.split(X, y)):
                            x_p1, x_p2 = X[train], X[test]
                            y_p1 = y[train]
                            estimator = fetch_predict_estimator(
                                self.task_type, _config, x_p1, y_p1)
                            with open(
                                    os.path.join(
                                        self.output_dir, '%s-model%d_part%d' %
                                        (self.timestamp, model_cnt, j)),
                                    'wb') as f:
                                pkl.dump(estimator, f)
                            if self.task_type in CLS_TASKS:
                                pred = estimator.predict_proba(x_p2)
                                n_dim = np.array(pred).shape[1]
                                if n_dim == 2:
                                    # Binary classification: keep only the positive-class column
                                    n_dim = 1
                                # Initialize training matrix for phase 2
                                if feature_p2 is None:
                                    num_samples = len(train) + len(test)
                                    feature_p2 = np.zeros(
                                        (num_samples,
                                         self.ensemble_size * n_dim))
                                if n_dim == 1:
                                    feature_p2[test,
                                               suc_cnt * n_dim:(suc_cnt + 1) *
                                               n_dim] = pred[:, 1:2]
                                else:
                                    feature_p2[test,
                                               suc_cnt * n_dim:(suc_cnt + 1) *
                                               n_dim] = pred
                            else:
                                pred = estimator.predict(x_p2).reshape(-1, 1)
                                n_dim = 1
                                # Initialize training matrix for phase 2
                                if feature_p2 is None:
                                    num_samples = len(train) + len(test)
                                    feature_p2 = np.zeros(
                                        (num_samples,
                                         self.ensemble_size * n_dim))
                                feature_p2[test, suc_cnt *
                                           n_dim:(suc_cnt + 1) * n_dim] = pred
                        suc_cnt += 1
                    model_cnt += 1
        # Train the meta-learner on the out-of-fold meta-features
        self.meta_learner.fit(feature_p2, y)
        return self

    def get_feature(self, data, solvers):
        # Predict the labels via stacking
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for train_node in train_list:
                test_node = solvers[algo_id].optimizer['fe'].apply(
                    data, train_node)
                for _ in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        for j in range(self.kfold):
                            with open(
                                    os.path.join(
                                        self.output_dir, '%s-model%d_part%d' %
                                        (self.timestamp, model_cnt, j)),
                                    'rb') as f:
                                estimator = pkl.load(f)
                            if self.task_type in CLS_TASKS:
                                pred = estimator.predict_proba(
                                    test_node.data[0])
                                n_dim = np.array(pred).shape[1]
                                if n_dim == 2:
                                    n_dim = 1
                                if feature_p2 is None:
                                    num_samples = len(test_node.data[0])
                                    feature_p2 = np.zeros(
                                        (num_samples,
                                         self.ensemble_size * n_dim))
                                # Accumulate the fold-averaged predictions
                                if n_dim == 1:
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += \
                                        pred[:, 1:2] / self.kfold
                                else:
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += \
                                        pred / self.kfold
                            else:
                                pred = estimator.predict(
                                    test_node.data[0]).reshape(-1, 1)
                                n_dim = 1
                                # Initialize training matrix for phase 2
                                if feature_p2 is None:
                                    num_samples = len(test_node.data[0])
                                    feature_p2 = np.zeros(
                                        (num_samples,
                                         self.ensemble_size * n_dim))
                                # Accumulate the fold-averaged predictions
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += \
                                    pred / self.kfold
                        suc_cnt += 1
                    model_cnt += 1
        return feature_p2

    def predict(self, data, solvers):
        feature_p2 = self.get_feature(data, solvers)
        # Get predictions from meta-learner
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred
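The core mechanism above — build out-of-fold (OOF) meta-features with k-fold splits, then fit a meta-learner on them — in a minimal, self-contained sketch (the base models, data, and logistic meta-learner are illustrative stand-ins, not the classes this snippet depends on):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)
base_models = [RandomForestClassifier(random_state=0),
               DecisionTreeClassifier(random_state=0)]
kf = StratifiedKFold(n_splits=5)

# One OOF column per base model (binary task, so n_dim == 1 as above).
oof = np.zeros((len(X), len(base_models)))
for col, model in enumerate(base_models):
    for train_idx, test_idx in kf.split(X, y):
        model.fit(X[train_idx], y[train_idx])
        oof[test_idx, col] = model.predict_proba(X[test_idx])[:, 1]

meta = LogisticRegression().fit(oof, y)  # meta-learner on OOF features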
Example #4
class Stacking(BaseEnsembleModel):
    def __init__(self,
                 model_info,
                 ensemble_size,
                 task_type,
                 metric,
                 evaluator,
                 model_type='ml',
                 meta_learner='xgboost',
                 kfold=3,
                 save_dir=None,
                 random_state=None):
        super().__init__(model_info=model_info,
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         evaluator=evaluator,
                         model_type=model_type,
                         save_dir=save_dir,
                         random_state=random_state)

        self.kfold = kfold
        # We use XGBoost as the default meta-learner
        if self.task_type == CLASSIFICATION:
            if meta_learner == 'logistic':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=150)
        elif self.task_type == REGRESSION:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(max_depth=4,
                                                 learning_rate=0.05,
                                                 n_estimators=70)

    def fit(self, dm: DataManager):
        # Split training data for phase 1 and phase 2
        if self.task_type == CLASSIFICATION:
            kf = StratifiedKFold(n_splits=self.kfold)
        elif self.task_type == REGRESSION:
            kf = KFold(n_splits=self.kfold)
        feature_p2 = None
        if self.model_type == 'ml':
            # Train basic models using a part of training data
            for i, config in enumerate(self.config_list):
                for j, (train, test) in enumerate(kf.split(dm.train_X, dm.train_y)):
                    x_p1, x_p2 = dm.train_X[train], dm.train_X[test]
                    y_p1 = dm.train_y[train]
                    estimator = self.get_estimator(config, x_p1, y_p1)
                    # The final list will contain self.kfold * self.ensemble_size models
                    self.ensemble_models.append(estimator)
                    pred = self.get_proba_predictions(estimator, x_p2)
                    if self.task_type == CLASSIFICATION:
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: keep only the positive-class column
                            n_dim = 1
                        # Initialize training matrix for phase 2
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros(
                                (num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[test, i * n_dim:(i + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[test, i * n_dim:(i + 1) * n_dim] = pred
                    elif self.task_type == REGRESSION:
                        shape = np.array(pred).shape
                        n_dim = shape[1]
                        # Initialize training matrix for phase 2
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros(
                                (num_samples, self.ensemble_size * n_dim))
                        feature_p2[test, i * n_dim:(i + 1) * n_dim] = pred
            # Train the meta-learner on the out-of-fold meta-features
            self.meta_learner.fit(feature_p2, dm.train_y)
        elif self.model_type == 'dl':
            pass
        return self

    def get_feature(self, X):
        # Predict the labels via stacking
        feature_p2 = None
        for i, model in enumerate(self.ensemble_models):
            pred = self.get_proba_predictions(model, X)
            if self.task_type == CLASSIFICATION:
                n_dim = np.array(pred).shape[1]
                if n_dim == 2:
                    n_dim = 1
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                # Models are stored kfold-per-config in fit(), so integer-divide
                # to recover the configuration (column) index
                index = i // self.kfold
                # Accumulate the fold-averaged predictions
                if n_dim == 1:
                    feature_p2[:, index * n_dim:(index + 1) * n_dim] += \
                        pred[:, 1:2] / self.kfold
                else:
                    feature_p2[:, index * n_dim:(index + 1) * n_dim] += \
                        pred / self.kfold
            elif self.task_type == REGRESSION:
                shape = np.array(pred).shape
                n_dim = shape[1]
                # Initialize training matrix for phase 2
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                index = i // self.kfold  # recover the configuration index (see above)
                # Accumulate the fold-averaged predictions
                feature_p2[:, index * n_dim:(index + 1) * n_dim] += \
                    pred / self.kfold
        return feature_p2

    def predict(self, X):
        feature_p2 = self.get_feature(X)
        # Get predictions from meta-learner
        final_pred = self.meta_learner.predict(feature_p2)
        return final_pred

    def predict_proba(self, X):
        feature_p2 = self.get_feature(X)
        # Get predictions from meta-learner
        final_pred = self.meta_learner.predict_proba(feature_p2)
        return final_pred
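At prediction time, get_feature averages the kfold models trained for each configuration. The accumulation `+= pred / self.kfold` is equivalent to this small helper (an illustrative name, assuming the fold models of one configuration are grouped together):

def fold_average(fold_models, X, kfold):
    """Average positive-class probabilities over the kfold models of one config."""
    return sum(m.predict_proba(X)[:, 1] for m in fold_models) / kfold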
Example #5
class Blending(BaseEnsembleModel):
    def __init__(self,
                 model_info,
                 ensemble_size,
                 task_type,
                 metric,
                 evaluator,
                 model_type='ml',
                 meta_learner='xgboost'):
        super().__init__(model_info, ensemble_size, task_type, metric,
                         evaluator, model_type)

        # We use XGBoost as the default meta-learner
        if self.task_type == CLASSIFICATION:
            if meta_learner == 'logistic':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05,
                    subsample=0.7,
                    max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(max_depth=4,
                                                  learning_rate=0.05,
                                                  n_estimators=150)
        elif self.task_type == REGRESSION:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(max_depth=4,
                                                 learning_rate=0.05,
                                                 n_estimators=70)

    def fit(self, dm: DataManager):
        # Split training data for phase 1 and phase 2
        if self.task_type == CLASSIFICATION:
            x_p1, x_p2, y_p1, y_p2 = train_test_split(dm.train_X,
                                                      dm.train_y,
                                                      test_size=0.2,
                                                      stratify=dm.train_y)
        elif self.task_type == REGRESSION:
            x_p1, x_p2, y_p1, y_p2 = train_test_split(dm.train_X,
                                                      dm.train_y,
                                                      test_size=0.2)
        feature_p2 = None
        if self.model_type == 'ml':
            # Train basic models using a part of training data
            for i, config in enumerate(self.config_list):
                estimator = self.get_estimator(config, x_p1, y_p1)
                self.ensemble_models.append(estimator)
                pred = self.get_proba_predictions(estimator, x_p2)
                if self.task_type == CLASSIFICATION:
                    n_dim = np.array(pred).shape[1]
                    if n_dim == 2:
                        # Binary classification: keep only the positive-class column
                        n_dim = 1
                    # Initialize training matrix for phase 2
                    if feature_p2 is None:
                        num_samples = len(x_p2)
                        feature_p2 = np.zeros(
                            (num_samples, self.ensemble_size * n_dim))
                    if n_dim == 1:
                        feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred[:, 1:2]
                    else:
                        feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred

                elif self.task_type == REGRESSION:
                    shape = np.array(pred).shape
                    n_dim = shape[1]
                    # Initialize training matrix for phase 2
                    if feature_p2 is None:
                        num_samples = len(x_p2)
                        feature_p2 = np.zeros(
                            (num_samples, self.ensemble_size * n_dim))
                    feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred
            # Train model for blending using the other part of training data
            self.meta_learner.fit(feature_p2, y_p2)

        elif self.model_type == 'dl':
            pass
        return self

    def get_feature(self, X):
        # Predict the labels via blending
        feature_p2 = None
        for i, model in enumerate(self.ensemble_models):
            pred = self.get_proba_predictions(model, X)
            if self.task_type == CLASSIFICATION:
                n_dim = np.array(pred).shape[1]
                if n_dim == 2:
                    # Binary classification: keep only the positive-class column
                    n_dim = 1
                # Initialize training matrix for phase 2
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                if n_dim == 1:
                    feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred[:, 1:2]
                else:
                    feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred

            elif self.task_type == REGRESSION:
                n_dim = np.array(pred).shape[1]
                # Initialize training matrix for phase 2
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred
        return feature_p2

    def predict(self, X):
        feature_p2 = self.get_feature(X)
        # Get predictions from meta-learner
        final_pred = self.meta_learner.predict(feature_p2)
        return final_pred

    def predict_proba(self, X):
        feature_p2 = self.get_feature(X)
        # Get predictions from meta-learner
        final_pred = self.meta_learner.predict_proba(feature_p2)
        return final_pred
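Blending differs from stacking only in how the meta-features are produced: a single holdout split instead of k folds. A minimal sketch under the same illustrative assumptions as the stacking sketch above:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)
x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=0)

base_models = [RandomForestClassifier(random_state=0),
               DecisionTreeClassifier(random_state=0)]
# One holdout-probability column per base model.
holdout = np.column_stack([m.fit(x_p1, y_p1).predict_proba(x_p2)[:, 1]
                           for m in base_models])
meta = LogisticRegression().fit(holdout, y_p2)  # meta-learner on the holdout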
## -------------------------------------
## SHAP values
## -------------------------------------
print("Getting SHAP values")
## fit the full regression function on complete cases (rows with no missing features)
cc_all = (np.sum(np.isnan(data.x_train), axis=1) == 0)
cc_all_test = (np.sum(np.isnan(data.x_test), axis=1) == 0)
start = time.time()
ensemble.fit(data.x_train[cc_all, :], np.ravel(data.y_train[cc_all]))
## print test-set error
if args.measure == "auc":
    if 'nn' in args.estimator_type:
        test_preds = np.mean(ensemble.transform(data.x_test[cc_all_test, :]),
                             axis=1)
    else:
        test_preds = ensemble.predict_proba(data.x_test[cc_all_test, :])
else:
    test_preds = ensemble.predict(data.x_test[cc_all_test, :])

log_lik = (-1) * sl_scorer(y_true=np.ravel(data.y_test[cc_all_test]),
                           y_pred=test_preds,
                           normalize=False)
print('Estimated negative log likelihood: ' + str(log_lik))
if "tree" in args.estimator_type:
    explainer = shap.TreeExplainer(ensemble)
    shap_values = explainer.shap_values(data.x_test[cc_all_test, :])
else:
    if args.measure == "auc":
        explainer = shap.KernelExplainer(
            ensemble.transform, shap.kmeans(data.x_train[cc_all, :], 100))
        tmp = explainer.shap_values(data.x_test[cc_all_test, :], nsamples=500)
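For reference, a minimal, self-contained TreeExplainer example on synthetic data (the model and data are placeholders; only the shap calls mirror the snippet above):

import numpy as np
import shap
from xgboost import XGBRegressor

X = np.random.rand(200, 5)
y = 3 * X[:, 0] + np.random.normal(scale=0.1, size=200)

model = XGBRegressor(n_estimators=50, max_depth=3).fit(X, y)
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)  # one attribution row per sample
print(shap_values.shape)                # (200, 5)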