import numpy as np
from xgboost import XGBClassifier, XGBRegressor


def _xgboost(self):
    params = self.params
    if self.problem_type == 'regression':
        model = XGBRegressor(**params)
        model.fit(self.training_data.drop(['TARGET'], axis=1),
                  self.training_data['TARGET'])
        preds = model.predict(self.validation_data.drop(['TARGET'], axis=1))
        return model, preds, self.validation_data['TARGET']
    elif self.problem_type == 'classification':
        model = XGBClassifier(**params)
        model.fit(self.training_data.drop(['TARGET'], axis=1),
                  self.training_data['TARGET'])
        # Probability of the positive class
        preds = model.predict_proba(
            self.validation_data.drop(['TARGET'], axis=1))[:, 1]
        return model, preds, self.validation_data['TARGET']
    elif self.problem_type == 'multiclass':
        model = XGBClassifier(**params)
        model.fit(self.training_data.drop(['TARGET'], axis=1),
                  self.training_data['TARGET'])
        preds = model.predict_proba(
            self.validation_data.drop(['TARGET'], axis=1))
        # Convert class probabilities to predicted labels
        preds = np.argmax(preds, axis=1)
        return model, preds, self.validation_data['TARGET']
    else:
        raise ValueError("Problem type not supported")
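# Usage sketch (hypothetical, not part of the original file): `_xgboost` is a
# method of a class exposing `params`, `problem_type`, `training_data` and
# `validation_data`. The minimal host class below is an assumption made for
# illustration only.
import numpy as np
import pandas as pd


class _Host:
    _xgboost = _xgboost  # bind the module-level function above as a method

    def __init__(self, train_df, valid_df):
        self.params = {'n_estimators': 50, 'max_depth': 3}
        self.problem_type = 'classification'
        self.training_data = train_df
        self.validation_data = valid_df


rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 4)), columns=list('abcd'))
df['TARGET'] = (df['a'] > 0).astype(int)
model, preds, y_true = _Host(df.iloc[:150], df.iloc[150:])._xgboost()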
import numpy as np
from xgboost import XGBClassifier, XGBRegressor


class XGboost:
    """Thin wrapper around the xgboost scikit-learn API, configured from a dict."""

    def __init__(self, model_configs, task_type='Regression'):
        self.model_configs = model_configs
        self.max_depth = model_configs['max_depth']
        self.learning_rate = model_configs['learning_rate']
        self.n_estimators = model_configs['n_estimators']
        self.objective = model_configs['objective']
        self.booster = model_configs['booster']
        self.subsample = model_configs['subsample']
        self.colsample_bylevel = model_configs['colsample_bylevel']
        self.colsample_bytree = model_configs['colsample_bytree']
        self.min_child_weight = model_configs['min_child_weight']
        self.reg_alpha = model_configs['reg_alpha']
        self.reg_lambda = model_configs['reg_lambda']
        self.scale_pos_weight = model_configs['scale_pos_weight']
        self.max_delta_step = model_configs['max_delta_step']
        self.random_seed = model_configs['random_seed']
        self.eval_metric = model_configs['early_stopping']['eval_metric']
        self.early_stopping_round = model_configs['early_stopping']['round']
        self.task_type = task_type
        np.random.seed(seed=self.random_seed)
        self.setup_model()

    def setup_model(self):
        if self.task_type == 'Classification':
            self.model = XGBClassifier(
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                n_estimators=self.n_estimators,
                objective=self.objective,
                booster=self.booster,
                subsample=self.subsample,
                colsample_bylevel=self.colsample_bylevel,
                colsample_bytree=self.colsample_bytree,
                min_child_weight=self.min_child_weight,
                reg_alpha=self.reg_alpha,
                reg_lambda=self.reg_lambda,
                scale_pos_weight=self.scale_pos_weight,
                max_delta_step=self.max_delta_step,
                random_state=self.random_seed,
                silent=False,  # accepted by older xgboost; newer releases use `verbosity`
                n_jobs=8)
        elif self.task_type == 'Regression':
            self.model = XGBRegressor(
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                n_estimators=self.n_estimators,
                objective=self.objective,
                booster=self.booster,
                subsample=self.subsample,
                colsample_bylevel=self.colsample_bylevel,
                colsample_bytree=self.colsample_bytree,
                min_child_weight=self.min_child_weight,
                reg_alpha=self.reg_alpha,
                reg_lambda=self.reg_lambda,
                scale_pos_weight=self.scale_pos_weight,
                random_state=self.random_seed,
                silent=False,  # accepted by older xgboost; newer releases use `verbosity`
                n_jobs=8)
        else:
            # Raising a bare string is invalid in Python 3; raise a proper exception
            raise ValueError('Task type error!')

    def fit_model(self, train_loader):
        self.model.fit(train_loader[0], train_loader[1])

    def predict(self, test_loader):
        if self.task_type == 'Classification':
            return self.model.predict_proba(test_loader[0])
        else:
            return self.model.predict(test_loader[0])
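# Usage sketch (hypothetical values): the config keys below are exactly the
# ones the wrapper reads, but none of the numbers are tuned. Note the
# `early_stopping` entries are stored by the wrapper and not passed on to
# xgboost in the snippet above.
import numpy as np

configs = {
    'max_depth': 4, 'learning_rate': 0.05, 'n_estimators': 100,
    'objective': 'binary:logistic', 'booster': 'gbtree',
    'subsample': 0.8, 'colsample_bylevel': 1.0, 'colsample_bytree': 0.8,
    'min_child_weight': 1, 'reg_alpha': 0.0, 'reg_lambda': 1.0,
    'scale_pos_weight': 1.0, 'max_delta_step': 0, 'random_seed': 42,
    'early_stopping': {'eval_metric': 'auc', 'round': 50},
}

X = np.random.rand(100, 5)
y = (X[:, 0] > 0.5).astype(int)
clf = XGboost(configs, task_type='Classification')
clf.fit_model((X[:80], y[:80]))  # train_loader is a (features, labels) pair
proba = clf.predict((X[80:],))   # test_loader[0] holds the features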
import os
import pickle as pkl
import warnings

import numpy as np
from sklearn.metrics._scorer import _BaseScorer
from sklearn.model_selection import KFold, StratifiedKFold

# BaseEnsembleModel, CLS_TASKS and fetch_predict_estimator are assumed to come
# from the surrounding package.


class Stacking(BaseEnsembleModel):
    def __init__(self, stats, ensemble_size: int, task_type: int,
                 metric: _BaseScorer, output_dir=None,
                 meta_learner='xgboost', kfold=5):
        super().__init__(stats=stats,
                         ensemble_method='stacking',
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         output_dir=output_dir)
        self.kfold = kfold

        try:
            from xgboost import XGBClassifier
        except ImportError:
            warnings.warn("xgboost could not be imported! "
                          "Stacking will use a linear meta-learner instead!")
            meta_learner = 'linear'

        # We use xgboost as the default meta-learner
        if self.task_type in CLS_TASKS:
            if meta_learner == 'linear':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05, subsample=0.7, max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(
                    max_depth=4, learning_rate=0.05, n_estimators=150)
        else:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(
                    max_depth=4, learning_rate=0.05, n_estimators=70)

    def fit(self, data):
        # Split training data for phase 1 and phase 2
        if self.task_type in CLS_TASKS:
            kf = StratifiedKFold(n_splits=self.kfold)
        else:
            kf = KFold(n_splits=self.kfold)

        # Train base models on the phase-1 folds
        model_cnt = 0
        suc_cnt = 0
        feature_p2 = None
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for idx in range(len(train_list)):
                X, y = train_list[idx].data
                for _config in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        for j, (train, test) in enumerate(kf.split(X, y)):
                            x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[test]
                            estimator = fetch_predict_estimator(
                                self.task_type, _config, x_p1, y_p1)
                            model_path = os.path.join(
                                self.output_dir,
                                '%s-model%d_part%d' % (self.timestamp, model_cnt, j))
                            with open(model_path, 'wb') as f:
                                pkl.dump(estimator, f)
                            if self.task_type in CLS_TASKS:
                                pred = estimator.predict_proba(x_p2)
                                n_dim = np.array(pred).shape[1]
                                if n_dim == 2:
                                    # Binary classification
                                    n_dim = 1
                                # Initialize the phase-2 training matrix
                                if feature_p2 is None:
                                    num_samples = len(train) + len(test)
                                    feature_p2 = np.zeros(
                                        (num_samples, self.ensemble_size * n_dim))
                                if n_dim == 1:
                                    feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                                else:
                                    feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                            else:
                                pred = estimator.predict(x_p2).reshape(-1, 1)
                                n_dim = 1
                                # Initialize the phase-2 training matrix
                                if feature_p2 is None:
                                    num_samples = len(train) + len(test)
                                    feature_p2 = np.zeros(
                                        (num_samples, self.ensemble_size * n_dim))
                                feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                        suc_cnt += 1
                    model_cnt += 1
        # Train the meta-learner on the out-of-fold predictions
        self.meta_learner.fit(feature_p2, y)
        return self

    def get_feature(self, data, solvers):
        # Predict the labels via stacking
        feature_p2 = None
        model_cnt = 0
        suc_cnt = 0
        for algo_id in self.stats["include_algorithms"]:
            train_list = self.stats[algo_id]['train_data_list']
            configs = self.stats[algo_id]['configurations']
            for train_node in train_list:
                test_node = solvers[algo_id].optimizer['fe'].apply(data, train_node)
                for _ in configs:
                    if self.base_model_mask[model_cnt] == 1:
                        for j in range(self.kfold):
                            model_path = os.path.join(
                                self.output_dir,
                                '%s-model%d_part%d' % (self.timestamp, model_cnt, j))
                            with open(model_path, 'rb') as f:
                                estimator = pkl.load(f)
                            if self.task_type in CLS_TASKS:
                                pred = estimator.predict_proba(test_node.data[0])
                                n_dim = np.array(pred).shape[1]
                                if n_dim == 2:
                                    n_dim = 1
                                if feature_p2 is None:
                                    num_samples = len(test_node.data[0])
                                    feature_p2 = np.zeros(
                                        (num_samples, self.ensemble_size * n_dim))
                                # Average the predictions of the k fold models
                                if n_dim == 1:
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += pred[:, 1:2] / self.kfold
                                else:
                                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += pred / self.kfold
                            else:
                                pred = estimator.predict(test_node.data[0]).reshape(-1, 1)
                                n_dim = 1
                                # Initialize the feature matrix
                                if feature_p2 is None:
                                    num_samples = len(test_node.data[0])
                                    feature_p2 = np.zeros(
                                        (num_samples, self.ensemble_size * n_dim))
                                # Average the predictions of the k fold models
                                feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += pred / self.kfold
                        suc_cnt += 1
                    model_cnt += 1
        return feature_p2

    def predict(self, data, solvers):
        feature_p2 = self.get_feature(data, solvers)
        # Get predictions from the meta-learner
        if self.task_type in CLS_TASKS:
            final_pred = self.meta_learner.predict_proba(feature_p2)
        else:
            final_pred = self.meta_learner.predict(feature_p2)
        return final_pred
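# Self-contained illustration of the out-of-fold recipe implemented above
# (plain scikit-learn stand-ins; none of these names belong to the class's own
# API): each base model's out-of-fold predictions become the meta-learner's
# training features, so the meta-learner never sees in-sample (optimistic)
# outputs.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)
base_models = [DecisionTreeClassifier(max_depth=d, random_state=0) for d in (2, 4)]
kf = StratifiedKFold(n_splits=5)

# Phase 1: one meta-feature column per base model, filled fold by fold
meta_X = np.zeros((len(y), len(base_models)))
for col, model in enumerate(base_models):
    for train_idx, test_idx in kf.split(X, y):
        model.fit(X[train_idx], y[train_idx])
        meta_X[test_idx, col] = model.predict_proba(X[test_idx])[:, 1]

# Phase 2: train the meta-learner on the out-of-fold predictions only
meta_learner = LogisticRegression().fit(meta_X, y)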
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold

# BaseEnsembleModel, DataManager, CLASSIFICATION and REGRESSION are assumed to
# come from the surrounding package.


class Stacking(BaseEnsembleModel):
    def __init__(self, model_info, ensemble_size, task_type, metric,
                 evaluator, model_type='ml', meta_learner='xgboost',
                 kfold=3, save_dir=None, random_state=None):
        super().__init__(model_info=model_info,
                         ensemble_size=ensemble_size,
                         task_type=task_type,
                         metric=metric,
                         evaluator=evaluator,
                         model_type=model_type,
                         save_dir=save_dir,
                         random_state=random_state)
        self.kfold = kfold

        # We use xgboost as the default meta-learner
        if self.task_type == CLASSIFICATION:
            if meta_learner == 'logistic':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05, subsample=0.7, max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(
                    max_depth=4, learning_rate=0.05, n_estimators=150)
        elif self.task_type == REGRESSION:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(
                    max_depth=4, learning_rate=0.05, n_estimators=70)

    def fit(self, dm: DataManager):
        # Split training data for phase 1 and phase 2
        if self.task_type == CLASSIFICATION:
            kf = StratifiedKFold(n_splits=self.kfold)
        elif self.task_type == REGRESSION:
            kf = KFold(n_splits=self.kfold)

        feature_p2 = None
        if self.model_type == 'ml':
            # Train base models on the phase-1 folds
            for i, config in enumerate(self.config_list):
                for j, (train, test) in enumerate(kf.split(dm.train_X, dm.train_y)):
                    x_p1, x_p2, y_p1, _ = (dm.train_X[train], dm.train_X[test],
                                           dm.train_y[train], dm.train_y[test])
                    estimator = self.get_estimator(config, x_p1, y_p1)
                    # The final list holds self.kfold * self.ensemble_size models
                    self.ensemble_models.append(estimator)
                    pred = self.get_proba_predictions(estimator, x_p2)
                    if self.task_type == CLASSIFICATION:
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification
                            n_dim = 1
                        # Initialize the phase-2 training matrix
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros(
                                (num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[test, i * n_dim:(i + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[test, i * n_dim:(i + 1) * n_dim] = pred
                    elif self.task_type == REGRESSION:
                        n_dim = np.array(pred).shape[1]
                        # Initialize the phase-2 training matrix
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros(
                                (num_samples, self.ensemble_size * n_dim))
                        feature_p2[test, i * n_dim:(i + 1) * n_dim] = pred
            # Train the meta-learner on the out-of-fold predictions
            self.meta_learner.fit(feature_p2, dm.train_y)
        elif self.model_type == 'dl':
            pass
        return self

    def get_feature(self, X):
        # Predict the labels via stacking
        feature_p2 = None
        for i, model in enumerate(self.ensemble_models):
            pred = self.get_proba_predictions(model, X)
            # Models are stored fold-major per config (self.kfold models per
            # base config), so i // self.kfold recovers the config (column)
            # index that fit() wrote to.
            index = i // self.kfold
            if self.task_type == CLASSIFICATION:
                n_dim = np.array(pred).shape[1]
                if n_dim == 2:
                    n_dim = 1
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                # Average the predictions of the k fold models
                if n_dim == 1:
                    feature_p2[:, index * n_dim:(index + 1) * n_dim] += pred[:, 1:2] / self.kfold
                else:
                    feature_p2[:, index * n_dim:(index + 1) * n_dim] += pred / self.kfold
            elif self.task_type == REGRESSION:
                n_dim = np.array(pred).shape[1]
                # Initialize the feature matrix
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                # Average the predictions of the k fold models
                feature_p2[:, index * n_dim:(index + 1) * n_dim] += pred / self.kfold
        return feature_p2

    def predict(self, X):
        feature_p2 = self.get_feature(X)
        # Get predictions from the meta-learner
        return self.meta_learner.predict(feature_p2)

    def predict_proba(self, X):
        feature_p2 = self.get_feature(X)
        # Get probability predictions from the meta-learner
        return self.meta_learner.predict_proba(feature_p2)
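# Self-contained sketch of the test-time side of the class above: all k fitted
# copies of each base model are kept, and their predictions are averaged into
# that model's meta-feature column (illustrative scikit-learn stand-ins, not
# the class's own API).
import numpy as np
from sklearn.base import clone
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)
base_configs = [DecisionTreeClassifier(max_depth=d, random_state=0) for d in (2, 4)]
kfold = 3
kf = StratifiedKFold(n_splits=kfold)

# Fold-major storage: kfold fitted models per base config
fold_models = [clone(cfg).fit(X[tr], y[tr])
               for cfg in base_configs
               for tr, _ in kf.split(X, y)]

X_new = X[:10]  # stand-in for unseen data
meta_X = np.zeros((len(X_new), len(base_configs)))
for i, m in enumerate(fold_models):
    col = i // kfold  # recover the config (column) index from fold-major order
    meta_X[:, col] += m.predict_proba(X_new)[:, 1] / kfold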
import numpy as np
from sklearn.model_selection import train_test_split

# BaseEnsembleModel, DataManager, CLASSIFICATION and REGRESSION are assumed to
# come from the surrounding package.


class Blending(BaseEnsembleModel):
    def __init__(self, model_info, ensemble_size, task_type, metric,
                 evaluator, model_type='ml', meta_learner='xgboost'):
        super().__init__(model_info, ensemble_size, task_type, metric,
                         evaluator, model_type)

        # We use xgboost as the default meta-learner
        if self.task_type == CLASSIFICATION:
            if meta_learner == 'logistic':
                from sklearn.linear_model import LogisticRegression
                self.meta_learner = LogisticRegression(max_iter=1000)
            elif meta_learner == 'gb':
                from sklearn.ensemble import GradientBoostingClassifier
                self.meta_learner = GradientBoostingClassifier(
                    learning_rate=0.05, subsample=0.7, max_depth=4,
                    n_estimators=250)
            elif meta_learner == 'xgboost':
                from xgboost import XGBClassifier
                self.meta_learner = XGBClassifier(
                    max_depth=4, learning_rate=0.05, n_estimators=150)
        elif self.task_type == REGRESSION:
            if meta_learner == 'linear':
                from sklearn.linear_model import LinearRegression
                self.meta_learner = LinearRegression()
            elif meta_learner == 'xgboost':
                from xgboost import XGBRegressor
                self.meta_learner = XGBRegressor(
                    max_depth=4, learning_rate=0.05, n_estimators=70)

    def fit(self, dm: DataManager):
        # Split training data for phase 1 and phase 2
        if self.task_type == CLASSIFICATION:
            x_p1, x_p2, y_p1, y_p2 = train_test_split(
                dm.train_X, dm.train_y, test_size=0.2, stratify=dm.train_y)
        elif self.task_type == REGRESSION:
            x_p1, x_p2, y_p1, y_p2 = train_test_split(
                dm.train_X, dm.train_y, test_size=0.2)

        feature_p2 = None
        if self.model_type == 'ml':
            # Train base models on the phase-1 split
            for i, config in enumerate(self.config_list):
                estimator = self.get_estimator(config, x_p1, y_p1)
                self.ensemble_models.append(estimator)
                pred = self.get_proba_predictions(estimator, x_p2)
                if self.task_type == CLASSIFICATION:
                    n_dim = np.array(pred).shape[1]
                    if n_dim == 2:
                        # Binary classification
                        n_dim = 1
                    # Initialize the phase-2 training matrix
                    if feature_p2 is None:
                        num_samples = len(x_p2)
                        feature_p2 = np.zeros(
                            (num_samples, self.ensemble_size * n_dim))
                    if n_dim == 1:
                        feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred[:, 1:2]
                    else:
                        feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred
                elif self.task_type == REGRESSION:
                    n_dim = np.array(pred).shape[1]
                    # Initialize the phase-2 training matrix
                    if feature_p2 is None:
                        num_samples = len(x_p2)
                        feature_p2 = np.zeros(
                            (num_samples, self.ensemble_size * n_dim))
                    feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred
            # Train the meta-learner on the phase-2 predictions
            self.meta_learner.fit(feature_p2, y_p2)
        elif self.model_type == 'dl':
            pass
        return self

    def get_feature(self, X):
        # Build the meta-features for new data
        feature_p2 = None
        for i, model in enumerate(self.ensemble_models):
            pred = self.get_proba_predictions(model, X)
            if self.task_type == CLASSIFICATION:
                n_dim = np.array(pred).shape[1]
                if n_dim == 2:
                    # Binary classification
                    n_dim = 1
                # Initialize the feature matrix
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                if n_dim == 1:
                    feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred[:, 1:2]
                else:
                    feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred
            elif self.task_type == REGRESSION:
                n_dim = np.array(pred).shape[1]
                # Initialize the feature matrix
                if feature_p2 is None:
                    num_samples = len(X)
                    feature_p2 = np.zeros(
                        (num_samples, self.ensemble_size * n_dim))
                feature_p2[:, i * n_dim:(i + 1) * n_dim] = pred
        return feature_p2

    def predict(self, X):
        feature_p2 = self.get_feature(X)
        # Get predictions from the meta-learner
        return self.meta_learner.predict(feature_p2)

    def predict_proba(self, X):
        feature_p2 = self.get_feature(X)
        # Get probability predictions from the meta-learner
        return self.meta_learner.predict_proba(feature_p2)
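# Self-contained illustration of the blending recipe above: a single holdout
# split replaces stacking's k-fold loop, and the meta-learner is fit on the
# holdout predictions only (plain scikit-learn stand-ins, not the class's API).
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=300, random_state=0)
x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=0.2,
                                          stratify=y, random_state=0)

# Phase 1: fit every base model on the larger split
base_models = [DecisionTreeClassifier(max_depth=d, random_state=0).fit(x_p1, y_p1)
               for d in (2, 4)]
# Phase 2: each model's holdout P(y=1) is one meta-feature column
feature_p2 = np.column_stack([m.predict_proba(x_p2)[:, 1] for m in base_models])
meta_learner = LogisticRegression().fit(feature_p2, y_p2)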
## -------------------------------------
## SHAP values
## -------------------------------------
## (`data`, `ensemble`, `args` and `sl_scorer` are defined earlier in the
## script; `np`, `time` and `shap` are imported at the top of the file.)
print("Getting SHAP values")

## fit the full regression function on complete cases only
cc_all = (np.sum(np.isnan(data.x_train), axis=1) == 0)
cc_all_test = (np.sum(np.isnan(data.x_test), axis=1) == 0)
start = time.time()
ensemble.fit(data.x_train[cc_all, :], np.ravel(data.y_train[cc_all]))

## print test-set error
if args.measure == "auc":
    if 'nn' in args.estimator_type:
        test_preds = np.mean(ensemble.transform(data.x_test[cc_all_test, :]), axis=1)
    else:
        test_preds = ensemble.predict_proba(data.x_test[cc_all_test, :])
else:
    test_preds = ensemble.predict(data.x_test[cc_all_test, :])
log_lik = (-1) * sl_scorer(y_true=np.ravel(data.y_test[cc_all_test]),
                           y_pred=test_preds, normalize=False)
print('Estimated negative log likelihood: ' + str(log_lik))

if "tree" in args.estimator_type:
    explainer = shap.TreeExplainer(ensemble)
    shap_values = explainer.shap_values(data.x_test[cc_all_test, :])
else:
    if args.measure == "auc":
        explainer = shap.KernelExplainer(
            ensemble.transform, shap.kmeans(data.x_train[cc_all, :], 100))
        tmp = explainer.shap_values(data.x_test[cc_all_test, :], nsamples=500)
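## A minimal, self-contained sketch of the TreeExplainer path used above, plus
## the common mean-|SHAP| global importance summary (illustrative model and
## data; not part of the original script):
import numpy as np
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = GradientBoostingClassifier(random_state=0).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)  # (n_samples, n_features) for binary GBM
## Global importance: mean absolute SHAP value per feature, largest first
importance = np.mean(np.abs(shap_values), axis=0)
print(np.argsort(importance)[::-1])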