def predict(self, data):
    model_pred_list = []
    final_pred = []

    # Get predictions from each selected base model.
    model_cnt = 0
    for algo_id in self.stats:
        model_to_eval = self.stats[algo_id]
        for idx, (_, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, model = pkl.load(f)
            _node = data.copy_()
            _node = construct_node(_node, op_list)
            if self.base_model_mask[model_cnt] == 1:
                if self.task_type in CLS_TASKS:
                    model_pred_list.append(model.predict_proba(_node.data[0]))
                else:
                    model_pred_list.append(model.predict(_node.data[0]))
            model_cnt += 1

    # Average the base-model predictions for each sample.
    for i in range(len(data.data[0])):
        sample_pred_list = [model_pred[i] for model_pred in model_pred_list]
        pred_average = reduce(lambda x, y: x + y, sample_pred_list) / len(sample_pred_list)
        final_pred.append(pred_average)

    return np.array(final_pred)
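# --- Illustrative note (not part of the original module): the per-sample loop
# above computes a plain element-wise mean over the selected base-model
# outputs. Assuming every masked-in model emits an array of the same shape, a
# vectorized equivalent would be:
#
#     final_pred = np.mean(np.asarray(model_pred_list), axis=0)
#
# which avoids the Python-level loop and the functools.reduce call.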
def fit(self, data):
    # Split the training data: phase 1 trains the base models,
    # phase 2 (the held-out part) trains the meta-learner.
    test_size = 0.2

    # Train base models on the phase-1 portion of the training data.
    model_cnt = 0
    suc_cnt = 0
    feature_p2 = None
    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (config, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, model = pkl.load(f)
            _node = data.copy_()
            _node = construct_node(_node, op_list, mode='train')
            X, y = _node.data
            if self.task_type in CLS_TASKS:
                x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                          stratify=data.data[1],
                                                          random_state=1)
            else:
                x_p1, x_p2, y_p1, y_p2 = train_test_split(X, y, test_size=test_size,
                                                          random_state=1)
            if self.base_model_mask[model_cnt] == 1:
                estimator = fetch_predict_estimator(self.task_type, algo_id, config[0],
                                                    x_p1, y_p1,
                                                    weight_balance=_node.enable_balance,
                                                    data_balance=_node.data_balance)
                with open(os.path.join(self.output_dir,
                                       '%s-blending-model%d' % (self.timestamp, model_cnt)),
                          'wb') as f:
                    pkl.dump(estimator, f)
                if self.task_type in CLS_TASKS:
                    pred = estimator.predict_proba(x_p2)
                    n_dim = np.array(pred).shape[1]
                    if n_dim == 2:
                        # Binary classification: one probability column is enough.
                        n_dim = 1
                    # Initialize the phase-2 training matrix on first use.
                    if feature_p2 is None:
                        num_samples = len(x_p2)
                        feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                    if n_dim == 1:
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                    else:
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                else:
                    pred = estimator.predict(x_p2).reshape(-1, 1)
                    n_dim = 1
                    # Initialize the phase-2 training matrix on first use.
                    if feature_p2 is None:
                        num_samples = len(x_p2)
                        feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                    feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                suc_cnt += 1
            model_cnt += 1

    # Train the meta-learner on the phase-2 features.
    self.meta_learner.fit(feature_p2, y_p2)
    return self
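# --- Usage sketch (hypothetical; the constructor arguments follow the
# __init__ shown later in this section, and the exact class name is assumed):
# fit() trains each selected base model on the 80% phase-1 split and fits
# self.meta_learner on the held-out phase-2 predictions, after which
# predict() can score fresh data nodes.
#
#     ens = Blending(stats, 'blending', ensemble_size, task_type,
#                    scorer, train_node, output_dir='./outputs')
#     ens.fit(train_node)
#     y_pred = ens.predict(test_node)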
def get_feature(self, data):
    # Build the phase-2 (stacked) feature matrix from the k-fold base models.
    feature_p2 = None
    model_cnt = 0
    suc_cnt = 0
    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (config, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, model = pkl.load(f)
            _node = data.copy_()
            _node = construct_node(_node, op_list)
            if self.base_model_mask[model_cnt] == 1:
                for j in range(self.kfold):
                    with open(os.path.join(self.output_dir,
                                           '%s-model%d_part%d' % (self.timestamp, model_cnt, j)),
                              'rb') as f:
                        estimator = pkl.load(f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(_node.data[0])
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: one probability column is enough.
                            n_dim = 1
                        # Initialize the phase-2 feature matrix on first use.
                        if feature_p2 is None:
                            num_samples = len(_node.data[0])
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        # Accumulate the average prediction over the k fold models.
                        if n_dim == 1:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += \
                                pred[:, 1:2] / self.kfold
                        else:
                            feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += \
                                pred / self.kfold
                    else:
                        pred = estimator.predict(_node.data[0]).reshape(-1, 1)
                        n_dim = 1
                        # Initialize the phase-2 feature matrix on first use.
                        if feature_p2 is None:
                            num_samples = len(_node.data[0])
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        # Accumulate the average prediction over the k fold models.
                        feature_p2[:, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] += \
                            pred / self.kfold
                suc_cnt += 1
            model_cnt += 1
    return feature_p2
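# --- Illustrative note (assumed usage, not shown in the original source): the
# matrix returned by get_feature() holds one block of n_dim columns per
# selected base model, averaged over the k fold estimators, so a stacking
# predict() would typically look like:
#
#     feature = self.get_feature(test_node)
#     y_pred = self.meta_learner.predict(feature)   # or predict_proba(feature)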
def predict(self, data):
    predictions = []
    cur_idx = 0
    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (_, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, estimator = pkl.load(f)
            _node = data.copy_()
            _node = construct_node(_node, op_list)
            X_test = _node.data[0]
            if cur_idx in self.model_idx:
                if self.task_type in CLS_TASKS:
                    predictions.append(estimator.predict_proba(X_test))
                else:
                    predictions.append(estimator.predict(X_test))
            else:
                # Placeholder predictions for models pruned out of the ensemble.
                if len(self.shape) == 1:
                    predictions.append(np.zeros(len(X_test)))
                else:
                    predictions.append(np.zeros((len(X_test), self.shape[1])))
            cur_idx += 1
    predictions = np.asarray(predictions)

    # If predictions.shape[0] == len(self.weights_), the predictions
    # include those of the zero-weight models.
    if predictions.shape[0] == len(self.weights_):
        return np.average(predictions, axis=0, weights=self.weights_)
    # If predictions.shape[0] == len(non_null_weights), the predictions
    # do not include those of the zero-weight models.
    elif predictions.shape[0] == np.count_nonzero(self.weights_):
        non_null_weights = [w for w in self.weights_ if w > 0]
        return np.average(predictions, axis=0, weights=non_null_weights)
    # If neither case applies, something has gone wrong.
    else:
        raise ValueError("The dimensions of ensemble predictions"
                         " and ensemble weights do not match!")
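# --- Minimal, self-contained sketch of the weighted averaging above
# (illustrative numbers only, not from the original source):
#
#     import numpy as np
#     preds = np.asarray([
#         [[0.7, 0.3], [0.2, 0.8]],   # base model 1: per-sample class probabilities
#         [[0.5, 0.5], [0.4, 0.6]],   # base model 2
#     ])
#     weights = [0.75, 0.25]
#     np.average(preds, axis=0, weights=weights)
#     # -> [[0.65, 0.35], [0.25, 0.75]]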
def _predict(self, test_data: DataNode):
    if self.ensemble_method is not None:
        if self.es is None:
            raise AttributeError("AutoML is not fitted!")
        return self.es.predict(test_data)
    else:
        if self.inner_opt_algorithm == 'combined':
            best_op_list, estimator = load_combined_transformer_estimator(
                self.output_dir, self.best_config, self.timestamp)
        else:
            best_op_list, estimator = load_transformer_estimator(
                self.output_dir, self.optimal_algo_id, self.best_hpo_config,
                self.best_fe_config, self.timestamp)
        test_data_node = test_data.copy_()
        test_data_node = construct_node(test_data_node, best_op_list)
        if self.task_type in CLS_TASKS:
            return estimator.predict_proba(test_data_node.data[0])
        else:
            return estimator.predict(test_data_node.data[0])
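# --- Note (an assumption based on the branches above): for classification
# tasks _predict() returns class probabilities, so a hard-label wrapper would
# take an argmax over the last axis, e.g.:
#
#     pred_proba = self._predict(test_data)
#     y_hat = np.argmax(pred_proba, axis=-1)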
def __call__(self, config, **kwargs):
    start_time = time.time()
    return_dict = dict()
    self.seed = 1
    downsample_ratio = kwargs.get('resource_ratio', 1.0)

    if 'holdout' in self.resampling_strategy:
        try:
            # Prepare the data node.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import ShuffleSplit
                ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                for train_index, test_index in ss.split(self.data_node.data[0], self.data_node.data[1]):
                    _x_train, _x_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                    _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                self.train_node.data = [_x_train, _y_train]
                self.val_node.data = [_x_val, _y_val]

                data_node, op_list = parse_config(self.train_node, config, record=True)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

                _x_train, _y_train = data_node.data
                _x_val, _y_val = _val_node.data

                config_dict = config.get_dictionary().copy()
                # Regressor gadgets.
                regressor_id, clf = get_estimator(config_dict, self.estimator_id)

                score = validation(clf, self.scorer, _x_train, _y_train, _x_val, _y_val,
                                   random_state=self.seed)

                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(config, score, regressor_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()
                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except:
                        pass
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    elif 'cv' in self.resampling_strategy:
        # Prepare the data node.
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'folds' not in self.resampling_params:
                    folds = 5
                else:
                    folds = self.resampling_params['folds']

                from sklearn.model_selection import KFold
                # random_state is omitted: it has no effect (and recent
                # scikit-learn raises an error) when shuffle=False.
                kfold = KFold(n_splits=folds, shuffle=False)
                scores = list()
                for train_index, test_index in kfold.split(self.data_node.data[0], self.data_node.data[1]):
                    _x_train, _x_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                    _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                    self.train_node.data = [_x_train, _y_train]
                    self.val_node.data = [_x_val, _y_val]

                    data_node, op_list = parse_config(self.train_node, config, record=True)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                    _x_train, _y_train = data_node.data
                    _x_val, _y_val = _val_node.data

                    config_dict = config.get_dictionary().copy()
                    # Regressor gadgets.
                    regressor_id, clf = get_estimator(config_dict, self.estimator_id)

                    _score = validation(clf, self.scorer, _x_train, _y_train, _x_val, _y_val,
                                        random_state=self.seed)
                    scores.append(_score)
                score = np.mean(scores)

                # TODO: Do not save models under cross-validation.
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    _ = self.topk_model_saver.add(config, score, regressor_id)
                    self.topk_model_saver.save_topk_config()
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    elif 'partial' in self.resampling_strategy:
        try:
            # Prepare the data node.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import ShuffleSplit
                ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                for train_index, test_index in ss.split(self.data_node.data[0], self.data_node.data[1]):
                    _x_train, _x_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                    _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                self.train_node.data = [_x_train, _y_train]
                self.val_node.data = [_x_val, _y_val]

                data_node, op_list = parse_config(self.train_node, config, record=True)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

                _x_train, _y_train = data_node.data

                # Down-sample the training set under a partial resource budget.
                if downsample_ratio != 1:
                    down_ss = ShuffleSplit(n_splits=1, test_size=downsample_ratio,
                                           random_state=self.seed)
                    for _, _val_index in down_ss.split(_x_train, _y_train):
                        _act_x_train, _act_y_train = _x_train[_val_index], _y_train[_val_index]
                else:
                    _act_x_train, _act_y_train = _x_train, _y_train
                    _val_index = list(range(len(_x_train)))

                _x_val, _y_val = _val_node.data

                config_dict = config.get_dictionary().copy()
                # Regressor gadgets.
                regressor_id, clf = get_estimator(config_dict, self.estimator_id)

                score = validation(clf, self.scorer, _act_x_train, _act_y_train, _x_val, _y_val,
                                   random_state=self.seed)

                # TODO: Only save models trained with maximum resources.
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score) and downsample_ratio == 1:
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(config, score, regressor_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()
                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except:
                        pass
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    else:
        raise ValueError('Invalid resampling strategy: %s!' % self.resampling_strategy)

    try:
        self.logger.info('Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s' %
                         (regressor_id, self.scorer._sign * score,
                          time.time() - start_time, _x_train.shape))
    except:
        pass

    # Turn it into a minimization problem.
    return_dict['objective_value'] = -score
    return -score
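# --- Usage sketch (hypothetical caller): the evaluator is a callable handed
# to the optimizer, which minimizes the returned value (-score). Passing a
# shared lock avoids the read-write conflicts warned about above; the exact
# Lock class (threading vs. multiprocessing) depends on the framework.
#
#     from multiprocessing import Lock
#     rw_lock = Lock()
#     objective = evaluator(config, resource_ratio=1.0, rw_lock=rw_lock)
#     # objective == -score, so lower is better for the optimizer.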
def __call__(self, config, **kwargs):
    start_time = time.time()
    return_dict = dict()

    if self.name is None:
        raise ValueError('This evaluator has no name/type!')
    assert self.name in ['hpo', 'fe']
    if self.name == 'hpo':
        hpo_config = config if config is not None else self.hpo_config
        fe_config = kwargs.get('ano_config', self.fe_config)
    else:
        fe_config = config if config is not None else self.fe_config
        hpo_config = kwargs.get('ano_config', self.hpo_config)

    # Prepare the configuration.
    self.seed = 1
    downsample_ratio = kwargs.get('resource_ratio', 1.0)

    if 'holdout' in self.resampling_strategy:
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import StratifiedShuffleSplit
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                for train_index, test_index in ss.split(self.data_node.data[0], self.data_node.data[1]):
                    _X_train, _X_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                    _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                self.train_node.data = [_X_train, _y_train]
                self.val_node.data = [_X_val, _y_val]

                data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                  if_imbal=self.if_imbal)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

                _X_train, _y_train = data_node.data
                _X_val, _y_val = _val_node.data

                config_dict = hpo_config.get_dictionary().copy()

                # Prepare training and initial params for the classifier.
                init_params, fit_params = {}, {}
                if data_node.enable_balance == 1:
                    init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                    for key, val in init_params.items():
                        config_dict[key] = val
                if data_node.data_balance == 1:
                    fit_params['data_balance'] = True

                classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                if self.onehot_encoder is None:
                    self.onehot_encoder = OneHotEncoder(categories='auto')
                    y = np.reshape(_y_train, (len(_y_train), 1))
                    self.onehot_encoder.fit(y)

                score = validation(clf, self.scorer, _X_train, _y_train, _X_val, _y_val,
                                   random_state=self.seed,
                                   onehot=self.onehot_encoder if isinstance(
                                       self.scorer, _ThresholdScorer) else None,
                                   fit_params=fit_params)

                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()
                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except:
                        pass
                lock.release()
        except Exception as e:
            import traceback
            self.logger.info('%s-evaluator: %s' % (self.name, str(e)))
            score = -np.inf
            traceback.print_exc(file=sys.stdout)

    elif 'cv' in self.resampling_strategy:
        # Prepare the data node.
        try:
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'folds' not in self.resampling_params:
                    folds = 5
                else:
                    folds = self.resampling_params['folds']

                from sklearn.model_selection import StratifiedKFold
                # random_state is omitted: it has no effect (and recent
                # scikit-learn raises an error) when shuffle=False.
                skfold = StratifiedKFold(n_splits=folds, shuffle=False)
                scores = list()
                for train_index, test_index in skfold.split(self.data_node.data[0], self.data_node.data[1]):
                    _X_train, _X_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                    _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                    self.train_node.data = [_X_train, _y_train]
                    self.val_node.data = [_X_val, _y_val]

                    data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                      if_imbal=self.if_imbal)
                    _val_node = self.val_node.copy_()
                    _val_node = construct_node(_val_node, op_list)

                    _X_train, _y_train = data_node.data
                    _X_val, _y_val = _val_node.data

                    config_dict = hpo_config.get_dictionary().copy()

                    # Prepare training and initial params for the classifier.
                    init_params, fit_params = {}, {}
                    if data_node.enable_balance == 1:
                        init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                        for key, val in init_params.items():
                            config_dict[key] = val
                    if data_node.data_balance == 1:
                        fit_params['data_balance'] = True

                    classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                    if self.onehot_encoder is None:
                        self.onehot_encoder = OneHotEncoder(categories='auto')
                        y = np.reshape(_y_train, (len(_y_train), 1))
                        self.onehot_encoder.fit(y)

                    _score = validation(clf, self.scorer, _X_train, _y_train, _X_val, _y_val,
                                        random_state=self.seed,
                                        onehot=self.onehot_encoder if isinstance(
                                            self.scorer, _ThresholdScorer) else None,
                                        fit_params=fit_params)
                    scores.append(_score)
                score = np.mean(scores)

                # TODO: Do not save models under cross-validation.
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score):
                    _ = self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    self.topk_model_saver.save_topk_config()
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    elif 'partial' in self.resampling_strategy:
        try:
            # Prepare the data node.
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                if self.resampling_params is None or 'test_size' not in self.resampling_params:
                    test_size = 0.33
                else:
                    test_size = self.resampling_params['test_size']

                from sklearn.model_selection import StratifiedShuffleSplit
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=self.seed)
                for train_index, test_index in ss.split(self.data_node.data[0], self.data_node.data[1]):
                    _X_train, _X_val = self.data_node.data[0][train_index], self.data_node.data[0][test_index]
                    _y_train, _y_val = self.data_node.data[1][train_index], self.data_node.data[1][test_index]
                self.train_node.data = [_X_train, _y_train]
                self.val_node.data = [_X_val, _y_val]

                data_node, op_list = parse_config(self.train_node, fe_config, record=True,
                                                  if_imbal=self.if_imbal)
                _val_node = self.val_node.copy_()
                _val_node = construct_node(_val_node, op_list)

                _X_train, _y_train = data_node.data

                # Down-sample the training set under a partial resource budget.
                if downsample_ratio != 1:
                    down_ss = StratifiedShuffleSplit(n_splits=1, test_size=downsample_ratio,
                                                     random_state=self.seed)
                    for _, _val_index in down_ss.split(_X_train, _y_train):
                        _act_x_train, _act_y_train = _X_train[_val_index], _y_train[_val_index]
                else:
                    _act_x_train, _act_y_train = _X_train, _y_train
                    _val_index = list(range(len(_X_train)))

                _X_val, _y_val = _val_node.data

                config_dict = hpo_config.get_dictionary().copy()

                # Prepare training and initial params for the classifier.
                init_params, fit_params = {}, {}
                if data_node.enable_balance == 1:
                    init_params, fit_params = self.get_fit_params(_y_train, self.estimator_id)
                    for key, val in init_params.items():
                        config_dict[key] = val
                if 'sample_weight' in fit_params:
                    fit_params['sample_weight'] = fit_params['sample_weight'][_val_index]
                if data_node.data_balance == 1:
                    fit_params['data_balance'] = True

                classifier_id, clf = get_estimator(config_dict, self.estimator_id)

                if self.onehot_encoder is None:
                    self.onehot_encoder = OneHotEncoder(categories='auto')
                    y = np.reshape(_y_train, (len(_y_train), 1))
                    self.onehot_encoder.fit(y)

                score = validation(clf, self.scorer, _act_x_train, _act_y_train, _X_val, _y_val,
                                   random_state=self.seed,
                                   onehot=self.onehot_encoder if isinstance(
                                       self.scorer, _ThresholdScorer) else None,
                                   fit_params=fit_params)

                # TODO: Only save models trained with maximum resources.
                if 'rw_lock' not in kwargs or kwargs['rw_lock'] is None:
                    self.logger.info('rw_lock not defined! Possible read-write conflicts may happen!')
                lock = kwargs.get('rw_lock', Lock())
                lock.acquire()
                if np.isfinite(score) and downsample_ratio == 1:
                    save_flag, model_path, delete_flag, model_path_deleted = \
                        self.topk_model_saver.add(hpo_config, fe_config, score, classifier_id)
                    if save_flag is True:
                        with open(model_path, 'wb') as f:
                            pkl.dump([op_list, clf], f)
                        self.logger.info("Model saved to %s" % model_path)
                    self.topk_model_saver.save_topk_config()
                    try:
                        if delete_flag and os.path.exists(model_path_deleted):
                            os.remove(model_path_deleted)
                            self.logger.info("Model deleted from %s" % model_path_deleted)
                    except:
                        pass
                lock.release()
        except Exception as e:
            import traceback
            traceback.print_exc()
            self.logger.info('Evaluator: %s' % (str(e)))
            score = -np.inf

    else:
        raise ValueError('Invalid resampling strategy: %s!' % self.resampling_strategy)

    try:
        self.logger.info('Evaluation<%s> | Score: %.4f | Time cost: %.2f seconds | Shape: %s' %
                         (classifier_id, self.scorer._sign * score,
                          time.time() - start_time, _X_train.shape))
    except:
        pass

    # Turn it into a minimization problem.
    return_dict['objective_value'] = -score
    return -score
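# --- Illustrative note (an assumption about why the encoder exists): the
# one-hot encoder built above is only handed to validation() for threshold
# scorers such as ROC AUC, which need binarized labels. A plain scikit-learn
# equivalent of the encoding step:
#
#     from sklearn.preprocessing import OneHotEncoder
#     import numpy as np
#     enc = OneHotEncoder(categories='auto')
#     y = np.array([0, 1, 2, 1]).reshape(-1, 1)
#     enc.fit(y)
#     y_onehot = enc.transform(y).toarray()   # shape (4, 3)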
def __init__(self, stats, ensemble_method: str, ensemble_size: int,
             task_type: int, metric: _BaseScorer, data_node, output_dir=None):
    self.stats = stats
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.metric = metric
    self.output_dir = output_dir
    self.node = data_node

    self.predictions = []
    self.train_labels = None
    self.timestamp = str(time.time())
    logger_name = 'EnsembleBuilder'
    self.logger = get_logger(logger_name)

    # Collect validation predictions from every candidate model.
    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (_, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, model = pkl.load(f)
            _node = self.node.copy_()
            _node = construct_node(_node, op_list)

            # TODO: Make the validation split size configurable.
            test_size = 0.33
            X, y = _node.data
            if self.task_type in CLS_TASKS:
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
            else:
                ss = ShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
            for train_index, val_index in ss.split(X, y):
                X_valid = X[val_index]
                y_valid = y[val_index]
            if self.train_labels is not None:
                assert (self.train_labels == y_valid).all()
            else:
                self.train_labels = y_valid

            if self.task_type in CLS_TASKS:
                y_valid_pred = model.predict_proba(X_valid)
            else:
                y_valid_pred = model.predict(X_valid)
            self.predictions.append(y_valid_pred)

    if len(self.predictions) < self.ensemble_size:
        self.ensemble_size = len(self.predictions)

    if ensemble_method == 'ensemble_selection':
        return

    # Select the base models that enter the ensemble.
    if task_type in CLS_TASKS:
        self.base_model_mask = choose_base_models_classification(
            np.array(self.predictions), self.ensemble_size)
    else:
        self.base_model_mask = choose_base_models_regression(
            np.array(self.predictions), np.array(y_valid), self.ensemble_size)
    self.ensemble_size = sum(self.base_model_mask)
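# --- Illustrative note: base_model_mask is a 0/1 vector over all candidate
# models, in stats iteration order; the fit()/predict() methods above skip
# entries whose mask is 0. With hypothetical values:
#
#     base_model_mask = [1, 0, 1, 1]          # models 0, 2 and 3 are kept
#     ensemble_size = sum(base_model_mask)    # -> 3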
def fit(self, data):
    # Split the training data into k folds: each base model is trained on the
    # phase-1 folds, and its held-out fold provides phase-2 features.
    if self.task_type in CLS_TASKS:
        kf = StratifiedKFold(n_splits=self.kfold)
    else:
        kf = KFold(n_splits=self.kfold)

    # Train the base models fold by fold.
    model_cnt = 0
    suc_cnt = 0
    feature_p2 = None
    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (config, _, path) in enumerate(model_to_eval):
            with open(path, 'rb') as f:
                op_list, model = pkl.load(f)
            _node = data.copy_()
            _node = construct_node(_node, op_list, mode='train')
            X, y = _node.data
            if self.base_model_mask[model_cnt] == 1:
                for j, (train, test) in enumerate(kf.split(X, y)):
                    x_p1, x_p2, y_p1, _ = X[train], X[test], y[train], y[test]
                    estimator = fetch_predict_estimator(
                        self.task_type, algo_id, config, x_p1, y_p1,
                        weight_balance=data.enable_balance,
                        data_balance=data.data_balance,
                        combined=True)
                    with open(os.path.join(self.output_dir,
                                           '%s-model%d_part%d' % (self.timestamp, model_cnt, j)),
                              'wb') as f:
                        pkl.dump(estimator, f)
                    if self.task_type in CLS_TASKS:
                        pred = estimator.predict_proba(x_p2)
                        n_dim = np.array(pred).shape[1]
                        if n_dim == 2:
                            # Binary classification: one probability column is enough.
                            n_dim = 1
                        # Initialize the phase-2 training matrix on first use.
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        if n_dim == 1:
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred[:, 1:2]
                        else:
                            feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                    else:
                        pred = estimator.predict(x_p2).reshape(-1, 1)
                        n_dim = 1
                        # Initialize the phase-2 training matrix on first use.
                        if feature_p2 is None:
                            num_samples = len(train) + len(test)
                            feature_p2 = np.zeros((num_samples, self.ensemble_size * n_dim))
                        feature_p2[test, suc_cnt * n_dim:(suc_cnt + 1) * n_dim] = pred
                suc_cnt += 1
            model_cnt += 1

    # Train the stacking meta-learner on the out-of-fold features.
    self.meta_learner.fit(feature_p2, y)
    return self
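# --- Illustrative sketch: the k-fold loop above builds out-of-fold
# predictions, so every training sample receives phase-2 features from an
# estimator that never saw it. The same idea in plain scikit-learn (the
# stand-in estimator here is an assumption, not the one used by this module):
#
#     from sklearn.model_selection import cross_val_predict
#     from sklearn.ensemble import RandomForestClassifier
#     oof_pred = cross_val_predict(RandomForestClassifier(), X, y,
#                                  cv=5, method='predict_proba')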