def compute_feature_importance(self, X, y, features_to_use=None, preprocess=True, is_oof=True, silent=False, **kwargs) -> pd.Series:
    """Combine the feature importances of every child model into one Series.

    Each child listed in ``self.models`` is asked for its feature importance
    on a slice of ``X``/``y``; the per-child results are then averaged,
    weighted by the number of rows each child was evaluated on.

    Parameters
    ----------
    X, y
        Data to evaluate on. Both are indexed positionally via ``.iloc``.
    features_to_use, preprocess, silent, **kwargs
        Forwarded unchanged to each child's ``compute_feature_importance``.
    is_oof : bool
        If True, each child is evaluated only on the test rows of the fold
        produced by ``generate_kfold`` for its position; this requires
        ``self.bagged_mode`` to be truthy. If False, every child is evaluated
        on all rows of ``X``.

    Returns
    -------
    pd.Series
        Row-count-weighted importance per feature, sorted descending.

    Raises
    ------
    AssertionError
        If ``is_oof`` is True while ``self.bagged_mode`` is falsy.
    """
    per_child_importance = []
    rows_per_child = []

    # TODO: Preprocess data here instead of repeatedly
    first_child_of_repeat = 0
    for repeat_idx, n_splits in enumerate(self._k_per_n_repeat):
        if not is_oof:
            # Not out-of-fold: every child of this repeat sees the full data.
            every_row = list(range(len(X)))
            repeat_folds = [(None, every_row)] * n_splits
        else:
            if not self.bagged_mode:
                raise AssertionError(
                    'Model trained with no validation data cannot get feature importances on training data, please specify new test data to compute feature importances (model=%s)' % self.name)
            # Splits are regenerated for repeats 0..repeat_idx; only the
            # slice belonging to the current repeat is used.
            all_folds = generate_kfold(
                X=X, y=y, n_splits=n_splits, stratified=self.is_stratified(),
                random_state=self._random_state, n_repeats=repeat_idx + 1)
            repeat_folds = all_folds[repeat_idx * n_splits:(repeat_idx + 1) * n_splits]

        for offset, (_, test_index) in enumerate(repeat_folds):
            child = self.load_child(self.models[first_child_of_repeat + offset])
            child_importance = child.compute_feature_importance(
                X=X.iloc[test_index, :], y=y.iloc[test_index],
                features_to_use=features_to_use, preprocess=preprocess,
                silent=silent, **kwargs)
            per_child_importance.append(child_importance)
            rows_per_child.append(len(test_index))
        first_child_of_repeat += n_splits

    # Weight each child's importances by its share of the evaluated rows.
    total_rows = sum(rows_per_child)
    weights = [n_rows / total_rows for n_rows in rows_per_child]
    weighted_importance = [
        importance * weight
        for importance, weight in zip(per_child_importance, weights)
    ]

    combined = pd.concat(weighted_importance, axis=1, sort=True)
    feature_importance = combined.sum(1).sort_values(ascending=False)

    # TODO: Consider utilizing z scores and stddev to make threshold decisions
    # stddev = pd.concat(feature_importance_fold_list, axis=1, sort=True).std(1).sort_values(ascending=False)
    # feature_importance_df = pd.DataFrame(index=feature_importance.index)
    # feature_importance_df['importance'] = feature_importance
    # feature_importance_df['stddev'] = stddev
    # feature_importance_df['z'] = feature_importance_df['importance'] / feature_importance_df['stddev']

    return feature_importance
def _hyperparameter_tune(self, X_train, y_train, k_fold, scheduler_options=None, preprocess_kwargs=None, **kwargs): if len(self.models) != 0: raise ValueError( 'self.models must be empty to call hyperparameter_tune, value: %s' % self.models) self.model_base.feature_metadata = self.feature_metadata # TODO: Move this # TODO: Preprocess data here instead of repeatedly if preprocess_kwargs is None: preprocess_kwargs = dict() X_train = self.preprocess(X=X_train, preprocess=False, fit=True, **preprocess_kwargs) kfolds = generate_kfold(X=X_train, y=y_train, n_splits=k_fold, stratified=self.is_stratified(), random_state=self._random_state, n_repeats=1) train_index, test_index = kfolds[0] X_train_fold, X_val_fold = X_train.iloc[train_index, :], X_train.iloc[ test_index, :] y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[ test_index] orig_time = scheduler_options[1]['time_out'] scheduler_options[1][ 'time_out'] = orig_time * 0.8 # TODO: Scheduler doesn't early stop on final model, this is a safety net. 
Scheduler should be updated to early stop hpo_models, hpo_model_performances, hpo_results = self.model_base.hyperparameter_tune( X_train=X_train_fold, y_train=y_train_fold, X_val=X_val_fold, y_val=y_val_fold, scheduler_options=scheduler_options, **kwargs) scheduler_options[1]['time_out'] = orig_time bags = {} bags_performance = {} for i, (model_name, model_path) in enumerate(hpo_models.items()): child: AbstractModel = self._child_type.load(path=model_path) y_pred_proba = child.predict_proba(X_val_fold) # TODO: Create new Ensemble Here bag = copy.deepcopy(self) bag.name = bag.name + os.path.sep + str(i) bag.set_contexts(self.path_root + bag.name + os.path.sep) oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof( X=X_train, y=y_train) oof_pred_proba[test_index] += y_pred_proba oof_pred_model_repeats[test_index] += 1 bag.model_base = None child.set_contexts(bag.path + child.name + os.path.sep) bag.save_model_base(child.convert_to_template()) bag._k = k_fold bag._k_fold_end = 1 bag._n_repeats = 1 bag._oof_pred_proba = oof_pred_proba bag._oof_pred_model_repeats = oof_pred_model_repeats child.name = child.name + '_fold_0' child.set_contexts(bag.path + child.name + os.path.sep) if not self.save_bagged_folds: child.model = None if bag.low_memory: bag.save_child(child, verbose=False) bag.models.append(child.name) else: bag.models.append(child) bag.val_score = child.val_score bag._add_child_times_to_bag(model=child) bag.save() bags[bag.name] = bag.path bags_performance[bag.name] = bag.val_score # TODO: hpo_results likely not correct because no renames return bags, bags_performance, hpo_results
def compute_feature_importance(self, X, y, features=None, is_oof=True, time_limit=None, silent=False, **kwargs) -> pd.DataFrame:
    """Compute feature importance from the child models' out-of-fold data.

    Each child is evaluated on the test rows of its own fold (as produced by
    ``generate_kfold``) with ``importance_as_list=True``; the per-feature
    importance lists of all completed children are concatenated and passed to
    ``_compute_fi_with_stddev``.

    Parameters
    ----------
    X, y
        Data the bag was trained on; indexed positionally via ``.iloc``.
    features : list, optional
        Features to evaluate. Defaults to the ``features`` attribute of the
        first child model.
    is_oof : bool
        If False, the call is delegated unchanged to the parent class.
    time_limit : float, optional
        Total budget in seconds. Each child receives an equal share, and the
        loop stops early when the remaining time is less than 1.1x the average
        time taken per completed child.
    silent : bool
        Suppresses the start/finish log messages when True.
    **kwargs
        Forwarded to each child's ``compute_feature_importance``.

    Returns
    -------
    pd.DataFrame
        The result of ``_compute_fi_with_stddev`` on the aggregated lists.

    Raises
    ------
    AssertionError
        If ``is_oof`` is True while ``self.bagged_mode`` is falsy. This is
        checked up front, before any logging or child evaluation.
    """
    if features is None:
        features = self.load_child(model=self.models[0]).features
    if not is_oof:
        return super().compute_feature_importance(X, y, features=features, time_limit=time_limit, silent=silent, **kwargs)
    if not self.bagged_mode:
        # Loop-invariant precondition: fail before logging or doing any work.
        raise AssertionError(
            'Model trained with no validation data cannot get feature importances on training data, please specify new test data to compute feature importances (model=%s)' % self.name)

    fi_fold_list = []
    model_index = 0
    num_children = len(self.models)
    time_limit_per_child = time_limit / num_children if time_limit is not None else None

    if not silent:
        logging_message = f'Computing feature importance via permutation shuffling for {len(features)} features using out-of-fold (OOF) data aggregated across {num_children} child models...'
        if time_limit is not None:
            logging_message = f'{logging_message} Time limit: {time_limit}s...'
        logger.log(20, logging_message)

    time_start = time.time()
    early_stop = False
    children_completed = 0
    log_final_suffix = ''
    for n_repeat, k in enumerate(self._k_per_n_repeat):
        # Splits are regenerated for repeats 0..n_repeat; only the slice that
        # belongs to the current repeat is used.
        kfolds = generate_kfold(
            X=X, y=y, n_splits=k, stratified=self.is_stratified(),
            random_state=self._random_state, n_repeats=n_repeat + 1)
        cur_kfolds = kfolds[n_repeat * k:(n_repeat + 1) * k]

        for i, (_, test_index) in enumerate(cur_kfolds):
            model = self.load_child(self.models[model_index + i])
            fi_fold = model.compute_feature_importance(
                X=X.iloc[test_index, :], y=y.iloc[test_index],
                features=features, time_limit=time_limit_per_child,
                silent=silent, log_prefix='\t', importance_as_list=True,
                **kwargs)
            fi_fold_list.append(fi_fold)

            children_completed += 1
            if time_limit is not None and children_completed != num_children:
                time_now = time.time()
                time_left = time_limit - (time_now - time_start)
                time_child_average = (time_now - time_start) / children_completed
                # Stop if another average-length child (plus 10% slack)
                # would not fit in the remaining budget.
                if time_left < (time_child_average * 1.1):
                    log_final_suffix = ' (Early stopping due to lack of time...)'
                    early_stop = True
                    break
        if early_stop:
            break
        model_index += k

    # TODO: DON'T THROW AWAY SAMPLES! USE LARGER N
    # TODO: Don't throw away stddev information of children
    fi_list_dict = dict()
    for fi_fold in fi_fold_list:
        for feature, importances in fi_fold['importance'].to_dict().items():
            fi_list_dict.setdefault(feature, []).extend(importances)
    fi_df = _compute_fi_with_stddev(fi_list_dict)

    if not silent:
        logger.log(20, f'\t{round(time.time() - time_start, 2)}s\t= Actual runtime (Completed {children_completed} of {num_children} children){log_final_suffix}')

    return fi_df
def _fit(self, X_train, y_train, k_fold=5, k_fold_start=0, k_fold_end=None, n_repeats=1, n_repeat_start=0, time_limit=None, **kwargs):
    """Fit child models on k-fold splits and accumulate their out-of-fold predictions.

    With ``k_fold == 1`` (values below 1 are clamped to 1) a single model is
    fit on all of ``X_train``, its predictions on ``X_train`` are stored as the
    OOF predictions, ``bagged_mode`` is set to False and the method returns.

    Otherwise one deep copy of the base model is fit for every fold index in
    ``[fold_start, fold_end)`` of the splits returned by ``generate_kfold``.
    Each fold model's predictions on its validation rows are added into the
    OOF arrays, and the fitted models are appended to ``self.models`` once all
    requested folds have finished.

    Parameters
    ----------
    X_train, y_train
        Training data; indexed positionally via ``.iloc``.
    k_fold : int
        Number of folds per repeat.
    k_fold_start, k_fold_end : int
        Half-open fold range to fit within the current repeat. ``k_fold_end``
        defaults to ``k_fold``; ``k_fold_start`` must equal the stored
        ``self._k_fold_end``.
    n_repeats : int
        Total number of repeats that should be complete (or in progress)
        after this call.
    n_repeat_start : int
        Must equal ``self._n_repeats_finished``.
    time_limit : float, optional
        Budget in seconds for this call. Each fold gets 80% of the remaining
        time divided by the remaining folds.
    **kwargs
        Forwarded to each model's ``fit``.

    Raises
    ------
    ValueError
        If the fold/repeat arguments are inconsistent with each other or with
        the state stored on the bag.
    TimeLimitExceeded
        If ``time_limit`` is set and the elapsed time, or the time projected
        for the remaining folds, exceeds what is left.
    """
    if k_fold < 1:
        k_fold = 1
    if k_fold_end is None:
        k_fold_end = k_fold

    # Resuming a partially fit bag without OOF arrays on the instance.
    # NOTE(review): presumably _load_oof restores previously saved OOF arrays - confirm.
    if self._oof_pred_proba is None and (k_fold_start != 0 or n_repeat_start != 0):
        self._load_oof()
    if n_repeat_start != self._n_repeats_finished:
        raise ValueError(f'n_repeat_start must equal self._n_repeats_finished, values: ({n_repeat_start}, {self._n_repeats_finished})')
    if n_repeats <= n_repeat_start:
        raise ValueError(f'n_repeats must be greater than n_repeat_start, values: ({n_repeats}, {n_repeat_start})')
    if k_fold_start != self._k_fold_end:
        raise ValueError(f'k_fold_start must equal previous k_fold_end, values: ({k_fold_start}, {self._k_fold_end})')
    if k_fold_start >= k_fold_end:
        # TODO: Remove this limitation if n_repeats > 1
        raise ValueError(f'k_fold_end must be greater than k_fold_start, values: ({k_fold_end}, {k_fold_start})')
    if (n_repeats - n_repeat_start) > 1 and k_fold_end != k_fold:
        # TODO: Remove this limitation
        raise ValueError(f'k_fold_end must equal k_fold when (n_repeats - n_repeat_start) > 1, values: ({k_fold_end}, {k_fold})')
    if self._k is not None and self._k != k_fold:
        raise ValueError(f'k_fold must equal previously fit k_fold value for the current n_repeat, values: (({k_fold}, {self._k})')
    # Absolute fold indices into the flat list of splits across all repeats.
    fold_start = n_repeat_start * k_fold + k_fold_start
    fold_end = (n_repeats - 1) * k_fold + k_fold_end
    time_start = time.time()

    model_base = self._get_model_base()
    if self.features is not None:
        model_base.features = self.features
    model_base.feature_metadata = self.feature_metadata  # TODO: Don't pass this here

    # Hand the in-memory template to save_model_base and drop it from the instance.
    if self.model_base is not None:
        self.save_model_base(self.model_base)
        self.model_base = None

    if k_fold == 1:
        # Single-model path: no folds, so the model is fit on all of X_train.
        if self._n_repeats != 0:
            raise ValueError(f'n_repeats must equal 0 when fitting a single model with k_fold < 2, values: ({self._n_repeats}, {k_fold})')
        model_base.set_contexts(path_context=self.path + model_base.name + os.path.sep)
        time_start_fit = time.time()
        model_base.fit(X_train=X_train, y_train=y_train, time_limit=time_limit, **kwargs)
        model_base.fit_time = time.time() - time_start_fit
        model_base.predict_time = None
        self._oof_pred_proba = model_base.predict_proba(X=X_train)  # TODO: Cheater value, will be overfit to valid set
        self._oof_pred_model_repeats = np.ones(shape=len(X_train), dtype=np.uint8)
        self._n_repeats = 1
        self._n_repeats_finished = 1
        self._k_per_n_repeat = [1]
        self.bagged_mode = False
        model_base.reduce_memory_size(remove_fit=True, remove_info=False, requires_save=True)
        if not self.save_bagged_folds:
            model_base.model = None
        # In low-memory mode the child is saved and referenced by name only.
        if self.low_memory:
            self.save_child(model_base, verbose=False)
            self.models = [model_base.name]
        else:
            self.models = [model_base]
        self._add_child_times_to_bag(model=model_base)
        return

    # TODO: Preprocess data here instead of repeatedly
    kfolds = generate_kfold(X=X_train, y=y_train, n_splits=k_fold, stratified=self.is_stratified(), random_state=self._random_state, n_repeats=n_repeats)

    # Local accumulators for this call; merged into the instance state at the end.
    oof_pred_proba, oof_pred_model_repeats = self._construct_empty_oof(X=X_train, y=y_train)

    models = []
    folds_to_fit = fold_end - fold_start
    for j in range(n_repeat_start, n_repeats):  # For each n_repeat
        cur_repeat_count = j - n_repeat_start
        fold_start_n_repeat = fold_start + cur_repeat_count * k_fold
        fold_end_n_repeat = min(fold_start_n_repeat + k_fold, fold_end)
        # TODO: Consider moving model fit inner for loop to a function to simplify this code
        for i in range(fold_start_n_repeat, fold_end_n_repeat):  # For each fold
            folds_finished = i - fold_start
            folds_left = fold_end - i
            fold = kfolds[i]
            time_elapsed = time.time() - time_start
            if time_limit is not None:
                time_left = time_limit - time_elapsed
                required_time_per_fold = time_left / folds_left
                # Give the fold 80% of its even share of the remaining time.
                time_limit_fold = required_time_per_fold * 0.8
                if folds_finished > 0:
                    # Extrapolate from the average pace so far; abort before
                    # fitting if the remaining folds are projected not to fit.
                    expected_time_required = time_elapsed * folds_to_fit / folds_finished
                    expected_remaining_time_required = expected_time_required * folds_left / folds_to_fit
                    if expected_remaining_time_required > time_left:
                        raise TimeLimitExceeded
                if time_left <= 0:
                    raise TimeLimitExceeded
            else:
                time_limit_fold = None

            time_start_fold = time.time()
            train_index, val_index = fold
            X_train_fold, X_val_fold = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
            # Each fold gets its own copy of the template, named by absolute fold index.
            fold_model = copy.deepcopy(model_base)
            fold_model.name = f'{fold_model.name}_fold_{i}'
            fold_model.set_contexts(self.path + fold_model.name + os.path.sep)
            fold_model.fit(X_train=X_train_fold, y_train=y_train_fold, X_val=X_val_fold, y_val=y_val_fold, time_limit=time_limit_fold, **kwargs)
            time_train_end_fold = time.time()
            if time_limit is not None:  # Check to avoid unnecessarily predicting and saving a model when an Exception is going to be raised later
                if i != (fold_end - 1):
                    time_elapsed = time.time() - time_start
                    time_left = time_limit - time_elapsed
                    expected_time_required = time_elapsed * folds_to_fit / (folds_finished + 1)
                    expected_remaining_time_required = expected_time_required * (folds_left - 1) / folds_to_fit
                    if expected_remaining_time_required > time_left:
                        raise TimeLimitExceeded
            pred_proba = fold_model.predict_proba(X_val_fold)
            time_predict_end_fold = time.time()
            fold_model.fit_time = time_train_end_fold - time_start_fold
            fold_model.predict_time = time_predict_end_fold - time_train_end_fold
            fold_model.val_score = fold_model.score_with_y_pred_proba(y=y_val_fold, y_pred_proba=pred_proba)
            fold_model.reduce_memory_size(remove_fit=True, remove_info=False, requires_save=True)
            if not self.save_bagged_folds:
                fold_model.model = None
            # In low-memory mode the child is saved and referenced by name only.
            if self.low_memory:
                self.save_child(fold_model, verbose=False)
                models.append(fold_model.name)
            else:
                models.append(fold_model)
            oof_pred_proba[val_index] += pred_proba
            oof_pred_model_repeats[val_index] += 1
            self._add_child_times_to_bag(model=fold_model)
        # Record k for this repeat when it is not the last repeat of this call,
        # or when the call ends on a full repeat (k_fold_end == k_fold).
        if (fold_end_n_repeat != fold_end) or (k_fold == k_fold_end):
            self._k_per_n_repeat.append(k_fold)
    # Reached only if no TimeLimitExceeded was raised above.
    self.models += models

    self.bagged_mode = True

    # Merge this call's OOF contributions with any from earlier calls.
    if self._oof_pred_proba is None:
        self._oof_pred_proba = oof_pred_proba
        self._oof_pred_model_repeats = oof_pred_model_repeats
    else:
        self._oof_pred_proba += oof_pred_proba
        self._oof_pred_model_repeats += oof_pred_model_repeats

    self._n_repeats = n_repeats
    if k_fold == k_fold_end:
        # The last repeat is complete: reset the per-repeat fold bookkeeping.
        self._k = None
        self._k_fold_end = 0
        self._n_repeats_finished = self._n_repeats
    else:
        # The last repeat is only partially fit: remember where to resume.
        self._k = k_fold
        self._k_fold_end = k_fold_end
        self._n_repeats_finished = self._n_repeats - 1
def hyperparameter_tune(self, X, y, k_fold, scheduler_options=None, compute_base_preds=True, **kwargs):
    """Run HPO on the base model using the first k-fold split and wrap each result in its own stacker.

    Stack-column feature metadata is built and merged into
    ``self.feature_metadata``. The data is then preprocessed once and split
    with ``generate_kfold``; only the first (train, test) split is used for
    tuning. Every model returned by the base model's ``hyperparameter_tune``
    is loaded, used to predict the held-out rows, and stored as
    ``<name>_fold_0`` inside a deep copy of this stacker.

    Parameters
    ----------
    X, y
        Training data; indexed positionally via ``.iloc`` after preprocessing.
    k_fold : int
        Number of splits passed to ``generate_kfold``; also stored on each
        resulting stacker as ``_k``.
    scheduler_options
        Indexed as ``scheduler_options[1]['time_out']``, so despite the
        ``None`` default a value must be supplied. ``time_out`` is reduced to
        80% for the duration of the HPO call and is always restored, even if
        the HPO call raises.
    compute_base_preds : bool
        Forwarded to ``self.preprocess``.
    **kwargs
        Forwarded to ``self.model_base.hyperparameter_tune``.

    Returns
    -------
    tuple
        ``(stackers, stackers_performance, hpo_results)`` where ``stackers``
        maps stacker name to stacker path and ``stackers_performance`` maps
        stacker name to ``val_score``.

    Raises
    ------
    ValueError
        If ``self.models`` is not empty.
    """
    if len(self.models) != 0:
        raise ValueError('self.models must be empty to call hyperparameter_tune, value: %s' % self.models)

    # self.models is guaranteed empty here, so the stack-column metadata is
    # always (re)built.
    type_map_raw = {column: R_FLOAT for column in self.stack_columns}
    type_group_map_special = {S_STACK: self.stack_columns}
    stacker_feature_metadata = FeatureMetadata(type_map_raw=type_map_raw, type_group_map_special=type_group_map_special)
    if self.feature_metadata is None:  # TODO: This is probably not the best way to do this
        self.feature_metadata = stacker_feature_metadata
    else:
        self.feature_metadata = self.feature_metadata.join_metadata(stacker_feature_metadata)
    self.model_base.feature_metadata = self.feature_metadata  # TODO: Move this

    # TODO: Preprocess data here instead of repeatedly
    X = self.preprocess(X=X, preprocess=False, fit=True, compute_base_preds=compute_base_preds)
    kfolds = generate_kfold(
        X=X, y=y, n_splits=k_fold, stratified=self.is_stratified(),
        random_state=self._random_state, n_repeats=1)

    # Only the first split is used for tuning.
    train_index, test_index = kfolds[0]
    X_train, X_val = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    orig_time = scheduler_options[1]['time_out']
    # TODO: Scheduler doesn't early stop on final model, this is a safety net.
    #  Scheduler should be updated to early stop
    scheduler_options[1]['time_out'] = orig_time * 0.8
    try:
        hpo_models, _, hpo_results = self.model_base.hyperparameter_tune(
            X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val,
            scheduler_options=scheduler_options, **kwargs)
    finally:
        # The dict belongs to the caller: restore it even when HPO fails.
        scheduler_options[1]['time_out'] = orig_time

    stackers = {}
    stackers_performance = {}
    for i, model_path in enumerate(hpo_models.values()):
        child: AbstractModel = self._child_type.load(path=model_path)
        y_pred_proba = child.predict_proba(X_val)

        # TODO: Create new StackerEnsemble Here
        stacker = copy.deepcopy(self)
        stacker.name = stacker.name + os.path.sep + str(i)
        stacker.set_contexts(self.path_root + stacker.name + os.path.sep)

        # OOF arrays hold predictions only for the held-out rows of fold 0.
        if self.problem_type == MULTICLASS:
            oof_pred_proba = np.zeros(shape=(len(X), len(y.unique())))
        else:
            oof_pred_proba = np.zeros(shape=len(X))
        oof_pred_model_repeats = np.zeros(shape=len(X))
        oof_pred_proba[test_index] += y_pred_proba
        oof_pred_model_repeats[test_index] += 1

        stacker.model_base = None
        child.set_contexts(stacker.path + child.name + os.path.sep)
        stacker.save_model_base(child.convert_to_template())

        stacker._k = k_fold
        stacker._k_fold_end = 1
        stacker._n_repeats = 1
        stacker._oof_pred_proba = oof_pred_proba
        stacker._oof_pred_model_repeats = oof_pred_model_repeats

        child.name = child.name + '_fold_0'
        child.set_contexts(stacker.path + child.name + os.path.sep)
        if not self.save_bagged_folds:
            child.model = None
        if stacker.low_memory:
            stacker.save_child(child, verbose=False)
            stacker.models.append(child.name)
        else:
            stacker.models.append(child)
        stacker.val_score = child.val_score
        stacker._add_child_times_to_bag(model=child)

        stacker.save()
        stackers[stacker.name] = stacker.path
        stackers_performance[stacker.name] = stacker.val_score

    # TODO: hpo_results likely not correct because no renames
    return stackers, stackers_performance, hpo_results