def cross_validate(
    self,
    *,
    X: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.DataFrame, np.ndarray],
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    **kwargs,
):
    """
    Run Kfold cross validation on the model, and will update the model's
    threshold values based on a percentile of the validation metrics.

    The out-of-fold predictions of every fold model are gathered into one
    frame aligned with ``y``; the thresholds are then derived from those
    predictions by ``_calculate_threshold`` and
    ``_calculate_feature_thresholds``.

    Parameters
    ----------
    X: Union[pd.DataFrame, np.ndarray]
        Input data to the model
    y: Union[pd.DataFrame, np.ndarray]
        Target data
    cv
        Cross validation splitter. It is forwarded to the cross validation
        call and its ``split(X, y)`` method is invoked again here to
        recover the test indices of each fold.
    kwargs: dict
        Any additional kwargs to be passed to
        :func:`sklearn.model_selection.cross_validate`

    Returns
    -------
    dict
        The cross validation output, including the fitted fold models
        under the ``"estimator"`` key.

    Notes
    -----
    Sets ``aggregate_threshold_`` and ``feature_thresholds_`` on ``self``.
    """
    # Depend on having the trained fold models
    kwargs.update(dict(return_estimator=True, cv=cv))
    cv_output = c_val(self, X=X, y=y, **kwargs)

    # Buffer for the out-of-fold predictions. It must be able to hold float
    # predictions even when the targets are integer typed, so any
    # non-floating buffer is promoted; floating targets keep their dtype.
    pred_buffer = np.zeros_like(y)
    if not np.issubdtype(pred_buffer.dtype, np.floating):
        pred_buffer = pred_buffer.astype(float)
    y_pred = pd.DataFrame(
        pred_buffer,
        index=getattr(y, "index", None),
        columns=getattr(y, "columns", None),
    )
    y = pd.DataFrame(y)

    # Explicit dtype: a data-less Series should not rely on pandas' default.
    y_val_mse = pd.Series(index=y.index, dtype=float)

    # Calculate per-fold validation metrics.
    # NOTE(review): the splitter is run a second time here, which assumes it
    # yields the same folds as it did inside the cross validation call
    # (i.e. the splitter is deterministic) - confirm for custom ``cv``.
    fold_splits = kwargs["cv"].split(X, y)
    for (_, test_idxs), split_model in zip(fold_splits, cv_output["estimator"]):
        if isinstance(X, pd.DataFrame):
            X_test = X.iloc[test_idxs].to_numpy()
        else:
            X_test = X[test_idxs]

        y_pred.iloc[test_idxs] = split_model.predict(X_test)
        y_val_mse.iloc[test_idxs] = self._scaled_mse_per_timestep(
            split_model, y.iloc[test_idxs], y_pred.iloc[test_idxs]
        ).to_numpy()

    # Calculate aggregate threshold
    self.aggregate_threshold_ = self._calculate_threshold(y_val_mse)

    # Calculate tag thresholds
    self.feature_thresholds_ = self._calculate_feature_thresholds(y, y_pred)

    return cv_output
def cross_validate(
    self,
    *,
    X: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.DataFrame, np.ndarray],
    cv=TimeSeriesSplit(n_splits=3),
    **kwargs,
):
    """
    Run TimeSeries cross validation on the model, and will update the model's
    threshold values based on the cross validation folds.

    For every fold the fold model predicts on the test indices, and the
    thresholds of that fold are the maximum of the rolling minimum of the
    errors (scaled mse per timestep for the aggregate threshold, absolute
    error per feature for the feature thresholds). The final thresholds are
    those of the last fold.

    Parameters
    ----------
    X: Union[pd.DataFrame, np.ndarray]
        Input data to the model
    y: Union[pd.DataFrame, np.ndarray]
        Target data
    cv
        Cross validation splitter. It is forwarded to the cross validation
        call and its ``split(X, y)`` method is invoked again here to
        recover the test indices of each fold.
    kwargs: dict
        Any additional kwargs to be passed to
        :func:`sklearn.model_selection.cross_validate`

    Returns
    -------
    dict
        The cross validation output, including the fitted fold models
        under the ``"estimator"`` key.

    Notes
    -----
    Sets on ``self``: ``feature_thresholds_per_fold_``,
    ``aggregate_thresholds_per_fold_``,
    ``smooth_feature_thresholds_per_fold_``,
    ``smooth_aggregate_thresholds_per_fold_``, ``feature_thresholds_``,
    ``aggregate_threshold_``, ``smooth_feature_thresholds_`` and
    ``smooth_aggregate_threshold_``. The ``smooth_*`` values stay
    empty / ``None`` when ``self.window`` is ``None``.
    """
    # Depend on having the trained fold models
    kwargs.update(dict(return_estimator=True, cv=cv))
    cv_output = c_val(self, X=X, y=y, **kwargs)

    self.feature_thresholds_per_fold_ = pd.DataFrame()
    self.aggregate_thresholds_per_fold_ = {}
    self.smooth_feature_thresholds_per_fold_ = pd.DataFrame()
    self.smooth_aggregate_thresholds_per_fold_ = {}

    # Per-fold feature thresholds are collected as named Series and turned
    # into frames once after the loop. ``DataFrame.append`` no longer exists
    # in pandas >= 2.0, and building the frame once avoids re-copying it for
    # every fold.
    feature_rows = []
    smooth_feature_rows = []

    smooth_aggregate_threshold_fold = None
    smooth_tag_thresholds_fold = None

    # NOTE(review): the splitter is run a second time here, which assumes it
    # yields the same folds as it did inside the cross validation call.
    fold_splits = kwargs["cv"].split(X, y)
    for i, ((_, test_idxs), split_model) in enumerate(
        zip(fold_splits, cv_output["estimator"])
    ):
        fold_name = f"fold-{i}"

        y_pred = split_model.predict(
            X.iloc[test_idxs] if isinstance(X, pd.DataFrame) else X[test_idxs]
        )

        # Adjust y_true for any possible model offset in its prediction:
        # keep only the trailing indices that have a matching prediction.
        test_idxs = test_idxs[-len(y_pred):]
        y_true = y.iloc[test_idxs] if isinstance(y, pd.DataFrame) else y[test_idxs]

        # Model's timestep scaled mse over all features
        scaled_mse = self._scaled_mse_per_timestep(split_model, y_true, y_pred)

        # Absolute error
        mae = self._absolute_error(y_true, y_pred)

        # For the aggregate threshold for the fold model,
        # use the mse of scaled residuals per timestep
        aggregate_threshold_fold = scaled_mse.rolling(6).min().max()
        self.aggregate_thresholds_per_fold_[fold_name] = aggregate_threshold_fold

        # Per-feature thresholds: max of the rolling min of absolute errors
        tag_thresholds_fold = mae.rolling(6).min().max()
        tag_thresholds_fold.name = fold_name
        feature_rows.append(tag_thresholds_fold)

        if self.window is not None:
            # Smoothed thresholds use ``self.window`` as the rolling window.
            # There is no explicit length check: pandas' rolling minimum is
            # NaN until a full window is available, so a fold shorter than
            # the window is expected to produce NaN thresholds.
            smooth_aggregate_threshold_fold = (
                scaled_mse.rolling(self.window).min().max()
            )
            self.smooth_aggregate_thresholds_per_fold_[
                fold_name
            ] = smooth_aggregate_threshold_fold

            smooth_tag_thresholds_fold = mae.rolling(self.window).min().max()
            smooth_tag_thresholds_fold.name = fold_name
            smooth_feature_rows.append(smooth_tag_thresholds_fold)

    # One row per fold (labelled "fold-<i>"), one column per feature.
    if feature_rows:
        self.feature_thresholds_per_fold_ = pd.DataFrame(feature_rows)
    if smooth_feature_rows:
        self.smooth_feature_thresholds_per_fold_ = pd.DataFrame(
            smooth_feature_rows
        )

    # Final thresholds are the thresholds from the last cv split/fold
    self.feature_thresholds_ = tag_thresholds_fold

    # For the aggregate also use the thresholds from the last split/fold
    self.aggregate_threshold_ = aggregate_threshold_fold

    # For the smoothed thresholds also use the last fold
    self.smooth_aggregate_threshold_ = smooth_aggregate_threshold_fold
    self.smooth_feature_thresholds_ = smooth_tag_thresholds_fold

    return cv_output