Example 1
    def cross_validate(
            self,
            *,
            X: Union[pd.DataFrame, np.ndarray],
            y: Union[pd.DataFrame, np.ndarray],
            cv=KFold(n_splits=5, shuffle=True, random_state=0),
            **kwargs,
    ):
        """
        Run k-fold cross-validation on the model and update the model's threshold
        values based on a percentile of the validation metrics.

        Parameters
        ----------
        X: Union[pd.DataFrame, np.ndarray]
            Input data to the model
        y: Union[pd.DataFrame, np.ndarray]
            Target data
        cv
            Cross-validation splitter passed to
            :func:`sklearn.model_selection.cross_validate`; defaults to a shuffled
            5-fold :class:`sklearn.model_selection.KFold`
        kwargs: dict
            Any additional kwargs to be passed to
            :func:`sklearn.model_selection.cross_validate`

        Returns
        -------
        dict
        """

        # Depend on having the trained fold models
        kwargs.update(dict(return_estimator=True, cv=cv))

        cv_output = c_val(self, X=X, y=y, **kwargs)

        # Create empty dataframes to hold fold data
        y_pred = pd.DataFrame(
            np.zeros_like(y),
            index=getattr(y, "index", None),
            columns=getattr(y, "columns", None),
        )
        y = pd.DataFrame(y)
        y_val_mse = pd.Series(index=getattr(y, "index", None))

        # Calculate per-fold validation metrics
        for i, ((_, test_idxs), split_model) in enumerate(
                zip(kwargs["cv"].split(X, y), cv_output["estimator"])):
            y_pred.iloc[test_idxs] = split_model.predict(
                X.iloc[test_idxs].to_numpy()
                if isinstance(X, pd.DataFrame) else X[test_idxs])

            y_val_mse.iloc[test_idxs] = self._scaled_mse_per_timestep(
                split_model, y.iloc[test_idxs],
                y_pred.iloc[test_idxs]).to_numpy()

        # Calculate aggregate threshold
        self.aggregate_threshold_ = self._calculate_threshold(y_val_mse)

        # Calculate tag thresholds
        self.feature_thresholds_ = self._calculate_feature_thresholds(
            y, y_pred)

        return cv_output
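
In this example, c_val is presumably sklearn.model_selection.cross_validate imported under an alias, and _scaled_mse_per_timestep / _calculate_threshold are helpers of the surrounding class. Below is a minimal, self-contained sketch of the same pattern (per-fold estimators via return_estimator=True, out-of-fold predictions collected per test split, and a per-timestep error reduced to a single threshold) using a toy LinearRegression; the data, column names, and the 99th-percentile reduction are illustrative assumptions, not part of the original class.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate

# Toy stand-ins for the model's X / y (illustrative only)
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 3)), columns=["a", "b", "c"])
y = pd.DataFrame(X.to_numpy() @ rng.normal(size=(3, 2)), columns=["t1", "t2"])

cv = KFold(n_splits=5, shuffle=True, random_state=0)
cv_output = cross_validate(LinearRegression(), X, y, cv=cv, return_estimator=True)

# Collect out-of-fold predictions from the per-fold estimators,
# mirroring the y_pred bookkeeping in the method above
y_pred = pd.DataFrame(np.zeros_like(y), index=y.index, columns=y.columns)
for (_, test_idxs), est in zip(cv.split(X, y), cv_output["estimator"]):
    y_pred.iloc[test_idxs] = est.predict(X.iloc[test_idxs])

# Per-timestep MSE across features, then a percentile as the aggregate threshold;
# the 99th percentile is an assumption, not the library's actual rule
per_timestep_mse = ((y - y_pred) ** 2).mean(axis=1)
aggregate_threshold = float(np.percentile(per_timestep_mse, 99))
print(aggregate_threshold)

Zipping cv.split(X, y) with cv_output["estimator"] only lines up fold-for-fold because the same splitter instance (with a fixed random_state) is reused, which is why the method forces cv into kwargs before calling cross_validate.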
Example 2
    def cross_validate(
            self,
            *,
            X: Union[pd.DataFrame, np.ndarray],
            y: Union[pd.DataFrame, np.ndarray],
            cv=TimeSeriesSplit(n_splits=3),
            **kwargs,
    ):
        """
        Run time-series cross-validation on the model and update the model's
        threshold values based on the cross-validation folds.

        Parameters
        ----------
        X: Union[pd.DataFrame, np.ndarray]
            Input data to the model
        y: Union[pd.DataFrame, np.ndarray]
            Target data
        cv
            Cross-validation splitter passed to
            :func:`sklearn.model_selection.cross_validate`; defaults to a 3-split
            :class:`sklearn.model_selection.TimeSeriesSplit`
        kwargs: dict
            Any additional kwargs to be passed to
            :func:`sklearn.model_selection.cross_validate`

        Returns
        -------
        dict
        """
        # Depend on having the trained fold models
        kwargs.update(dict(return_estimator=True, cv=cv))

        cv_output = c_val(self, X=X, y=y, **kwargs)

        self.feature_thresholds_per_fold_ = pd.DataFrame()
        self.aggregate_thresholds_per_fold_ = {}
        self.smooth_feature_thresholds_per_fold_ = pd.DataFrame()
        self.smooth_aggregate_thresholds_per_fold_ = {}
        smooth_aggregate_threshold_fold = None
        smooth_tag_thresholds_fold = None

        for i, ((_, test_idxs), split_model) in enumerate(
                zip(kwargs["cv"].split(X, y), cv_output["estimator"])):
            y_pred = split_model.predict(
                X.iloc[test_idxs] if isinstance(X, pd.DataFrame) else X[test_idxs])

            # Adjust y_true for any possible model offset in its prediction
            test_idxs = test_idxs[-len(y_pred):]
            y_true = (y.iloc[test_idxs]
                      if isinstance(y, pd.DataFrame) else y[test_idxs])

            # Model's timestep scaled mse over all features
            scaled_mse = self._scaled_mse_per_timestep(split_model, y_true,
                                                       y_pred)

            # Absolute error
            mae = self._absolute_error(y_true, y_pred)

            # For the aggregate threshold for the fold model,
            # use the mse of scaled residuals per timestep
            aggregate_threshold_fold = scaled_mse.rolling(6).min().max()
            self.aggregate_thresholds_per_fold_[
                f"fold-{i}"] = aggregate_threshold_fold

            # Accumulate the rolling mins of diffs into common df
            tag_thresholds_fold = mae.rolling(6).min().max()
            tag_thresholds_fold.name = f"fold-{i}"
            self.feature_thresholds_per_fold_ = pd.concat(
                [self.feature_thresholds_per_fold_,
                 tag_thresholds_fold.to_frame().T])

            if self.window is not None:
                # Calculate smoothed thresholds when a smoothing window is set;
                # rolling results are NaN unless the fold has >= window points
                smooth_aggregate_threshold_fold = (
                    scaled_mse.rolling(self.window).min().max())
                self.smooth_aggregate_thresholds_per_fold_[
                    f"fold-{i}"] = smooth_aggregate_threshold_fold

                smooth_tag_thresholds_fold = mae.rolling(self.window).min().max()
                smooth_tag_thresholds_fold.name = f"fold-{i}"
                self.smooth_feature_thresholds_per_fold_ = pd.concat(
                    [self.smooth_feature_thresholds_per_fold_,
                     smooth_tag_thresholds_fold.to_frame().T])

        # Final thresholds are the thresholds from the last cv split/fold
        self.feature_thresholds_ = tag_thresholds_fold

        # For the aggregate also use the thresholds from the last split/fold
        self.aggregate_threshold_ = aggregate_threshold_fold

        # For the smoothed thresholds also use the last fold
        self.smooth_aggregate_threshold_ = smooth_aggregate_threshold_fold
        self.smooth_feature_thresholds_ = smooth_tag_thresholds_fold

        return cv_output
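
The per-fold thresholds above are the maximum of a rolling minimum, so error spikes shorter than the window cannot raise the threshold; only errors that stay elevated for the full window length do. Below is a standalone sketch of that rule on made-up data; the gamma-distributed error series and the 144-point smoothing window are assumptions for illustration only.

import numpy as np
import pandas as pd

# Made-up per-timestep scaled-MSE series for one validation fold
rng = np.random.default_rng(1)
scaled_mse = pd.Series(rng.gamma(shape=2.0, scale=0.5, size=500))

# Rule used above: maximum of a 6-point rolling minimum. A single spike
# shorter than 6 timesteps is ignored by the rolling minimum.
aggregate_threshold_fold = scaled_mse.rolling(6).min().max()

# With a configured smoothing window (self.window in the class), the same
# rule runs over the longer window; 144 is an arbitrary illustrative value.
window = 144
smooth_aggregate_threshold_fold = scaled_mse.rolling(window).min().max()

print(aggregate_threshold_fold, smooth_aggregate_threshold_fold)

Note that the final thresholds kept on the model (smoothed and unsmoothed) are simply those of the last fold, which for TimeSeriesSplit is the fold whose estimator was trained on the most data.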