def fit(self, X, y=None):
    """Fit the segmenter by resolving the interval specification.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data; it is passed through ``check_X`` with
        ``enforce_univariate=True``.
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning. Not used here.

    Returns
    -------
    self : an instance of self.

    Raises
    ------
    ValueError
        If ``self.intervals`` is neither a numpy array nor accepted by
        ``is_int``.
    """
    X = check_X(X, enforce_univariate=True)
    self.input_shape_ = X.shape

    # The time index of the series is what gets cut into intervals.
    time_index = get_time_index(X)
    self._time_index = time_index

    spec = self.intervals
    if isinstance(spec, np.ndarray):
        # Explicitly given intervals are stored as they are.
        resolved = spec
    elif is_int(spec):
        # An integer asks for that many consecutive chunks of the index.
        resolved = np.array_split(time_index, spec)
    else:
        raise ValueError(
            f"Intervals must be either an integer, an array with "
            f"start and end points, but found: {self.intervals}")

    self.intervals_ = resolved
    self._is_fitted = True
    return self
def fit(self, X, y=None):
    """Fit transformer, generating random interval indices.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning. Not used here.

    Returns
    -------
    self : RandomIntervalSegmenter
        This estimator
    """
    validate_X(X)
    self.input_shape_ = X.shape

    # TODO generalise to columns with series of unequal length
    time_index = get_time_index(X)
    self._time_index = time_index

    # TODO if multiple columns are passed, introduce option to compute one
    # set of shared intervals, or rely on ColumnTransformer?
    # 'random' lets the helper pick the number of intervals itself; any
    # other value is forwarded as the requested number of intervals.
    self.intervals_ = (
        self._rand_intervals_rand_n(time_index)
        if self.n_intervals == 'random'
        else self._rand_intervals_fixed_n(
            time_index, n_intervals=self.n_intervals)
    )
    return self
def test_indices(n_components):
    """Check that PCATransformer preserves the indices of its input.

    The row index and the columns of the transformed frame must equal those
    of the input, and the time index of the transformed series must run
    from 0 up to the number of fitted PCA components.

    Parameters
    ----------
    n_components : value forwarded to ``PCATransformer(n_components=...)``
    """
    np.random.seed(42)
    X = detabularize(pd.DataFrame(data=np.random.randn(10, 5)))
    X.columns = pd.CategoricalIndex(['col_0'])
    # Use a non-default row index so a reset index would be detected.
    # pd.Int64Index was removed in pandas 2.0; an int64-typed pd.Index is
    # the spelling that works on both old and new pandas.
    X.index = pd.Index(np.arange(10, 20, dtype="int64"))

    pca = PCATransformer(n_components=n_components)
    Xt = pca.fit_transform(X)

    assert X.columns.equals(Xt.columns)
    assert X.index.equals(Xt.index)

    expected_time_index = pd.Index(
        np.arange(pca.pca.n_components_, dtype="int64"))
    assert get_time_index(Xt).equals(expected_time_index)
def transform(self, X, y=None):
    """Remove the seasonal components from the series in X.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. Only the first column
        is processed.
    y : not used

    Returns
    -------
    Xt : pandas DataFrame
        Nested dataframe carrying the columns of X, with the fitted
        seasonal components subtracted (``model == 'additive'``) or divided
        out (any other model). X itself is returned unchanged when
        ``self.sp == 1``.

    Raises
    ------
    NotImplementedError
        If input checking is enabled and X has more than one column.
    """
    if self.check_input:
        validate_X(X)
        if X.shape[1] > 1:
            raise NotImplementedError(f"Currently does not work on multiple columns, make use of ColumnTransformer "
                                      f"instead")

    self._input_shape = X.shape

    # a seasonal periodicity of 1 leaves nothing to remove
    if self.sp == 1:
        return X

    # the time index is kept because transform/inverse transform depend on
    # it, e.g. to carry forward trend in inverse_transform
    self._time_index = get_time_index(X)

    # nested -> tabular
    converter = Tabulariser()
    tabular = converter.transform(X.iloc[:, :1])

    # NOTE(review): these two checks run only after state has been written
    # and X has already been used above, and validate_X runs regardless of
    # check_input - looks unintended, confirm before reordering.
    check_is_fitted(self, 'is_fitted_')
    validate_X(X)

    components = self._fit_seasonal_decomposition_model(tabular)

    deseasonalised = (
        tabular - components
        if self.model == 'additive'
        else tabular / components
    )

    # the fitted components repeat after the first seasonal period, so only
    # that first period is kept for inverse_transform
    self.seasonal_components_ = components[:, :self.sp]

    # tabular -> nested
    Xt = converter.inverse_transform(pd.DataFrame(deseasonalised))
    Xt.columns = X.columns
    return Xt
def inverse_transform(self, X, y=None):
    """Add the fitted trend back onto the series in X.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. Only the first column
        is processed.
    y : not used

    Returns
    -------
    Xit : pandas DataFrame
        Nested dataframe carrying the columns of X, with the trend given by
        ``self.coefs_`` added at the time index of X.

    Raises
    ------
    ValueError
        If input checking is enabled and X is not a pandas DataFrame, or if
        X has a different number of rows than the data seen in transform.
    NotImplementedError
        If input checking is enabled and X has more than one column.
    """
    check_is_fitted_in_transform(self, 'coefs_')

    if self.check_input:
        if not isinstance(X, pd.DataFrame):
            raise ValueError(f"Input must be pandas DataFrame, but found: {type(X)}")
        if X.shape[1] > 1:
            raise NotImplementedError(f"Currently does not work on multiple columns, make use of ColumnTransformer "
                                      f"instead")

    # coefficients are fitted per row in transform, so the rows must match
    n_rows_seen = self._input_shape[0]
    if X.shape[0] != n_rows_seen:
        raise ValueError(f"Inverse transform only works on data with the same number samples "
                         f"as seen during transform, but found: {X.shape[0]} samples "
                         f"!= {self._input_shape[0]} samples (seen during transform)")

    index = get_time_index(X)

    # nested -> tabular
    converter = Tabulariser()
    tabular = converter.transform(X.iloc[:, :1])

    # put the trend back at the time points of X
    retrended = add_trend(tabular, coefs=self.coefs_, time_index=index)

    # tabular -> nested
    Xit = converter.inverse_transform(pd.DataFrame(retrended))
    Xit.columns = X.columns
    return Xit
def fit(self, X, y=None):
    """Fit transformer, generating random interval indices.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning. Not used here.

    Returns
    -------
    self : RandomIntervalSegmenter
        This estimator

    Raises
    ------
    ValueError
        If ``self.min_length`` is not an ``int`` or is smaller than 1.
    """
    # Validate the hyper-parameter before touching the data. The messages
    # name the parameter exactly as it is spelled on the estimator.
    min_length = self.min_length
    if not isinstance(min_length, int):
        raise ValueError(f"min_length must be an integer, but found: "
                         f"{type(min_length)}")
    if min_length < 1:
        raise ValueError(f"min_length must be a positive integer (>= 1), "
                         f"but found: {min_length}")

    X = check_X(X)
    self.input_shape_ = X.shape

    # Retrieve time-series indexes from each column.
    # TODO generalise to columns with series of unequal length
    self._time_index = get_time_index(X)

    # Compute random intervals for each column.
    # TODO if multiple columns are passed, introduce option to compute
    #  one set of shared intervals,
    #  or rely on ColumnTransformer?
    if self.n_intervals == 'random':
        self.intervals_ = self._rand_intervals_rand_n(self._time_index)
    else:
        self.intervals_ = self._rand_intervals_fixed_n(
            self._time_index, n_intervals=self.n_intervals)

    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """Fit a polynomial trend to the series in X and remove it.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. Only the first column
        is processed.
    y : not used

    Returns
    -------
    Xt : pandas DataFrame
        Nested dataframe carrying the columns of X, holding the detrended
        series.

    Raises
    ------
    ValueError
        If input checking is enabled and X is not a pandas DataFrame.
    NotImplementedError
        If input checking is enabled and X has more than one column.
    """
    if self.check_input:
        if not isinstance(X, pd.DataFrame):
            raise ValueError(f"Input must be pandas DataFrame, but found: {type(X)}")
        if X.shape[1] > 1:
            raise NotImplementedError(f"Currently does not work on multiple columns")

    self._input_shape = X.shape

    # the trend is tied to the time index, which is also kept so that
    # inverse_transform can carry the trend forward
    index = get_time_index(X)
    self._time_index = index

    # nested -> tabular
    converter = Tabulariser()
    tabular = converter.transform(X.iloc[:, :1])

    # fit the trend of the configured order, then take it out
    coefs = fit_trend(tabular, order=self.order)
    self.coefs_ = coefs
    detrended = remove_trend(tabular, coefs=coefs, time_index=index)

    # tabular -> nested
    Xt = converter.inverse_transform(pd.DataFrame(detrended))
    Xt.columns = X.columns
    return Xt
def fit(self, y, fh=1, X=None):
    """Fit forecaster.

    Parameters
    ----------
    y : pandas.Series
        Target time series to which to fit the forecaster.
    fh : array-like, optional (default=1)
        The forecasters horizon with the steps ahead to to predict.
        Default is one-step ahead forecast. ``None`` is passed on to the
        internal fit without validation.
    X : pandas.DataFrame, shape=[n_obs, n_vars], optional (default=None)
        An optional 2-d dataframe of exogenous variables. If provided, these
        variables are used as additional features in the regression
        operation. This should not include a constant or trend. Note that
        if an ``ARIMA`` is fit on exogenous features, it must also be
        provided exogenous features for making predictions.

    Returns
    -------
    self : returns an instance of self.
    """
    if self.check_input:
        validate_y_X(y, X)

    # only a given forecasting horizon is validated
    if fh is not None:
        fh = validate_fh(fh)

    # the index of y is kept so that predictions can locate the forecasting
    # horizon relative to the data seen in fit
    self._time_index = get_time_index(y)

    # X is forwarded only when given, which keeps the interface compatible
    # with internal fits that take y but no X
    if X is None:
        self._fit(y, fh=fh)
    else:
        self._fit(y, fh=fh, X=X)

    self._is_fitted = True
    return self
def transform(self, X, y=None):
    """Transform nested pandas dataframe into tabular dataframe.

    The columns, row index and time index of X are stored on the instance
    before the conversion.

    Parameters
    ----------
    X : pandas DataFrame
        Nested dataframe with pandas series or numpy arrays in cells.
    y : array-like, optional (default=None)
        Not used here.

    Returns
    -------
    Xt : pandas DataFrame
        Transformed dataframe with only primitives in cells.
    """
    if self.check_input:
        validate_X(X)

    # remember the layout of the nested input
    self._columns = X.columns
    self._index = X.index
    self._time_index = get_time_index(X)

    return tabularize(X)
def inverse_transform(self, X, y=None):
    """Add the fitted seasonal components back onto the series in X.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_samples, n_features]
        Nested dataframe with time-series in cells. Only the first column
        is processed.
    y : not used

    Returns
    -------
    Xit : pandas DataFrame
        Nested dataframe carrying the columns of X, with the seasonal
        components added (``model == 'additive'``) or multiplied in (any
        other model). X itself is returned unchanged when ``self.sp == 1``.

    Raises
    ------
    ValueError
        If X has a different number of rows than the data seen in transform.
    """
    if self.check_input:
        validate_X(X)
        check_X_is_univariate(X)

    # check that number of samples are the same, inverse transform depends
    # on parameters fitted in transform and hence only works on data with
    # the same (number of) rows
    if not X.shape[0] == self._input_shape[0]:
        raise ValueError(f"Inverse transform only works on data with the same number samples "
                         f"as seen during transform, but found: {X.shape[0]} samples "
                         f"!= {self._input_shape[0]} samples (seen during transform)")

    # if the seasonal periodicity is 1, return unchanged X
    sp = self.sp
    if sp == 1:
        return X

    # check if seasonal decomposition model has been fitted in transform
    check_is_fitted_in_transform(self, 'seasonal_components_')

    # align seasonal components with the time index of X
    time_index = get_time_index(X)
    if self._time_index.equals(time_index):
        # same index as seen in transform: the stored components already
        # start at the right position in the seasonal cycle
        seasonal_components = self.seasonal_components_
    else:
        # different index: re-align the stored components to it
        seasonal_components = self._align_seasonal_components_to_index(time_index)

    # expand or shorten aligned seasonal components to same size as X
    n_obs = len(time_index)
    if n_obs > sp:
        # builtin int instead of np.int, which was removed in NumPy 1.24
        n_tiles = int(np.ceil(n_obs / sp))
        seasonal_components = np.tile(seasonal_components, n_tiles)
    seasonal_components = seasonal_components[:, :n_obs]

    # convert into tabular format
    tabulariser = Tabulariser()
    Xs = tabulariser.transform(X.iloc[:, :1])

    # inverse transform data
    if self.model == 'additive':
        Xit = Xs + seasonal_components
    else:
        Xit = Xs * seasonal_components

    # convert back into nested format
    Xit = tabulariser.inverse_transform(pd.DataFrame(Xit))
    Xit.columns = X.columns
    return Xit
def fit(self, X, y=None):
    """Record the layout of X and mark the transformer as fitted.

    Parameters
    ----------
    X : object exposing ``columns`` and ``index``
        Input data; its columns, row index and time index are stored on
        the instance.
    y : not used

    Returns
    -------
    self : an instance of self.
    """
    self._columns, self._index = X.columns, X.index
    self._time_index = get_time_index(X)
    self._is_fitted = True
    return self