def generate_diff(self, lag=1, differences=1):
    """
    A utility for generating the array diff (lag differences) for each group.
    To support invertibility, this method will return the starting value of each
    array as well as the differenced values.

    The group frames are only read, never modified: the starting value is taken
    positionally from the target column, so no index reset is required.

    :param lag: Determines the magnitude of the lag to calculate the differencing
                function for.
                Default: ``1``
    :param differences: The order of the differencing to be performed. Note that
                        values > 1 will generate n fewer results.
                        Default: ``1``
    :return: Dictionary of
             ``{<group_key>: {"series_start": <float>, "diff": <diff_array>}}``
    """
    # Populates ``self._group_df`` (helper defined elsewhere in the class).
    self._create_group_df()

    group_diff_data = {}
    for group, df in self._group_df:
        # Positional access (``iloc``) returns the first observation whatever
        # the group's index labels are, so the frame does not have to be
        # re-indexed in place (which would also add the old index as a column).
        series = df[self._y_col]
        group_diff_data[group] = {
            # NOTE(review): ``diff`` comes from the enclosing module's imports,
            # which are not visible here -- presumably ``pmdarima.utils.diff``.
            "diff": diff(x=series, lag=lag, differences=differences),
            "series_start": series.iloc[0],
        }

    return group_diff_data
order = (1, 0, 12) # p=1, d=0, q=12 order = (1, 1, 3) # p=1, d=1, q=3 #The parameters p and q can be iteratively searched-for with the auto_arima function, but the differencing term, d, requires a special set of tests of stationarity to estimate. #%%% Understanding differencing (d) An integrative term, d, is typically only used in the case of non-stationary data. Stationarity in a time series indicates that a series’ statistical attributes, such as mean, variance, etc., are constant over time (i.e., it exhibits low heteroskedasticity). A stationary time series is far easier to learn and forecast from. With the d parameter, you can force the ARIMA model to adjust for non-stationarity on its own, without having to worry about doing so manually. The value of d determines the number of periods to lag the response prior to computing differences. E.g., from pmdarima.utils import c, diff # lag 1, diff 1 x = c(10, 4, 2, 9, 34) diff(x, lag=1, differences=1) # Returns: array([ -6., -2., 7., 25.], dtype=float32) #lag and differences are not the same! diff(x, lag=1, differences=2) # Returns: array([ 4., 9., 18.], dtype=float32) diff(x, lag=2, differences=1) # Returns: array([-8., 5., 32.], dtype=float32) #The lag is the offset, in time periods, between the two values being subtracted, whereas the differences parameter is the number of times the differences are computed. Therefore, e.g., for differences=2, the procedure is essentially computing the difference twice: x = c(10, 4, 2, 9, 34) x# 1 x[1:], x[:-1] x_lag = x[1:] # first lag x_lag x[:-1]
# Exploratory, cell-based walkthrough (the ``#%%`` markers delimit cells):
# compares hand-rolled differencing via ``Series.shift`` with ``diff`` from
# ``pmdarima.utils``. Bare expressions exist only to echo their value when a
# cell is run interactively.
# NOTE(review): relies on ``pd`` being imported earlier in the file -- not
# visible in this chunk.

# Five consecutive daily timestamps starting 2020-09-01.
dates = pd.date_range('2020-09-01',periods=5, freq='D')
dates
# Sample sales figures, one per date.
sales = pd.Series([50,60,55,70,80], index=dates)
sales
#%%%
# 3-period rolling mean over the current value and the two before it.
ma3 = sales.rolling(window=3).mean()
ma3
# Same window size, but centered on each observation.
ma3c = sales.rolling(window=3, center=True).mean()
ma3c
#%%%
# Move every value one position later, then subtract to get the
# period-over-period change (the first entry has no predecessor).
sales.shift(1)
sales - sales.shift(1)
# NOTE(review): ``c`` is imported but not used anywhere in this chunk.
from pmdarima.utils import c, diff
# Library equivalent of the subtraction above, shown next to it for comparison.
diff(sales, lag=1, differences=1)
sales - sales.shift(1)
#%%%%%
# lag=2: each value is compared with the one two positions earlier.
diff(sales, lag=2, differences=1)
sales - sales.shift(2)
# lag is the gap between the paired values: 1st with 3rd, 2nd with 4th, and so on
#%%%%%
# differences=2 by hand: apply the lag-1 difference twice on a working copy
# and compare each stage with the corresponding ``diff`` call.
sales2 = sales.copy()
diff(sales, lag=1, differences=1)
sales2 - sales2.shift(1)
diff(sales, lag=1, differences=2)
# Keep the first difference, then difference it once more.
sales2 = sales2 - sales2.shift(1)
sales2 - sales2.shift(1)
diff(sales, lag=1, differences=2)
sales #%%% Simple Moving Average (SMA) #offset mean ma3 = sales.rolling(window=3).mean() ma3 ma3c = sales.rolling(window=3, center=True).mean() ma3c #%%% : shift 1 down and then find the daily diff sales.shift(1) sales - sales.shift(1) #daily changes #pip install pmdarima --user #see the syntax #restart session #https://pypi.org/project/pmdarima/ from pmdarima.utils import c, diff diff(sales, lag=1, differences=1) #diff from function sales - sales.shift(1) #same #%%%%% diff(sales, lag=2, differences=1) np.vstack((sales, sales.shift(2), sales - sales.shift(2))) sales - sales.shift(2) #lag is the gap : 1 with 3, 2 with 4 and so on #%%%%% sales2 = sales.copy() diff(sales, lag=1, differences=1) sales2 - sales2.shift(1) diff(sales, lag=1, differences=2) sales2 = sales2 - sales2.shift(1) sales2 - sales2.shift(1)