Example #1
 def fit(self,
         y,
         period,
         x=None,
         metric="smape",
         val_size=None,
         verbose=False):
     """
     Build the model using best-tuned hyperparameter values.
     :param y: pd.Series or 1-D np.array, time series to predict.
     :param period: Int or Str, the number of observations per cycle: 1 or "annual" for yearly data, 4 or "quarterly"
     for quarterly data, 7 or "daily" for daily data, 12 or "monthly" for monthly data, 24 or "hourly" for hourly
     data, 52 or "weekly" for weekly data. First-letter abbreviations of strings work as well ("a", "q", "d", "m",
     "h" and "w", respectively). Additional reference: https://robjhyndman.com/hyndsight/seasonal-periods/.
     :param x: pd.DataFrame or 2-D np.array, exogenous predictors, optional
     :param metric: Str, the metric used for model selection. One of "mse" (mean squared error), "mae" (mean absolute
     error).
     :param val_size: Int, the number of most recent observations to use as validation set for tuning.
     :param verbose: Boolean, True for printing additional info while tuning.
     :return: None
     """
     self.y = y
     self.name = "Bayesian Dynamic Linear Model"
     self.key = "bdlm"
     self._tune(y=y,
                period=period,
                x=x,
                metric=metric,
                val_size=val_size,
                verbose=verbose)
     self.model = pydlm.dlm(y)
     self.model = self.model + pydlm.trend(degree=self.params["trend"],
                                           discount=0.5)
     self.model = self.model + pydlm.seasonality(period=self.period,
                                                 discount=0.99)
     if self.params["ar"] is not None:
         self.model = self.model + pydlm.autoReg(degree=self.params["ar"],
                                                 discount=0.99)
     if x is not None:
         for variable_id, x_variable in enumerate(x.T):
             self.model = self.model + pydlm.dynamic(
                 features=[[v] for v in x_variable],
                 discount=0.99,
                 name=str(variable_id))
     with SuppressStdoutStderr():
         self.model.tune()
         self.model.fit()
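For reference, a minimal standalone sketch (toy series and horizon are illustrative, not part of the original wrapper) of the pydlm pipeline this fit method assembles: components are added to a dlm, discount factors are tuned, and forecasts come from predictN.

import numpy as np
import pydlm

# Toy monthly series: a seasonal pattern plus noise.
y_toy = np.sin(np.arange(120) * 2 * np.pi / 12) + np.random.normal(0, 0.1, 120)

model = pydlm.dlm(y_toy)
model = model + pydlm.trend(degree=1, discount=0.5)          # trend degree would normally come from tuning
model = model + pydlm.seasonality(period=12, discount=0.99)  # period=12 for monthly data
model.tune()   # tune the discount factors
model.fit()
forecast_mean, forecast_var = model.predictN(N=12, date=model.n - 1)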
Example #2
def dlm_exogenous_r3(y, s, k, a, t, e, r):
    """ One way to use dlm
        :returns: x, s', w
    """
    if not s:
        s = dict()
        s['dim'] = dimension(y)
        s = dlm_set_exog_hyperparams(s=s, r=r)
        y0, exog = split_exogenous(y=y)
        s['n_obs'] = 0
        s['model'] = quietDlm([], printInfo=False) + trend(
            s['trend_degree'], s['discount']) + seasonality(
                s['period'], s['discount'])
        s['model'] = s['model'] + fixedAutoReg(
            degree=s['auto_degree'], name='ar', w=1.0)
        if exog:
            exog_wrapped = [[None if np.isnan(ex0) else ex0 for ex0 in exog]]
            s['model'] = s['model'] + dynamic(features=exog_wrapped,
                                              discount=0.99,
                                              name='exog')  # Sets the first exog

    if y is not None:
        y = wrap(y)
        assert dimension(y) == s['dim'], 'Cannot change dimension of data sent'
        s['n_obs'] += 1
        y0, exog = split_exogenous(y=y)
        y0_passed_in = None if np.isnan(
            y0) else y0  # pydlm uses None for missing values
        s['model'].append([y0_passed_in])
        if exog:
            exog_wrapped = [[None if np.isnan(ex0) else ex0 for ex0 in exog]]
            if s['n_obs'] > 1:
                s['model'].append(
                    data=exog_wrapped,
                    component='exog')  # Don't get first exog twice
        num_obs = len(s['model'].data) if s.get('model') else 0
        if num_obs % s['n_fit'] == s['n_fit'] - 1:
            _, _, s = dlm_exogenous_r3(y=None, s=s, k=k, a=a, t=t, e=10, r=r)
        s['model'].fitForwardFilter()
        return _dlm_exog_prediction_helper(s=s, k=k, y=y)

    if y is None:
        if s['dim'] == 1:
            s['model'].tune(maxit=20)
            # Don't tune if exogenous ... haven't got this to work
        s['model'].fit()
        return None, None, s
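A hedged sketch of how a skater in this form is typically driven: the state s starts empty and is threaded through successive calls, one observation at a time. The toy data and the k, a, t, e, r values below are illustrative assumptions, and the module helpers used inside dlm_exogenous_r3 (wrap, dimension, split_exogenous, ...) are assumed to be importable alongside it.

import numpy as np

ys = np.random.normal(size=200)   # illustrative univariate stream
s = {}
for yi in ys:
    x, _, s = dlm_exogenous_r3(y=yi, s=s, k=3, a=None, t=None, e=None, r=0.5)
# x holds the latest k-step-ahead forecast; s carries the fitted pydlm model between calls.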
Example #3
import calendar
from datetime import datetime

import pydlm


def SerBayes(sDay, nAhead, x0, hWeek):
    dta = sDay['y']
    dta.index = [datetime.strptime(str(x)[0:10], '%Y-%m-%d') for x in dta.index]
    t_line = [float(calendar.timegm(x.utctimetuple())) / 1000000 for x in dta.index]
    dta.index = t_line
    model = pydlm.dlm(dta)
    model = model + pydlm.trend(degree=1,discount=0.98,name='a',w=10.0)
    model = model + pydlm.dynamic(features=[[v] for v in t_line],discount=1,name='b',w=10.0)
    model = model + pydlm.autoReg(degree=3,data=dta.values,name='ar3',w=1.0)
    model.evolveMode('independent')
    model.noisePrior(2.0)
    model.fit()
    # latent states are only available after the model has been fitted
    allStates = model.getLatentState(filterType='forwardFilter')
    model.plot()
    model.turnOff('predict')
    model.plotCoef(name='a')
    model.plotCoef(name='b')
    model.plotCoef(name='ar3')
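    # --- Hedged addition, not part of the original snippet ---
    # 'b' is a pydlm `dynamic` regressor, so forecasting needs future feature
    # values supplied through featureDict; the time step is extrapolated from
    # the existing index and nAhead steps are predicted.
    step = t_line[-1] - t_line[-2]
    future_t = [[t_line[-1] + (i + 1) * step] for i in range(nAhead)]
    predMean, predVar = model.predictN(N=nAhead, date=model.n - 1,
                                       featureDict={'b': future_t})
    return predMean, predVar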
Example #4
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

import numpy as np
import matplotlib.pyplot as plt
import pydlm

# Simple example (random walk)
n = 100
a = 1.0 + np.random.normal(0, 5, n)  # the intercept
x = np.random.normal(0, 2, n)  # the control variable
b = 3.0  # the coefficient
y = a + b * x

dlm = pydlm.dlm(y)
dlm = dlm + pydlm.trend(degree=0, discount=0.98, name='a', w=10.0)
dlm = dlm + pydlm.dynamic(
    features=[[v] for v in x], discount=1, name='b', w=10.0)

# generate step data for a second, independent model
data = [0] * 100 + [3] * 100

# create the model (this rebinds `dlm`, replacing the model built above)
dlm = pydlm.dlm(data)

# add components
dlm = dlm + pydlm.trend(1, name='lineTrend', w=1.0)  # covariance=1
dlm = dlm + pydlm.seasonality(7, name='7day', w=1.0)
dlm = dlm + pydlm.autoReg(degree=3, data=data, name='ar3', w=1.0)
dlm.ls()

# delete unwanted component
dlm.delete('7day')
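A hedged continuation of the snippet above (not in the original): after dropping the '7day' component, fit the remaining components and read back the smoothed mean, both overall and for the AR component alone.

dlm.fit()
fitted = dlm.getMean(filterType='backwardSmoother')               # smoothed overall mean
ar_part = dlm.getMean(filterType='backwardSmoother', name='ar3')  # contribution of the AR component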
Example #5
    def ts_fit(self, suppress=False):
        """Fit DLM to the time series data.

        Parameters
        ----------
        suppress: bool
            Whether to suppress some of the output messages.
        """
        self._prepare_fit()
        self._model = None
        self.ts_split()

        ts_df = self._train_dt.copy()

        # Fit
        self._dlm_logger.info("Trying to fit the DLM model....")
        try:
            if not suppress:
                self._dlm_logger.info("...via using parameters\n")
                print_attributes(self)

            ts_df = ts_df.reset_index()
            ts_df.columns = self._ts_df_cols

            self._model = dlm(ts_df['y'])

            # trend
            if self._dlm_trend is not None:
                self._model = self._model + trend(
                    degree=self._dlm_trend['degree'],
                    discount=self._dlm_trend['discount'],
                    name=self._dlm_trend['name'],
                    w=self._dlm_trend['w'])
            # seasonality
            if self._dlm_seasonality is not None:
                self._model = self._model + seasonality(
                    period=self._dlm_seasonality['period'],
                    discount=self._dlm_seasonality['discount'],
                    name=self._dlm_seasonality['name'],
                    w=self._dlm_seasonality['w'])
            # dynamic
            if self._train_dlm_dynamic is not None:
                for i in range(len(self._train_dlm_dynamic['features'])):
                    self._model = self._model + dynamic(
                        features=self._train_dlm_dynamic['features'][i]
                        ['features'],
                        discount=self._train_dlm_dynamic['features'][i]
                        ['discount'],
                        name=self._train_dlm_dynamic['features'][i]['name'],
                        w=self._train_dlm_dynamic['features'][i]['w'])
            # auto_reg
            if self._dlm_auto_reg is not None:
                self._model = self._model + autoReg(
                    degree=self._dlm_auto_reg['degree'],
                    discount=self._dlm_auto_reg['discount'],
                    name=self._dlm_auto_reg['name'],
                    w=self._dlm_auto_reg['w'])
            # long_season
            if self._dlm_long_season is not None:
                ls = longSeason(period=self._dlm_long_season['period'],
                                stay=self._dlm_long_season['stay'],
                                data=ts_df,
                                name=self._dlm_long_season['name'],
                                w=self._dlm_long_season['w'])
                self._model = self._model + ls

            if not suppress:
                self._dlm_logger.info("The constructed DLM model components:")
                print(self._model.ls())

            # tic
            start = time()
            if self._use_rolling_window:
                self._model.fitForwardFilter(useRollingWindow=True,
                                             windowLength=self._window_size)
                self._model.fitBackwardSmoother()
            else:
                self._model.fit()
            self.model_fit = self._model
            # toc
            if not suppress:
                self._dlm_logger.info("Time elapsed: {} sec.".format(time() -
                                                                     start))
        except Exception as e:
            self._dlm_logger.exception("DLM error...{}".format(e))
            return -1
        else:
            self._dlm_logger.info("Model successfully fitted to the data!")
            self._dlm_logger.info("Computing fitted values and residuals...")

            # Residuals
            self.residuals = pd.Series(self.model_fit.getResidual(),
                                       index=self._train_dt.index)
            try:
                self.lower_conf_int = pd.Series(
                    self.model_fit.getInterval()[1],
                    index=self._train_dt.index)
                self.upper_conf_int = pd.Series(
                    self.model_fit.getInterval()[0],
                    index=self._train_dt.index)
            except ValueError as e:
                self._dlm_logger.exception(
                    "Something went wrong in getInterval...{}".format(e))

            self.mse = self.model_fit.getMSE()

            # Fitted values
            # this is not elegant, but found no other way
            self.fittedvalues = self._train_dt['y'] + self.residuals

            return self
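For context, a minimal standalone sketch of the rolling-window branch used in ts_fit above (fitForwardFilter with useRollingWindow, followed by fitBackwardSmoother); the toy data and window length are illustrative.

import numpy as np
from pydlm import dlm, trend

toy = dlm(list(np.random.normal(size=300)))
toy = toy + trend(degree=1, discount=0.95, name='lt', w=1.0)
toy.fitForwardFilter(useRollingWindow=True, windowLength=30)
toy.fitBackwardSmoother()
residuals = toy.getResidual()   # same accessor the class uses for self.residuals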
Example #6
simple_dlm.turnOff('data points')
simple_dlm.plot()
# Plot each component (attribution)
simple_dlm.turnOff('predict plot')
simple_dlm.turnOff('filtered plot')
simple_dlm.plot('linear_trend')
simple_dlm.plot('seasonal52')
# Plot the prediction given the first 350 weeks and forecast the next 200 weeks.
simple_dlm.plotPredictN(N=200, date=350)
# Plot the prediction given the first 250 weeks and forecast the next 200 weeks.
simple_dlm.plotPredictN(N=200, date=250)

# Build a dynamic regression model
from pydlm import dynamic
regressor10 = dynamic(features=features,
                      discount=1.0,
                      name='regressor10',
                      w=10)
drm = dlm(time_series) + linear_trend + seasonal52 + regressor10
drm.fit()

# Plot the fitted results
drm.turnOff('data points')
drm.plot()
# Plot each component (attribution)
drm.turnOff('predict plot')
drm.turnOff('filtered plot')
drm.plot('linear_trend')
drm.plot('seasonal52')
drm.plot('regressor10')
# Plot the prediction given the first 300 weeks and forecast the next 150 weeks.
drm.plotPredictN(N=150, date=300)
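A hedged note on getting numeric forecasts from drm instead of plots: predictN returns the predictive mean and variance, and because 'regressor10' is a dynamic component its future feature values are passed in through featureDict. The slice below is purely illustrative.

future_features = features[300:450]   # illustrative: regressor values for the 150 forecast steps
pred_mean, pred_var = drm.predictN(N=150, date=300,
                                   featureDict={'regressor10': future_features})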
Example #7
# Plot the fitted results
simple_dlm.turnOff('data points')
simple_dlm.plot()
# Plot each component (attribution)
simple_dlm.turnOff('predict plot')
simple_dlm.turnOff('filtered plot')
simple_dlm.plot('linear_trend')
simple_dlm.plot('seasonal52')
# Plot the prediction given the first 350 weeks and forecast the next 200 weeks.
simple_dlm.plotPredictN(N=200, date=350)
# Plot the prediction given the first 250 weeks and forecast the next 200 weeks.
simple_dlm.plotPredictN(N=200, date=250)

# Build a dynamic regression model
from pydlm import dynamic
regressor10 = dynamic(features=features, discount=1.0, name='regressor10', w=10)
drm = dlm(time_series) + linear_trend + seasonal52 + regressor10
drm.fit()

# Plot the fitted results
drm.turnOff('data points')
drm.plot()
# Plot each component (attribution)
drm.turnOff('predict plot')
drm.turnOff('filtered plot')
drm.plot('linear_trend')
drm.plot('seasonal52')
drm.plot('regressor10')
# Plot the prediction given the first 300 weeks and forecast the next 150 weeks.
drm.plotPredictN(N=150, date=300)
# Plot the prediction given the first 250 weeks and forecast the next 200 weeks.
Example #8
def estimate_and_predict_dlm_PR(calendar,
                                df_propor_PR_ts,
                                punched_df,
                                end_train_date,
                                start_test_date,
                                start_of_this_year,
                                enable_sales,
                                pred_weeks=8,
                                locality=10,
                                r=0.05,
                                missing_val=201735):
    '''
       Accepts the forecast sales-proportion data as one regressor.
       df_propor_PR_ts: []
       return type: DataFrame with the prediction result
       return: columns = [wm_yr_wk_nbr, club, yhat]
    '''
    res = pd.DataFrame()
    punched = punched_df.groupby(['club_nbr', 'posting_date'])['cost'].sum()
    punched.column = ['total_punched_wg']
    punched = punched.reset_index()
    punched = pd.merge(left=punched,
                       right=calendar,
                       how='left',
                       left_on='posting_date',
                       right_on='calendar_date').drop('calendar_date', axis=1)
    # mean wage among all clubs
    punched = removehurricane('cost', punched, 201733, 201739, sales=False)
    punched_mean = punched.groupby(['wm_yr_wk_nbr',
                                    'posting_date'])['cost'].mean()
    punched_mean = punched_mean.reset_index()
    punched_mean.columns = ['wm_yr_wk_nbr', 'posting_date', 'cost']
    punched_mean['club_nbr'] = pd.Series(np.ones([punched_mean.shape[0]]))
    ##########################
    if missing_val not in punched_mean.wm_yr_wk_nbr.tolist():
        punched_mean.loc[-1] = [
            missing_val,
            punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 1] + timedelta(days=14),
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 2] +
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, 2)].iloc[0, 2], 1
        ]  # adding a row
        punched_mean.index = punched_mean.index + 1
    #########################
    punched_mean1 = punched_mean.copy(deep=True)
    punched_mean1['cost'] = 0.5 * punched_mean1['cost'] + 0.25 * punched_mean1[
        'cost'].shift(1) + 0.25 * punched_mean1['cost'].shift(2)
    ty = punched_mean1['cost'].mean()
    punched_mean1[['cost']] = punched_mean1[['cost']].fillna(value=ty)
    punched_mean1 = estimate_and_predict_prophet_PR(
        calendar,
        punched_mean1,
        end_train_date,
        start_test_date,
        daily_view=False,
        pred_days=120)  #predict the mean wages.
    punched_mean1 = punched_mean1.drop('club', axis=1)
    punched_mean1.columns = ['posting_date', 'PR_cost']
    punched_mean1 = pd.merge(left=punched_mean1,
                             right=calendar,
                             how='left',
                             left_on='posting_date',
                             right_on='calendar_date').drop('calendar_date',
                                                            axis=1)
    tmp = punched.groupby(['wm_yr_wk_nbr', 'posting_date'])['cost'].mean()
    tmp = tmp.reset_index()
    tmp.columns = ['wm_yr_wk_nbr', 'posting_date', 'PR_cost']
    tmp = tmp.loc[tmp.wm_yr_wk_nbr <= end_train_date]
    tmp['PR_cost'] = 0.5 * tmp['PR_cost'] + 0.25 * tmp['PR_cost'].shift(
        1) + 0.25 * tmp['PR_cost'].shift(2)
    ty = tmp['PR_cost'].mean()
    tmp[['PR_cost']] = tmp[['PR_cost']].fillna(value=ty)

    punched_mean = pd.concat([tmp, punched_mean1], axis=0)
    if missing_val not in punched_mean.wm_yr_wk_nbr.tolist():
        tu = [
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, -2)].iloc[0, 0] +
            0.5 * punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
                missing_val, 2)].iloc[0, 0]
        ]
        tu.append(punched_mean.loc[punched_mean.wm_yr_wk_nbr == wm_nbr_add(
            missing_val, -2)].iloc[0, 1] + timedelta(days=14))
        tu.append(missing_val)
        punched_mean.loc[-1] = tu  # adding a row
        punched_mean.index = punched_mean.index + 1  # shifting index
    punched_mean = punched_mean.sort_values(
        by='wm_yr_wk_nbr').reset_index().drop('index', axis=1)
    punched = punched.drop('posting_date', axis=1)
    punched_pro = punched_df.groupby(['club_nbr',
                                      'posting_date'])['cost'].sum()
    punched_pro.column = ['total_punched_wg']
    punched_pro = punched_pro.reset_index()
    punched_pro = pd.merge(left=punched_pro,
                           right=calendar,
                           how='left',
                           left_on='posting_date',
                           right_on='calendar_date').drop('calendar_date',
                                                          axis=1)
    punched_pro = removehurricane('cost',
                                  punched_pro,
                                  201733,
                                  201739,
                                  sales=False)
    # 201735 is missing (Hurricane Maria)
    # 201737 is the Hurricane Irma week
    club_ls = punched.club_nbr.unique()
    for club in club_ls:
        pro_club = punched_pro[punched_pro.club_nbr.isin([club])]
        #########################################
        # adding missing value
        if missing_val not in pro_club.wm_yr_wk_nbr.tolist():
            pro_club.loc[-1] = [
                club, pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, -2)].iloc[0, 1] + timedelta(days=14),
                0.5 * pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, -2)].iloc[0, 2] +
                0.5 * pro_club.loc[pro_club.wm_yr_wk_nbr == wm_nbr_add(
                    missing_val, 2)].iloc[0, 2], missing_val
            ]  # adding a row
            pro_club.index = pro_club.index + 1  # shifting index
        ####################################################
        pro_club = pro_club.sort_values(by='posting_date').reset_index().drop(
            'index', axis=1)
        pro_sales = df_propor_PR_ts.loc[df_propor_PR_ts.club == club].drop(
            ['club'], axis=1)
        pro_club = pro_club.drop(['club_nbr', 'posting_date'], axis=1)
        pro_club.columns = ['cost', 'wm_yr_wk_nbr']
        pro_sales['total_sales'] = pro_sales['total_sales_across'] * pro_sales[
            'per_nbr_fc']
        pro_sales = pd.concat(
            [pro_sales] +
            [pro_sales.total_sales.shift(x) for x in range(1, 3)],
            axis=1)
        pro_sales.columns = [
            'wm_yr_wk_nbr', 'per_nbr_fc', 'total_sales_across',
            'total_sales_0', 'sr_1', 'sr_2'
        ]
        #########################################
        # adding missing value
        if missing_val not in pro_sales.wm_yr_wk_nbr.unique().tolist():
            tu = []
            for k in range(len(pro_sales.columns)):
                tu.append(
                    0.5 * pro_sales.loc[pro_sales.wm_yr_wk_nbr == wm_nbr_add(
                        missing_val, -2)].iloc[0, k] +
                    0.5 * pro_sales.loc[pro_sales.wm_yr_wk_nbr == wm_nbr_add(
                        missing_val, 2)].iloc[0, k])
            tu[0] = int(tu[0])
            pro_sales.loc[-1] = tu  # adding a row
            pro_sales.index = pro_sales.index + 1  # shifting index
        pro_sales = pro_sales.sort_values(
            by='wm_yr_wk_nbr').reset_index().drop('index', axis=1)
        pro_sales = pd.merge(left=pro_sales,
                             right=punched_mean,
                             how='right',
                             left_on='wm_yr_wk_nbr',
                             right_on='wm_yr_wk_nbr',
                             validate='1:1')
        pro_sales = pro_sales.drop(['posting_date'], axis=1)
        pro_sales = pro_sales.apply(lambda x: x.fillna(x.mean()), axis=0)
        pro_sales_train = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr <= end_train_date]
        pro_sales_test = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr >= start_test_date]
        # trend
        linear_trend = trend(degree=2, discount=0.98, name='linear_trend', w=8)
        # seasonality
        seasonal26 = seasonality(period=26,
                                 discount=1,
                                 name='seasonal26',
                                 w=12)
        # control variable
        sales0 = pro_sales_train['total_sales_0'].values.tolist()
        s0 = [[x] for x in sales0]
        sales1 = pro_sales_train['sr_1'].values.tolist()
        s1 = [[x] for x in sales1]
        sales2 = pro_sales_train['sr_2'].values.tolist()
        s2 = [[x] for x in sales2]
        macro = pro_sales_train['PR_cost'].values.tolist()
        m1 = [[x] for x in macro]
        #####################################
        s0 = dynamic(features=s0, discount=0.99, name='sales0', w=8)
        s1 = dynamic(features=s1, discount=0.99, name='sales1',
                     w=6)  # use the actual sales and forecasting sales amount
        s2 = dynamic(features=s2, discount=0.95, name='sales2', w=6)
        m1 = dynamic(features=m1, discount=0.99, name='macro', w=12)

        #e1 = dynamic(features=e1,discount=0.95,name='eff',w=6)
        drm = dlm(pro_club['cost']) + linear_trend + seasonal26 + autoReg(
            degree=locality, name='ar2', w=6) + m1  #+s0+s1+s2+m1
        drm.fit()
        #testset
        pro_sales_test = pro_sales_test.head(pred_weeks)
        sales0test = pro_sales_test['total_sales_0'].head(
            pred_weeks).values.tolist()
        s0test = [[x] for x in sales0test]
        sales1test = pro_sales_test['sr_1'].head(pred_weeks).values.tolist()
        s1test = [[x] for x in sales1test]
        sales2test = pro_sales_test['sr_2'].head(pred_weeks).values.tolist()
        s2test = [[x] for x in sales2test]
        macrotest = pro_sales_test['PR_cost'].head(pred_weeks).values.tolist()
        m1test = [[x] for x in macrotest]
        #efftest = testset['eff'].head(pred_weeks).values.tolist()
        #e1test = [[x] for x in efftest]
        features = {
            'sales0': s0test,
            'sales1': s1test,
            'sales2': s2test,
            'macro': m1test
        }  #,'eff':e1test}
        (predictMean, predictVar) = drm.predictN(N=pred_weeks,
                                                 date=drm.n - 1,
                                                 featureDict=features)
        #locality
        pro_sales = pro_sales.drop(['sr_1', 'sr_2'], axis=1)
        pro_sales['ratio'] = pro_sales['total_sales_0'] / pro_sales[
            'total_sales_across']
        pro_sales['ratio_1'] = pro_sales['ratio'].shift(1)
        pro_sales['ratio_2'] = pro_sales['ratio'].shift(2)
        trainset1_year = pro_club.loc[
            pro_club.wm_yr_wk_nbr <= end_train_date].loc[
                pro_club.wm_yr_wk_nbr >= end_train_date - locality]
        trainset_year = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr <= end_train_date].loc[
                pro_sales.wm_yr_wk_nbr >= end_train_date - locality]
        trainset_year.apply(lambda x: x.fillna(x.mean()), axis=0)
        linear_trend_year = trend(degree=1,
                                  discount=0.99,
                                  name='linear_trend_year',
                                  w=10)
        sales0_year = trainset_year['ratio'].values.tolist()
        s0_year = [[x] for x in sales0_year]
        # use the forecast of the ratio of each club among total in PR area
        # since this is a local model, the total amount in area can be assumed to be constant.
        sales1_year = trainset_year['ratio_1'].values.tolist()
        s1_year = [[x] for x in sales1_year]
        sales2_year = trainset_year['ratio_2'].values.tolist()
        s2_year = [[x] for x in sales2_year]
        macro_year = trainset_year['PR_cost'].values.tolist()
        m1_year = [[x] for x in macro_year]
        #####################################
        s0_year = dynamic(features=s0_year,
                          discount=0.99,
                          name='sales0_year',
                          w=10)
        s1_year = dynamic(features=s1_year,
                          discount=0.99,
                          name='sales1_year',
                          w=8)
        s2_year = dynamic(features=s2_year,
                          discount=0.95,
                          name='sales2_year',
                          w=6)
        m1_year = dynamic(features=m1_year,
                          discount=0.99,
                          name='macro_year',
                          w=10)
        #e1_year = dynamic(features=e1_year,discount=0.95,name='eff_year',w=6)
        if enable_sales:
            drm_year = dlm(trainset1_year['cost']) + autoReg(
                degree=locality, name='ar2', w=5
            ) + linear_trend_year + m1_year + s0_year + s1_year + s2_year
        else:
            drm_year = dlm(trainset1_year['cost']) + autoReg(
                degree=locality, name='ar2',
                w=5) + linear_trend_year + m1_year  #+s0_year+s1_year+s2_year
        drm_year.fit()
        testset_year = pro_sales.loc[
            pro_sales.wm_yr_wk_nbr >= start_test_date].head(pred_weeks)
        sales0test = testset_year['ratio'].head(pred_weeks).values.tolist()
        s0test = [[x] for x in sales0test]
        sales1test = testset_year['ratio_1'].head(pred_weeks).values.tolist()
        s1test = [[x] for x in sales1test]
        sales2test = testset_year['ratio_2'].head(pred_weeks).values.tolist()
        s2test = [[x] for x in sales2test]
        features_year = {
            'sales0_year': s0test,
            'sales1_year': s1test,
            'sales2_year': s2test,
            'macro_year': m1test
        }
        (predictMean_year,
         predictVar_year) = drm_year.predictN(N=pred_weeks,
                                              date=drm_year.n - 1,
                                              featureDict=features_year)
        weeklist = []
        p1 = np.exp(-r * (abs(end_train_date - start_of_this_year - 52)))
        p2 = 1 - p1
        for k in range(pred_weeks):
            weeklist.append(wm_nbr_add(start_test_date, 2 * k))

        if res.shape[0] == 0:
            res['wm_yr_wk_nbr'] = weeklist
            res['club'] = pd.Series(club * np.ones(pred_weeks),
                                    index=res.index)
            res['yhat'] = pd.Series(p1 * np.asarray(predictMean) +
                                    p2 * np.asarray(predictMean_year),
                                    index=res.index)
        else:
            tmp = pd.DataFrame()
            tmp['wm_yr_wk_nbr'] = weeklist
            tmp['club'] = pd.Series(club * np.ones(pred_weeks),
                                    index=tmp.index)
            tmp['yhat'] = pd.Series(p1 * np.asarray(predictMean) +
                                    p2 * np.asarray(predictMean_year),
                                    index=tmp.index)
            res = pd.concat([res, tmp], axis=0)
    return res
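A small illustrative check (made-up week numbers) of the blending weights used at the end of the function: p1 decays exponentially in the distance between the training cutoff and one year after start_of_this_year, and the final yhat mixes the full-history DLM forecast (weight p1) with the local-window forecast (weight p2).

import numpy as np

r = 0.05
end_train_date, start_of_this_year = 201826, 201801   # illustrative wm_yr_wk-style values
p1 = np.exp(-r * abs(end_train_date - start_of_this_year - 52))   # ~0.26 for these values
p2 = 1 - p1                                                       # ~0.74
# yhat per club is then p1 * predictMean + p2 * predictMean_year, as above.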
Example #9
    def _tune(self,
              y,
              period,
              x=None,
              metric="smape",
              val_size=None,
              verbose=False):
        """
        Tune hyperparameters of the model.
        :param y: pd.Series or 1-D np.array, time series to predict.
        :param period: Int or Str, the number of observations per cycle: 1 or "annual" for yearly data, 4 or "quarterly"
        for quarterly data, 7 or "daily" for daily data, 12 or "monthly" for monthly data, 24 or "hourly" for hourly
        data, 52 or "weekly" for weekly data. First-letter abbreviations of strings work as well ("a", "q", "d", "m",
        "h" and "w", respectively). Additional reference: https://robjhyndman.com/hyndsight/seasonal-periods/.
        :param x: pd.DataFrame or 2-D np.array, exogenous predictors, optional
        :param metric: Str, the metric used for model selection. One of "mse" (mean squared error), "mae" (mean absolute
        error).
        :param val_size: Int, the number of most recent observations to use as validation set for tuning.
        :param verbose: Boolean, True for printing additional info while tuning.
        :return: None
        """
        self.period = data_utils.period_to_int(period) if isinstance(
            period, str) else period
        val_size = int(len(y) * .1) if val_size is None else val_size
        y_train, y_val = model_utils.train_val_split(y, val_size=val_size)
        if x is not None:
            x_train, x_val = model_utils.train_val_split(x, val_size=val_size)
        metric_fun = get_metric(metric)

        params_grid = {
            "trend": [0, 1, 2, 3],
            "ar": [None],
            # "ar": [None, 1, 2, 3],
        }
        params_keys, params_values = zip(*params_grid.items())
        params_permutations = [
            dict(zip(params_keys, v))
            for v in itertools.product(*params_values)
        ]

        scores = []
        for permutation in params_permutations:
            try:
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    model = pydlm.dlm(y_train)
                    model = model + pydlm.trend(degree=permutation["trend"],
                                                discount=0.5)
                    model = model + pydlm.seasonality(period=self.period,
                                                      discount=0.99)
                    if permutation["ar"] is not None:
                        model = model + pydlm.autoReg(degree=permutation["ar"],
                                                      discount=0.99)
                    if x is not None:
                        for variable_id, x_variable in enumerate(x_train.T):
                            model = model + pydlm.dynamic(
                                features=[[v] for v in x_variable],
                                discount=0.99,
                                name=str(variable_id))
                    with SuppressStdoutStderr():
                        model.tune()
                        model.fit()
                    if x is not None:
                        x_val_dict = {}
                        for variable_id, x_variable in enumerate(x_val.T):
                            x_val_dict.update(
                                {str(variable_id): [[v] for v in x_variable]})
                    else:
                        x_val_dict = None
                    y_pred = model.predictN(date=model.n - 1,
                                            N=len(y_val),
                                            featureDict=x_val_dict)[0]

                    score = metric_fun(y_val, y_pred)
                    scores.append(score)
            except Exception:
                scores.append(np.inf)

        best_params = params_permutations[np.nanargmin(scores)]
        self.params.update(best_params)
        self.params["tuned"] = True