Example #1
def _average(original_df, smoothed_df, smooth_type):
    #average (mean) benchmark: forecast every test point with the training-set mean
    from math import ceil, sqrt
    import numpy as np
    import pandas as pd
    from functions import goodness_prediction_interval, forecast_pred_int, prediction_error

    if smooth_type == 'normal':
        df = original_df
    else:
        df = smoothed_df

    #split to training and test set
    df_train = df[0:ceil(len(df) * 0.9)]
    df_test = df[ceil(len(df) * 0.9):]

    #calculate the average of the training set and use it as the forecast
    mdl = df_train.units.mean()
    prediction = df_test.copy()
    prediction.loc[:, 'units'] = mdl

    #compute prediction error
    pe = prediction_error(df_test.units,
                          prediction.units,
                          original_df=original_df,
                          smooth_type=smooth_type)

    ########Calculate the prediction intervals######
    fcasterr = [
        np.std(df_test.units - prediction.units) * sqrt(1 + 1 / len(df_train))
    ] * len(df_test)
    prediction_interval = forecast_pred_int(mdl,
                                            pd.Series(fcasterr),
                                            alpha=0.05)

    #######Assess the goodness of prediction interval########################
    acc_pi, avg_diff_pi = goodness_prediction_interval(df_test,
                                                       prediction_interval)

    #    ############Plot the prediction and prediction intervals###################
    #    from func_visualisation import plot_prediction
    #    plot_prediction(df, prediction, prediction_interval)
    #

    return mdl, pe, acc_pi, avg_diff_pi
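
The interval above follows the usual mean-benchmark formula, point forecast ± z · σ̂ · sqrt(1 + 1/n); the project's own forecast_pred_int, prediction_error and goodness_prediction_interval live in its functions module and are not shown here. A minimal self-contained sketch of that formula (the helper name is hypothetical, and σ̂ is taken from the training sample, whereas the listing estimates it from the test errors):

import numpy as np

def mean_forecast_interval(train, z=1.96):
    #point forecast is the training mean; the 95% half-width scales the
    #residual standard deviation by sqrt(1 + 1/n)
    n = len(train)
    point = train.mean()
    sigma = train.std(ddof=1)
    half_width = z * sigma * np.sqrt(1 + 1 / n)
    return point - half_width, point, point + half_width

lower, point, upper = mean_forecast_interval(np.array([10., 12., 11., 13., 12.]))
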
Example #2
def MTM_compute_pe_window(df_bin, corr_bin_name, group_names, window_size):
    #for each rolling window:
    #    1. build the Markov transition matrix (MTM); if the row of the
    #       current state is all zeros, make a random jump instead
    #    2. make the one-step-ahead forecast and record it
    #then calculate the overall prediction error and the prediction interval
    #based on the one-step-ahead forecasts.
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    from functions import forecast_pred_int, goodness_prediction_interval
    import random
    random.seed(4)
    import numpy as np
    
    ts_test = []
    ts_forecast = []
    
    
    for df_window in MTM_window(df_bin.T.squeeze(), window_size + 1):
        #print (df_window)
        M = MTM_matrix(df_window[: -2], corr_bin_name, group_names)
        #df_window = list(df_window)
        
        last = group_names.index(df_window[-2])
        ts_test.append(df_window[-2])
        
        if not np.any(M[last]):  #M[last] is all zeros: the state was never seen, so jump randomly
            one_step_forecast = random.choice(range(0, len(M)))
        else:
            one_step_forecast = M[last].index(max(M[last]))
        
        one_step_forecast = group_names[one_step_forecast]
        ts_forecast.append(one_step_forecast)
        
   
    pe = sqrt(mean_squared_error(ts_test, ts_forecast))
     
    pi = forecast_pred_int(ts_forecast, pe, alpha = 0.05)
    
    pi_cr, pi_width = goodness_prediction_interval(ts_test, pi)
    
    return pe, pi_cr, pi_width
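
MTM_window and MTM_matrix are module-level helpers that are not shown in this listing. As a rough, hypothetical stand-in for the windowing step (not the project's actual implementation), a sliding-window generator could look like this:

def sliding_windows(seq, size):
    #yield consecutive, overlapping windows of a fixed size
    seq = list(seq)
    for start in range(len(seq) - size + 1):
        yield seq[start:start + size]

#list(sliding_windows([1, 2, 3, 4, 5], 3)) -> [[1, 2, 3], [2, 3, 4], [3, 4, 5]]
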
Example #3
def state_space_UC(df1):
    import statsmodels.api as sm
    from math import ceil
    import warnings
    warnings.filterwarnings("ignore")
    from functions import goodness_prediction_interval, forecast_pred_int, prediction_error
    import pandas as pd

    ####Split the time series dataset into training and testing################
    df_train = df1.iloc[0:ceil(len(df1) * 0.9), :]
    df_test = df1.iloc[ceil(len(df1) * 0.9):, :]

    # Fit a local level model
    mdl = sm.tsa.UnobservedComponents(df_train,
                                      'local level',
                                      stochastic_trend=True,
                                      stochastic_cycle=True,
                                      irregular=True)

    res = mdl.fit()

    #firstdate = str(df_test.index[0])
    lastdate = str(df_test.index[-1])

    #ts_predict =  best_mdl.predict(start = ts_test.index[0].to_pydatetime(), end = ts_test.index[-1].to_pydatetime())
    predict = res.predict(end=df1.index.get_loc(pd.to_datetime(lastdate)))
    predict = predict[-len(df_test):]

    ########Compute the prediction error#############
    pe = prediction_error(df_test.units,
                          predict,
                          original_df=df1,
                          smooth_type='normal')

    ########Compute the prediction interval##########
    predict_ci = forecast_pred_int(predict, pe, alpha=0.05)

    #######Assess the goodness of prediction interval########################
    acc_pi, avg_diff_pi = goodness_prediction_interval(df_test, predict_ci)

    return pe, acc_pi, avg_diff_pi
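
As a reference point, statsmodels can return both the forecast and its interval directly from the fitted results via get_forecast, instead of going through predict plus the project's forecast_pred_int. A self-contained sketch on synthetic data (not part of the original pipeline):

import numpy as np
import statsmodels.api as sm

np.random.seed(0)
y = np.cumsum(np.random.normal(size=200)) + np.random.normal(size=200)

#fit a local level model on the first 180 points and forecast the rest
res = sm.tsa.UnobservedComponents(y[:180], 'local level').fit(disp=False)
fc = res.get_forecast(steps=20)
point = fc.predicted_mean           #out-of-sample point forecasts
interval = fc.conf_int(alpha=0.05)  #95% interval bounds
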
Example #4
def _naive(original_df, smoothed_df, smooth_type):
    from math import ceil
    import numpy as np
    from functions import goodness_prediction_interval, forecast_pred_int, prediction_error

    if smooth_type == 'normal':
        df = original_df
    else:
        df = smoothed_df

    #split to training and test set
    df_train = df[0:ceil(len(df) * 0.9)]
    df_test = df[ceil(len(df) * 0.9):]

    #naive forecast: repeat the last training observation for every test point
    prediction = df_test.copy()
    mdl = df_train.iloc[-1].units
    prediction.units = mdl

    #compute prediction error
    pe = prediction_error(df_test.units,
                          prediction.units,
                          original_df=original_df,
                          smooth_type=smooth_type)

    ########Calculate the prediction intervals######
    h = np.arange(1, len(df_test) + 1, 1)
    fcasterr = np.std(df_test.units - prediction.units) * np.sqrt(h)
    prediction_interval = forecast_pred_int(mdl, fcasterr, alpha=0.05)

    #######Assess the goodness of prediction interval########################
    acc_pi, avg_diff_pi = goodness_prediction_interval(df_test,
                                                       prediction_interval)

    #    ############Plot the prediction and prediction intervals###################
    #    from func_visualisation import plot_prediction
    #    plot_prediction(df, prediction, prediction_interval)
    #
    return mdl, pe, acc_pi, avg_diff_pi
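
The naive benchmark's interval widens with the horizon: the half-width is z · σ̂ · sqrt(h) for an h-step-ahead forecast. A minimal self-contained sketch (the helper name is hypothetical, and it estimates the one-step error from training differences, whereas the listing above estimates it from the test errors):

import numpy as np

def naive_interval(train, horizon, z=1.96):
    #naive point forecast is the last observation; the interval widens
    #with sqrt(h) under roughly normal one-step errors
    last = train[-1]
    sigma = np.std(np.diff(train), ddof=1)
    h = np.arange(1, horizon + 1)
    half_width = z * sigma * np.sqrt(h)
    return last - half_width, last + half_width

lower, upper = naive_interval(np.array([10., 11., 9., 12., 11.]), horizon=3)
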
Example #5
def state_space_SARIMAX(df1):
    import statsmodels.tsa.statespace.sarimax as sm
    from math import ceil
    import warnings
    warnings.filterwarnings("ignore")
    import numpy as np
    import pandas as pd
    from functions import goodness_prediction_interval, forecast_pred_int, prediction_error

    ####Split the time series dataset into training and testing################
    df_train = df1.iloc[0:ceil(len(df1) * 0.9), :]
    df_test = df1.iloc[ceil(len(df1) * 0.9):, :]

    #grid-search the best (p, d, q) order by HQIC
    best_hqic = np.inf
    best_order = None
    best_mdl = None

    rng = range(5)
    for p in rng:
        for d in rng:
            for q in rng:
                try:
                    tmp_mdl = sm.SARIMAX(df_train, order=(p, d, q)).fit()
                    tmp_hqic = tmp_mdl.hqic
                    if tmp_hqic < best_hqic:
                        best_hqic = tmp_hqic
                        best_order = (p, d, q)
                        best_mdl = tmp_mdl
                except Exception:
                    continue
    #print('hqic: {:6.5f} | order: {}'.format(best_hqic, best_order))

    res = best_mdl

    lastdate = str(df_test.index[-1])

    #predict =  best_mdl.predict(dynamic = df_test.index[0].to_pydatetime(), end = df_test.index[-1].to_pydatetime())
    predict = res.predict(end=df1.index.get_loc(pd.to_datetime(lastdate)))
    predict = predict[-len(df_test):]

    ########Compute the prediction error#############
    pe = prediction_error(df_test.units,
                          predict,
                          original_df=df1,
                          smooth_type='normal')

    ########Compute the prediction interval##########
    predict_ci = forecast_pred_int(predict, pe, alpha=0.05)

    #######Assess the goodness of prediction interval########################
    acc_pi, avg_diff_pi = goodness_prediction_interval(df_test, predict_ci)

    return pe, acc_pi, avg_diff_pi
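
The triple loop can be written more compactly with itertools.product without changing the HQIC criterion. A self-contained sketch on synthetic data (a smaller range(3) grid is used here only to keep the run short):

import itertools
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX

np.random.seed(1)
y = np.cumsum(np.random.normal(size=120))

best_hqic, best_order, best_res = np.inf, None, None
for order in itertools.product(range(3), repeat=3):
    try:
        res = SARIMAX(y, order=order).fit(disp=False)
    except Exception:
        continue
    if res.hqic < best_hqic:
        best_hqic, best_order, best_res = res.hqic, order, res
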
Example #6
def MTM_model(df):
    #input: the df to analyse
    #output: the prediction error (rmse), the prediction-interval accuracy
    #        (acc_pi) and the prediction-interval width (width_pi)
    
    import pandas as pd
    import numpy as np
    import copy
    from math import ceil
    

    #############STEPS:
    #convert the df to binned values, with bin name as the average of bin boundaries    
    number_quantile = min(ceil(len(df)/4), 100)
    
    while True :
        bins = np.unique(np.quantile(df.iloc[:, 0], np.arange(0, number_quantile+1)/number_quantile)) 
        group_names = []
        corr_bin_name = []
        bin_units = []
        for i in range(1,len(bins)):
            name = (bins[i-1]+ bins[i])/2
            group_names.append(name)
 
        bins[0] = bins[0] - 0.1  #widen the first edge so the minimum value falls inside the first bin
        
        #the value that maps the observation to a unique bin
        bin_units = (pd.cut(df.units, bins, labels = group_names)).astype(float)
        #the bin number
        corr_bin_name =  (pd.cut(df.units, bins, labels = range(1, len(bins)))).astype(int)
        #halve the number of quantiles for the next attempt; stop once no bin
        #contains exactly one observation
        number_quantile = ceil(number_quantile/2)
        if 1 not in pd.Series(corr_bin_name).value_counts().values:
            break
    
    #check whether the bins and group names are unique
    if not pd.Series(bins).is_unique:
        print (bins)
    if not pd.Series(group_names).is_unique:
        print (group_names)
    
    df.units = bin_units
 
    
    #calculate the Markov transition matrix (MTM) on the training set
    df_train = df[0: ceil(len(df)*0.9)]
    df_test = df[ceil(len(df)*0.9):]
    
    ###Input: the converted time series and the bins.
    n = len(group_names)
    
    M = [[0]*n for _ in range(n)]

    for (i,j) in zip(corr_bin_name, corr_bin_name[1:len(df_train)+1]):
        #print (i,j)
        M[i-1][j-1] += 1

    #now convert to probabilities:
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f/s for f in row]
    
    
    #do the prediction and calculate the required values
    last = group_names.index(df_train.iloc[-1, 0])
    prediction = []
    
    for i in range(1, len(df_test) + 1):
        temp = M[last].index(max(M[last]))

        #####calculate the prediction value: move to the most probable next
        #####state and record its bin name
        last = copy.deepcopy(temp)
        prediction.append(group_names[temp])
   
    #calculate the prediction error
    from sklearn.metrics import mean_squared_error
    from math import sqrt
    rmse = sqrt(mean_squared_error(df_test, prediction)) 
    
    #####calculate the prediction interval#######
    from functions import forecast_pred_int, goodness_prediction_interval
    prediction_interval = forecast_pred_int(prediction, rmse, alpha = 0.05)
    acc_pi, width_pi = goodness_prediction_interval(df_test, prediction_interval)
      
    return rmse, acc_pi, width_pi
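
The heart of MTM_model is the transition matrix: count state-to-state transitions, normalise each row into probabilities, then forecast by repeatedly jumping to the most probable next state. A self-contained numpy sketch of just that idea, on hypothetical toy data rather than the project's binned series:

import numpy as np

states = [0, 1, 1, 2, 1, 0, 1, 2, 2, 1]   #an already-binned series
n = 3                                     #number of bins/states

#count the transitions, then normalise each row to probabilities
M = np.zeros((n, n))
for i, j in zip(states[:-1], states[1:]):
    M[i, j] += 1
row_sums = M.sum(axis=1, keepdims=True)
M = np.divide(M, row_sums, out=np.zeros_like(M), where=row_sums > 0)

#iterated most-likely-state forecast, three steps ahead
last = states[-1]
forecast = []
for _ in range(3):
    last = int(M[last].argmax())
    forecast.append(last)
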
Example #7
def automation_single_ts_arma_analysis(original_df, smoothed_df, smooth_type,
                                       inclusion, stationarity):
    from statsmodels.tsa.arima_model import ARIMA
    from math import ceil
    import numpy as np
    import pandas as pd
    from functions import goodness_prediction_interval, forecast_pred_int, prediction_error

    if smooth_type == 'normal':
        ts = original_df
    else:
        ts = smoothed_df

    if stationarity:
        ###Split the time series dataset into training and testing################
        ts_train = ts[0:ceil(len(ts) * 0.9)]
        ts_test = ts[ceil(len(ts) * 0.9):]

        #grid-search the best (p, d, q) ARIMA order by HQIC
        best_hqic = np.inf
        best_order = None
        best_mdl = None

        rng = range(5)
        for p in rng:
            for d in rng:
                for q in rng:
                    try:
                        tmp_mdl = ARIMA(ts_train.values,
                                        order=(p, d, q)).fit(method='mle',
                                                             trend='nc')
                        tmp_hqic = tmp_mdl.hqic
                        if tmp_hqic < best_hqic:
                            best_hqic = tmp_hqic
                            best_order = (p, d, q)
                            best_mdl = tmp_mdl
                    except Exception:
                        continue
        #print('hqic: {:6.5f} | order: {}'.format(best_hqic, best_order))

        #the .plot_predict function has problems; the forecast is produced with .forecast() below
        firstdate = str(ts_test.index[0])
        lastdate = str(ts_test.index[-1])
        #ts_predict =  best_mdl.predict(start = ts_test.index[0].to_pydatetime(), end = ts_test.index[-1].to_pydatetime())
        #ts_predict = best_mdl.predict(start = ts.index.get_loc(pd.to_datetime(firstdate)), end = ts.index.get_loc(pd.to_datetime(lastdate)))

        ###forecast the test horizon; .forecast returns the point forecast,
        ###its standard error and the prediction interval
        ts_forecast, std_error, prediction_interval = best_mdl.forecast(
            len(ts_test))

    else:
        #####remove trend and seasonality from the time series.#################
        from stldecompose import decompose

        #####if the series has fewer than 130 observations it is weekly data,
        #####so use a seasonal period of 52; otherwise it is daily data
        if len(ts) < 130:
            stl = decompose(ts, period=52)
        else:
            if not inclusion:
                stl = decompose(ts, period=251)
            else:
                stl = decompose(ts, period=365)

        ######Fit ARMA on the Residual##############
        ts_train = stl.resid[0:ceil(len(stl.resid) * 0.9)]
        ts_test = stl.resid[ceil(len(stl.resid) * 0.9):]

        best_hqic = np.inf
        best_order = None
        best_mdl = None

        rng = range(5)
        for p in rng:
            for d in rng:
                for q in rng:
                    try:
                        tmp_mdl = ARIMA(ts_train.values,
                                        order=(p, d, q)).fit(method='mle',
                                                             trend='nc')
                        tmp_hqic = tmp_mdl.hqic
                        if tmp_hqic < best_hqic:
                            best_hqic = tmp_hqic
                            best_order = (p, d, q)
                            best_mdl = tmp_mdl
                    except Exception:
                        continue
        #print('hqic: {:6.5f} | order: {}'.format(best_hqic, best_order))

        #######Prediction#################
        firstdate = str(ts_test.index[0])
        lastdate = str(ts_test.index[-1])

        #ts_predict =  best_mdl.predict(start = ts_test.index[0].to_pydatetime(), end = ts_test.index[-1].to_pydatetime())
        ts_predict = best_mdl.predict(
            start=ts.index.get_loc(pd.to_datetime(firstdate)),
            end=ts.index.get_loc(pd.to_datetime(lastdate)))

        #######Add back the trend and seasonality########
        window = slice(ts_test.index[0].to_pydatetime(),
                       ts_test.index[-1].to_pydatetime())
        ts_predict = (stl.seasonal.units.loc[window] +
                      stl.trend.units.loc[window] +
                      pd.Series(index=ts_test.index, data=ts_predict))

        #########Compute the prediction interval#########
        ts_forecast, std_error, prediction_interval = best_mdl.forecast(
            len(ts_test))

        #shift both interval bounds back to the original scale by adding the
        #trend and seasonal components over the test window
        difference = (stl.seasonal.units.loc[window] +
                      stl.trend.units.loc[window])

        def f(a):
            return a + difference

        prediction_interval = np.apply_along_axis(f, 0, prediction_interval)

    ########Compute the prediction error#############
    pe = prediction_error(ts_test.units,
                          ts_forecast,
                          original_df=original_df,
                          smooth_type=smooth_type)

    #######Assess the goodness of prediction interval########################
    acc_pi, avg_diff_pi = goodness_prediction_interval(ts_test,
                                                       prediction_interval)

    #    ############Plot the prediction and prediction intervals###################
    #    from func_visualisation import plot_prediction
    #    plot_prediction(df, prediction, prediction_interval)
    #

    return best_order, pe, acc_pi, avg_diff_pi
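
The stldecompose package used above is a third-party library; statsmodels ships its own STL implementation, and the same decompose, model-the-residual, add-back pattern can be sketched with it as below (synthetic monthly data and an arbitrary (1, 0, 1) order, not the original pipeline):

import numpy as np
import pandas as pd
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.statespace.sarimax import SARIMAX

np.random.seed(2)
idx = pd.date_range('2015-01-01', periods=120, freq='MS')
y = pd.Series(10 + 0.05 * np.arange(120)
              + 2 * np.sin(2 * np.pi * np.arange(120) / 12)
              + np.random.normal(scale=0.5, size=120), index=idx)

#decompose the full series, then fit an ARMA-type model on 90% of the residuals
stl = STL(y, period=12).fit()
res = SARIMAX(stl.resid.values[:108], order=(1, 0, 1)).fit(disp=False)

#forecast the residuals over the held-out window, then add the trend and
#seasonal components back to return to the original scale
resid_fc = res.forecast(steps=12)
forecast = pd.Series(resid_fc + stl.trend.values[-12:] + stl.seasonal.values[-12:],
                     index=idx[-12:])
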