def _average(original_df, smoothed_df, smooth_type):
    """Forecast the hold-out tail of a series with the training-set mean.

    The first 90% of the rows (rounded up) are the training set and the
    remaining rows are the test set. Every test row is predicted as the mean
    of the training ``units`` column.

    Args:
        original_df: Frame with a ``units`` column. Used as the working series
            when ``smooth_type`` is ``'normal'`` and always forwarded to
            ``prediction_error``.
        smoothed_df: Frame with a ``units`` column. Used as the working series
            for any other ``smooth_type``.
        smooth_type: ``'normal'`` selects ``original_df``; any other value
            selects ``smoothed_df``.

    Returns:
        Tuple ``(mdl, pe, acc_pi, avg_diff_pi)``: the training mean, the value
        returned by ``prediction_error`` and the two values returned by
        ``goodness_prediction_interval``.
    """
    from math import ceil, sqrt

    import numpy as np
    import pandas as pd
    from functions import (
        forecast_pred_int,
        goodness_prediction_interval,
        prediction_error,
    )

    series = original_df if smooth_type == 'normal' else smoothed_df

    # 90/10 chronological split; the boundary is rounded up.
    cut = ceil(len(series) * 0.9)
    train_part = series[0:cut]
    test_part = series[cut:]

    # The "model" is a single number: the mean of the training observations.
    mdl = train_part.units.mean()
    flat_forecast = test_part.copy()
    flat_forecast.loc[:, 'units'] = mdl

    pe = prediction_error(
        test_part.units,
        flat_forecast.units,
        original_df=original_df,
        smooth_type=smooth_type,
    )

    # Same forecast standard error for every horizon: the std of the test
    # residuals scaled by sqrt(1 + 1/len(train)).
    residual_std = np.std(test_part.units - flat_forecast.units)
    per_step_err = residual_std * sqrt(1 + 1 / len(train_part))
    fcasterr = pd.Series([per_step_err] * len(test_part))
    prediction_interval = forecast_pred_int(mdl, fcasterr, alpha=0.05)

    acc_pi, avg_diff_pi = goodness_prediction_interval(
        test_part, prediction_interval)

    return mdl, pe, acc_pi, avg_diff_pi
def MTM_compute_pe_window(df_bin, corr_bin_name, group_names, window_size):
    """Evaluate one-step-ahead MTM forecasts over sliding windows.

    For every window yielded by ``MTM_window`` (called with a length of
    ``window_size + 1``):

    1. build the transition matrix with ``MTM_matrix`` from all but the last
       two elements of the window;
    2. take the window's second-to-last element as the current state and
       forecast the state with the largest entry in that state's row. If the
       row has no non-zero entry, a random state index is drawn instead;
    3. record the forecast and the compared value.

    Args:
        df_bin: Binned series; ``df_bin.T.squeeze()`` is passed to
            ``MTM_window``.
        corr_bin_name: Forwarded unchanged to ``MTM_matrix``.
        group_names: List of bin names; a state's position in this list is
            its row/column index in the matrix.
        window_size: Window length parameter (``MTM_window`` receives
            ``window_size + 1``).

    Returns:
        Tuple ``(pe, pi_cr, pi_width)``: the RMSE between the recorded values
        and the forecasts, followed by the two values returned by
        ``goodness_prediction_interval``.

    Side effects:
        Re-seeds the global ``random`` generator with ``4`` on every call.
    """
    import random
    from math import sqrt

    import numpy as np
    from sklearn.metrics import mean_squared_error
    from functions import forecast_pred_int, goodness_prediction_interval

    # Fixed seed so the random jumps below are reproducible between runs.
    random.seed(4)

    actuals = []
    forecasts = []
    for window in MTM_window(df_bin.T.squeeze(), window_size + 1):
        matrix = MTM_matrix(window[:-2], corr_bin_name, group_names)

        current_state = window[-2]
        state_idx = group_names.index(current_state)
        # NOTE(review): the value recorded as the "actual" is the same element
        # used as the current state (window[-2]), not window[-1] — confirm
        # this is the intended comparison target.
        actuals.append(current_state)

        transition_row = matrix[state_idx]
        if np.any(transition_row):
            # Most likely next state (first one on ties).
            next_idx = transition_row.index(max(transition_row))
        else:
            # Row is all zeros: no information for this state, jump randomly.
            next_idx = random.choice(range(len(matrix)))
        forecasts.append(group_names[next_idx])

    pe = sqrt(mean_squared_error(actuals, forecasts))
    pi = forecast_pred_int(forecasts, pe, alpha=0.05)
    pi_cr, pi_width = goodness_prediction_interval(actuals, pi)
    return pe, pi_cr, pi_width
def state_space_UC(df1):
    """Fit an unobserved-components model and score it on a hold-out tail.

    The first 90% of the rows (rounded up) train a statsmodels
    ``UnobservedComponents`` model specified as ``'local level'``; the
    remaining rows are the test set.

    Args:
        df1: Frame with a ``units`` column. Its index must support
            ``get_loc`` on the ``pd.to_datetime`` of its last label.

    Returns:
        Tuple ``(pe, acc_pi, avg_diff_pi)``: the value returned by
        ``prediction_error`` and the two values returned by
        ``goodness_prediction_interval``.

    Side effects:
        Calls ``warnings.filterwarnings("ignore")``, which silences all
        warnings for the rest of the process.
    """
    import statsmodels.api as sm
    import warnings
    from math import ceil

    warnings.filterwarnings("ignore")

    import pandas as pd
    from functions import (
        forecast_pred_int,
        goodness_prediction_interval,
        prediction_error,
    )

    # Chronological 90/10 split; the boundary is rounded up.
    cut = ceil(len(df1) * 0.9)
    train_part = df1.iloc[0:cut, :]
    test_part = df1.iloc[cut:, :]

    model = sm.tsa.UnobservedComponents(
        train_part,
        'local level',
        stochastic_trend=True,
        stochastic_cycle=True,
        irregular=True,
    )
    fitted = model.fit()

    # Predict up to the position of the last test label, then keep only the
    # part that lines up with the test set.
    last_label = str(test_part.index[-1])
    end_pos = df1.index.get_loc(pd.to_datetime(last_label))
    predict = fitted.predict(end=end_pos)
    predict = predict[-len(test_part):]

    pe = prediction_error(
        test_part.units, predict, original_df=df1, smooth_type='normal')

    predict_ci = forecast_pred_int(predict, pe, alpha=0.05)
    acc_pi, avg_diff_pi = goodness_prediction_interval(test_part, predict_ci)
    return pe, acc_pi, avg_diff_pi
def _naive(original_df, smoothed_df, smooth_type):
    """Forecast the hold-out tail of a series with the last training value.

    The first 90% of the rows (rounded up) are the training set and the
    remaining rows are the test set. Every test row is predicted as the
    ``units`` value of the final training row.

    Args:
        original_df: Frame with a ``units`` column. Used as the working series
            when ``smooth_type`` is ``'normal'`` and always forwarded to
            ``prediction_error``.
        smoothed_df: Frame with a ``units`` column. Used as the working series
            for any other ``smooth_type``.
        smooth_type: ``'normal'`` selects ``original_df``; any other value
            selects ``smoothed_df``.

    Returns:
        Tuple ``(mdl, pe, acc_pi, avg_diff_pi)``: the last training value, the
        value returned by ``prediction_error`` and the two values returned by
        ``goodness_prediction_interval``.
    """
    from math import ceil

    import numpy as np
    from functions import (
        forecast_pred_int,
        goodness_prediction_interval,
        prediction_error,
    )

    series = original_df if smooth_type == 'normal' else smoothed_df

    # Chronological 90/10 split; the boundary is rounded up.
    cut = ceil(len(series) * 0.9)
    train_part = series[0:cut]
    test_part = series[cut:]

    # Carry the last observed training value forward over the whole test set.
    flat_forecast = test_part.copy()
    mdl = train_part.iloc[-1].units
    flat_forecast.units = mdl

    pe = prediction_error(
        test_part.units,
        flat_forecast.units,
        original_df=original_df,
        smooth_type=smooth_type,
    )

    # Forecast standard error grows with sqrt(horizon), h = 1..len(test).
    horizons = np.arange(1, len(test_part) + 1)
    residual_std = np.std(test_part.units - flat_forecast.units)
    fcasterr = residual_std * np.sqrt(horizons)
    prediction_interval = forecast_pred_int(mdl, fcasterr, alpha=0.05)

    acc_pi, avg_diff_pi = goodness_prediction_interval(
        test_part, prediction_interval)

    return mdl, pe, acc_pi, avg_diff_pi
def state_space_SARIMAX(df1):
    """Grid-search a SARIMAX order by HQIC and score it on a hold-out tail.

    The first 90% of the rows (rounded up) are the training set. Every
    ``(p, d, q)`` with each component in ``0..4`` is fitted on the training
    set; orders whose fit raises are skipped, and the fit with the lowest
    ``hqic`` is kept. That fit is then used to predict the test rows.

    Args:
        df1: Frame with a ``units`` column. Its index must support
            ``get_loc`` on the ``pd.to_datetime`` of its last label.

    Returns:
        Tuple ``(pe, acc_pi, avg_diff_pi)``: the value returned by
        ``prediction_error`` and the two values returned by
        ``goodness_prediction_interval``.

    Raises:
        RuntimeError: If none of the candidate orders could be fitted.

    Side effects:
        Calls ``warnings.filterwarnings("ignore")``, which silences all
        warnings for the rest of the process.
    """
    import statsmodels.tsa.statespace.sarimax as sm
    import warnings
    from itertools import product
    from math import ceil

    warnings.filterwarnings("ignore")

    import numpy as np
    import pandas as pd
    from functions import (
        forecast_pred_int,
        goodness_prediction_interval,
        prediction_error,
    )

    # Chronological 90/10 split; the boundary is rounded up.
    cut = ceil(len(df1) * 0.9)
    df_train = df1.iloc[0:cut, :]
    df_test = df1.iloc[cut:, :]

    # Exhaustive search over (p, d, q) in 0..4, p varying slowest.
    best_hqic = np.inf
    best_mdl = None
    for order in product(range(5), repeat=3):
        try:
            tmp_mdl = sm.SARIMAX(df_train, order=order).fit()
            tmp_hqic = tmp_mdl.hqic
        except Exception:
            # Best effort: many orders are not estimable on a given series.
            # ``Exception`` (not a bare ``except``) keeps Ctrl+C and
            # SystemExit working during this long-running search.
            continue
        if tmp_hqic < best_hqic:
            best_hqic = tmp_hqic
            best_mdl = tmp_mdl

    if best_mdl is None:
        raise RuntimeError(
            "state_space_SARIMAX: no SARIMAX(p, d, q) order with "
            "p, d, q in 0..4 could be fitted to the training data")

    # Predict up to the position of the last test label, then keep only the
    # part that lines up with the test set.
    lastdate = str(df_test.index[-1])
    predict = best_mdl.predict(end=df1.index.get_loc(pd.to_datetime(lastdate)))
    predict = predict[-len(df_test):]

    pe = prediction_error(
        df_test.units, predict, original_df=df1, smooth_type='normal')

    predict_ci = forecast_pred_int(predict, pe, alpha=0.05)
    acc_pi, avg_diff_pi = goodness_prediction_interval(df_test, predict_ci)
    return pe, acc_pi, avg_diff_pi
def MTM_model(df):
    """Bin a series, fit a first-order transition matrix and score it.

    Steps:

    1. Bin ``df.units`` on quantile boundaries. Each bin is named by the
       midpoint of its boundaries. The number of quantiles starts at
       ``min(ceil(len(df) / 4), 100)`` and is halved until no bin contains
       exactly one observation.
    2. Replace ``df.units`` with the bin midpoints and split 90/10
       (boundary rounded up) into training and test rows.
    3. Count bin-to-bin transitions between consecutive *training* rows and
       normalise each non-empty row to probabilities.
    4. Starting from the last training state, repeatedly move to the most
       probable next state (first one on ties) to forecast every test row.

    Args:
        df: Frame with a ``units`` column; the quantiles are taken from the
            first column (presumably the same ``units`` column — verify
            against callers).

    Returns:
        Tuple ``(rmse, acc_pi, width_pi)``: the RMSE between the test rows
        and the forecasts, followed by the two values returned by
        ``goodness_prediction_interval``.

    Side effects:
        ``df.units`` is overwritten in place with the binned values.
    """
    from math import ceil, sqrt

    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    from functions import forecast_pred_int, goodness_prediction_interval

    number_quantile = min(ceil(len(df) / 4), 100)
    while True:
        probs = np.arange(0, number_quantile + 1) / number_quantile
        # np.unique collapses repeated quantile values into one boundary.
        bins = np.unique(np.quantile(df.iloc[:, 0], probs))
        # Bin names are midpoints of the *unshifted* boundaries.
        group_names = [(lo + hi) / 2 for lo, hi in zip(bins[:-1], bins[1:])]
        # Shift the lowest edge down so the minimum falls inside the first
        # (left-open) bin.
        bins[0] = bins[0] - 0.1
        # The value that maps each observation to its bin midpoint.
        bin_units = pd.cut(df.units, bins, labels=group_names).astype(float)
        # The 1-based bin number of each observation.
        corr_bin_name = pd.cut(
            df.units, bins, labels=range(1, len(bins))).astype(int)
        number_quantile = ceil(number_quantile / 2)
        # Stop once no bin holds exactly one observation.
        if 1 not in pd.Series(corr_bin_name).value_counts().values:
            break

    # Sanity output if boundaries or names are ever not unique.
    if not pd.Series(bins).is_unique:
        print(bins)
    if not pd.Series(group_names).is_unique:
        print(group_names)

    df.units = bin_units

    cut = ceil(len(df) * 0.9)
    df_train = df[0:cut]
    df_test = df[cut:]

    # Count transitions using training rows only. The final pair is
    # (train[-2], train[-1]); the step from the last training row into the
    # first test row must not be counted, because it is the very transition
    # being forecast.
    n = len(group_names)
    M = [[0] * n for _ in range(n)]
    train_states = corr_bin_name.iloc[:len(df_train)].tolist()
    for i, j in zip(train_states, train_states[1:]):
        M[i - 1][j - 1] += 1

    # Convert counts to probabilities; rows never left stay all-zero.
    for row in M:
        s = sum(row)
        if s > 0:
            row[:] = [f / s for f in row]

    # Iterate the chain from the last training state, always taking the most
    # probable next state.
    last = group_names.index(df_train.iloc[-1, 0])
    prediction = []
    for _ in range(len(df_test)):
        last = M[last].index(max(M[last]))
        prediction.append(group_names[last])

    rmse = sqrt(mean_squared_error(df_test, prediction))

    prediction_interval = forecast_pred_int(prediction, rmse, alpha=0.05)
    acc_pi, width_pi = goodness_prediction_interval(
        df_test, prediction_interval)
    return rmse, acc_pi, width_pi
def _fit_best_arima_by_hqic(values):
    """Fit ARIMA(p, d, q) for p, d, q in 0..4 and return the lowest-HQIC fit.

    Args:
        values: Training observations passed straight to ``ARIMA``.

    Returns:
        Tuple ``(best_order, best_mdl)``: the selected ``(p, d, q)`` and its
        fitted results object.

    Raises:
        RuntimeError: If none of the candidate orders could be fitted.
    """
    from itertools import product

    import numpy as np
    from statsmodels.tsa.arima_model import ARIMA

    best_hqic = np.inf
    best_order = None
    best_mdl = None
    # product() iterates with p slowest and q fastest.
    for order in product(range(5), repeat=3):
        try:
            tmp_mdl = ARIMA(values, order=order).fit(method='mle', trend='nc')
            tmp_hqic = tmp_mdl.hqic
        except Exception:
            # Best effort: many orders are not estimable on a given series.
            # ``Exception`` (not a bare ``except``) keeps Ctrl+C and
            # SystemExit working during this long-running search.
            continue
        if tmp_hqic < best_hqic:
            best_hqic = tmp_hqic
            best_order = order
            best_mdl = tmp_mdl

    if best_mdl is None:
        raise RuntimeError(
            "no ARIMA(p, d, q) order with p, d, q in 0..4 could be fitted "
            "to the training data")
    return best_order, best_mdl


def automation_single_ts_arma_analysis(original_df, smoothed_df, smooth_type,
                                       inclusion, stationarity):
    """Select an ARIMA order by HQIC and score its hold-out forecast.

    When ``stationarity`` is ``True`` the model is fitted directly on the
    first 90% (rounded up) of the selected series. Otherwise the series is
    first decomposed with ``stldecompose.decompose``, the model is fitted on
    the first 90% of the residual component, and the seasonal and trend
    components over the test range are added to the forecast interval.

    Args:
        original_df: Frame with a ``units`` column. Used as the working series
            when ``smooth_type`` is ``'normal'`` and always forwarded to
            ``prediction_error``.
        smoothed_df: Frame with a ``units`` column. Used as the working series
            for any other ``smooth_type``.
        smooth_type: ``'normal'`` selects ``original_df``; any other value
            selects ``smoothed_df``.
        inclusion: Only read for non-stationary series with at least 130 rows:
            ``False`` selects a decomposition period of 251, anything else
            365.
        stationarity: ``True`` skips the decomposition.

    Returns:
        Tuple ``(best_order, pe, acc_pi, avg_diff_pi)``: the selected
        ``(p, d, q)``, the value returned by ``prediction_error`` and the two
        values returned by ``goodness_prediction_interval``.

    Raises:
        RuntimeError: If none of the candidate ARIMA orders could be fitted.
    """
    from math import ceil

    import numpy as np
    from functions import goodness_prediction_interval, prediction_error

    ts = original_df if smooth_type == 'normal' else smoothed_df

    # Explicit ``== True`` / ``== False`` comparisons are kept on purpose so
    # that non-boolean arguments (e.g. None) take the same branch as before.
    if stationarity == True:  # noqa: E712
        cut = ceil(len(ts) * 0.9)
        ts_train = ts[0:cut]
        ts_test = ts[cut:]

        best_order, best_mdl = _fit_best_arima_by_hqic(ts_train.values)
        ts_forecast, std_error, prediction_interval = best_mdl.forecast(
            len(ts_test))
    else:
        from stldecompose import decompose

        # Fewer than 130 rows is treated as weekly data (period 52);
        # otherwise the period depends on ``inclusion``.
        if len(ts) < 130:
            period = 52
        elif inclusion == False:  # noqa: E712
            period = 251
        else:
            period = 365
        stl = decompose(ts, period=period)

        # Fit the model on the residual component only.
        cut = ceil(len(stl.resid) * 0.9)
        ts_train = stl.resid[0:cut]
        ts_test = stl.resid[cut:]

        best_order, best_mdl = _fit_best_arima_by_hqic(ts_train.values)
        ts_forecast, std_error, prediction_interval = best_mdl.forecast(
            len(ts_test))

        # Add the seasonal and trend components over the test range back
        # onto both columns of the interval.
        start = ts_test.index[0].to_pydatetime()
        end = ts_test.index[-1].to_pydatetime()
        difference = (stl.seasonal.units.loc[start:end]
                      + stl.trend.units.loc[start:end])
        prediction_interval = np.apply_along_axis(
            lambda column: column + difference, 0, prediction_interval)

        # NOTE(review): in this branch ``ts_test`` and ``ts_forecast`` are on
        # the residual scale while ``prediction_interval`` has trend and
        # seasonality added back — confirm the scoring below compares like
        # with like.

    pe = prediction_error(ts_test.units, ts_forecast,
                          original_df=original_df, smooth_type=smooth_type)

    acc_pi, avg_diff_pi = goodness_prediction_interval(
        ts_test, prediction_interval)

    return best_order, pe, acc_pi, avg_diff_pi