twitterColumns = [0, 2]
pollColumns = [1, 3, 4, 5, 6, 7, 8, 9]  # avdate, Remain (norm), Leave (norm)
lh, rh, p = m.getPanda(twitterColumns, pollColumns)
h_agg, p_agg, p_var = m.aggregate(lh, rh, p, splitPolls=False, interpolate=interpolate)
_, p_onl, p_tel = m.aggregate(lh, rh, p, splitPolls=True, interpolate=interpolate)
kalmanData = m.getKalmanData(p_agg, h_agg)    # all polls combined with Twitter data
kalmanData_o = m.getKalmanData(p_onl, h_agg)  # online polls only
kalmanData_t = m.getKalmanData(p_tel, h_agg)  # telephone polls only

# 1. Moving average (3-day centred window)
df_orig = kalmanData
df_ma = df_orig.rolling(3, center=True, closed='both').mean()

# 2. LOESS smoothing (5% and 15% of the data per local fit)
print("LOESS SMOOTHING")
df_loess_5 = pd.DataFrame(lowess(df_orig, np.arange(len(df_orig)), frac=0.05)[:, 1],
                          index=df_orig.index, columns=['remain_perc'])
df_loess_15 = pd.DataFrame(lowess(df_orig, np.arange(len(df_orig)), frac=0.15)[:, 1],
                           index=df_orig.index, columns=['remain_perc'])
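# --- Hedged sketch (not part of the original script): overlay the raw series
# and the three smoothers above for a visual comparison of smoothing strength.
# Assumes matplotlib is available; np.asarray(...).ravel() accepts either a
# Series or a single-column DataFrame.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df_orig.index, np.asarray(df_orig).ravel(), alpha=0.3, label='original')
ax.plot(df_ma.index, np.asarray(df_ma).ravel(), label='3-day moving average')
ax.plot(df_loess_5.index, df_loess_5['remain_perc'], label='LOESS, frac=0.05')
ax.plot(df_loess_15.index, df_loess_15['remain_perc'], label='LOESS, frac=0.15')
ax.set_ylabel('Remain share')
ax.legend()
plt.show()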
### Load in data and normalise
twitterColumns = [0, 2]
pollColumns = [1, 3, 4, 5, 6, 7, 8, 9]  # avdate, Remain (norm), Leave (norm)
lh, rh, p = m.getPanda(twitterColumns, pollColumns)
h_agg, p_agg, p_var = m.aggregate(lh, rh, p, splitPolls=False, interpolate=True)
p_orig = p_agg.copy()
h_orig = h_agg.copy()
p_agg = m.shift_polls(p_agg, tPolls, addFake=addFake)
h_agg = m.shift_tweets(h_agg, tTwitter)
kalmanData = m.getKalmanData(p_agg, h_agg)
startDate = kalmanData.index[0] + dt.timedelta(days=startTrain)
endDate = dt.datetime(day=23, month=6, year=2016)  # referendum day

### FIND KF VARIABLES: 1) R and 2) P0
# find R: measurement noise, estimated as the mean variance of the Remain polls
preds = []
R_r = p_var['Remain'].mean()
r = kalmanData['remain_perc'].to_numpy(dtype=float)
P0_r = r.var() / 10  # initial state covariance
H = 1                # observation model: the state is measured directly
K_r = P0_r * H / (H * P0_r * H + R_r)
K_r = 0.95  # NOTE: the computed gain is immediately overridden with a fixed value

### KF MODEL ###
# apply interpolation
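# --- Hedged sketch (not part of the original script): a minimal scalar Kalman
# filter pass using the quantities defined above, assuming a random-walk state
# transition with negligible process noise. The state x is the Remain share;
# each daily observation z in r drives a measurement update with the fixed
# gain K_r.
x_est = r[0]  # initialise the state with the first observation
for z in r[1:]:
    x_pred = x_est                           # predict: random-walk transition
    x_est = x_pred + K_r * (z - H * x_pred)  # update: correct with the innovation
    preds.append(x_est)
# With K_r fixed near 1, the filter tracks the observations closely and
# smooths only lightly.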
2) evaluates the fit of the optimal model using one-out predictions without retraining
3) calculates the performance of the optimal model when increasing training set size
"""

if __name__ == '__main__':
    startTrain = 53  # index at which to start training (corresponds to 1 March with interpolation)
    n_lag = 1   # number of autoregressive lags in the ARIMA model
    n_diff = 1  # number of differencing steps
    n_ma = 1    # number of moving-average terms

    ### Load in data and normalise
    twitterColumns = [0, 2]
    pollColumns = [1, 3, 4, 5, 6, 7, 8, 9]
    lh, rh, p = m.getPanda(twitterColumns, pollColumns)
    h_agg, p_agg, p_var = m.aggregate(lh, rh, p, splitPolls=False, interpolate=True)
    kalmanData = m.getKalmanData(p_agg, h_agg)  # DataFrame holding both Twitter and polling data
    all_data = kalmanData['remain_perc'].iloc[startTrain:]
    remain_data = all_data.values
    dates_train = all_data.index

    # prepare training and test sets (the last 20% of the series is held out)
    startDate = kalmanData.index[0] + dt.timedelta(days=startTrain + n_lag + n_diff)
    endDate = kalmanData.index[-1]
    pred_dates = pd.date_range(start=startDate, end=endDate)
    end_train = math.floor(len(remain_data) * 0.2)
    predictions = []
    m.setFonts('timeseries')
    test = remain_data[-end_train:]
    train = remain_data[:-end_train]
    history = train.tolist()
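    # --- Hedged sketch (not part of the original script): a one-step-ahead
    # walk-forward loop with statsmodels' ARIMA, refitting on an expanding
    # history. The docstring above says the optimal model is evaluated without
    # retraining; this variant refits at every step for clarity. The order
    # (n_lag, n_diff, n_ma) matches the settings above.
    from statsmodels.tsa.arima.model import ARIMA

    for obs in test:
        model = ARIMA(history, order=(n_lag, n_diff, n_ma))
        fitted = model.fit()
        yhat = fitted.forecast(steps=1)[0]  # one-step-ahead forecast
        predictions.append(yhat)
        history.append(obs)  # expand the training history with the true observation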