# --- Load poll/Twitter data and derive initial Kalman-filter parameters (R, P0) ---
# NOTE(review): this section was reconstructed from a whitespace-mangled paste;
# statement order preserved.  Relies on module-level names defined elsewhere in
# the file: `m` (project helper module), `tPolls`, `tTwitter`, `dt` (datetime).

startTrain = 52      # number of days of data reserved before training/prediction starts
addFake = False      # whether shift_polls should append synthetic poll points
m.setFonts('timeseries')

### Load in data and normalise
twitterColumns = [0, 2]
pollColumns = [1, 3, 4, 5, 6, 7, 8, 9] # avdate, Remain (norm), Leave (norm)
# lh/rh: leave/remain tweet data; p: raw poll data (presumably — TODO confirm against m.getPanda)
lh, rh, p = m.getPanda(twitterColumns, pollColumns)
# Aggregate to a common time index; p_var carries per-column poll variance used for R below.
h_agg, p_agg, p_var = m.aggregate(lh, rh, p, splitPolls=False, interpolate=True)
# Keep unshifted copies before the time-alignment shifts mutate the working frames.
p_orig = p_agg.copy()
h_orig = h_agg.copy()
p_agg = m.shift_polls(p_agg, tPolls, addFake=addFake)
h_agg = m.shift_tweets(h_agg, tTwitter)
# Merge shifted polls and tweets into the combined frame fed to the Kalman filter.
kalmanData = m.getKalmanData(p_agg, h_agg)
startDate = kalmanData.index[0] + dt.timedelta(days=startTrain)
endDate = dt.datetime(day=23, month=6, year=2016)  # EU referendum date

### FIND KF VARIABLES: 1) R and 2) P0
# find R
preds = []
# Measurement noise R: mean of the per-poll variance for the Remain series.
R_r = p_var['Remain'].mean()
r = kalmanData['remain_perc'].to_numpy(dtype=float)
# Initial state covariance P0: sample variance of the tweet signal, scaled down by 10
# (heuristic scaling — rationale not documented here).
P0_r = r.var() / 10
H = 1  # scalar observation model
# Steady-state-style Kalman gain from P0, H, R ...
K_r = P0_r * H / (H * P0_r * H + R_r)
# NOTE(review): the computed gain above is immediately overridden with a hard-coded
# value, making the previous line dead code.  Presumably a manual tuning choice —
# confirm whether the formula-based K_r should be restored.
K_r = 0.95
# --- Reload data (aggregated and split online/telephone) and report the
# --- tweet-vs-poll correlation for the Remain share over the campaign window ---
# NOTE(review): reconstructed from a whitespace-mangled paste; statement order
# preserved.  Depends on externally defined names: `m`, `tPolls`, `tTwitter`,
# `interpolate` (bool, set elsewhere in the file), `dt`, `pd`, `np`.

twitterColumns = [0, 2]
pollColumns = [1, 3, 4, 5, 6, 7, 8, 9] # avdate, Remain (norm), Leave (norm)
lh, rh, p = m.getPanda(twitterColumns, pollColumns)
# Combined aggregation plus a second pass split into online/telephone polls.
h_agg, p_agg, p_var = m.aggregate(lh, rh, p, splitPolls=False, interpolate=interpolate)
_, p_onl, p_tel = m.aggregate(lh, rh, p, splitPolls=True, interpolate=interpolate)
# Time-align polls (aggregate, online, telephone) and tweets.
# NOTE(review): p_onl / p_tel are shifted here but not used in this section —
# presumably consumed further down the file; verify.
p_agg = m.shift_polls(p_agg, tPolls, addFake=False)
p_onl = m.shift_polls(p_onl, tPolls, addFake=False)
p_tel = m.shift_polls(p_tel, tPolls, addFake=False)
h_agg = m.shift_tweets(h_agg, tTwitter)

### CORRELATION AND FITTING ###
#only do non telephone and online split
startDate = dt.datetime(year=2016, month=3, day=1)
endDate = dt.datetime(year=2016, month=6, day=23)  # referendum day
h = h_agg['remain_perc'].loc[startDate:endDate]
# NOTE(review): `p` (raw poll frame loaded above) is rebound here to a Series —
# the original frame is discarded from this point on.
p = p_agg['Remain'].loc[startDate:endDate]
# Inner-join on the date index so the correlation only uses days present in both series.
X = pd.merge(h, p, left_index=True, right_index=True)
# Prints the 2x2 Pearson correlation matrix between tweet Remain share and poll Remain share.
print("Interpolated = " + str(interpolate) + "; all data: ", np.corrcoef(X['remain_perc'], X['Remain']))