-
Notifications
You must be signed in to change notification settings - Fork 0
/
KNN.py
234 lines (213 loc) · 12.1 KB
/
KNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
import os

import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
# from sklearn import preprocessing
# from sklearn.covariance import EllipticEnvelope
# from sklearn.ensemble import IsolationForest
# Render every `fig.show()` call through plotly's 'browser' renderer.
pio.renderers.default = 'browser'

# TODO:
#   - residual demands minus imports/exports to neighbouring countries
#   - Eurostat values
#
# Done:
#   - check whether outliers are removed properly (why the percentage error increases when
#     outliers are removed, and how the model choice is affected by removing them)
#   - outlier removal (95% percentile)
#   - analysis of the demand range of the high-error hours: mid
#   - sum of residual demands: nope, 15%
#   - run for a given period to see whether splitting the data into seasons helps: no help
#   - multiple renewable technologies
#   - figure based on hours of the year (does the error happen in a certain season?):
#     plot and run for 4 to 8 - worst predictions and increased price anomalies
#   - including demand as an input (to match hours with similar underlying demand, not
#     residual demand, behaviour): triple increased price!
#   - checking whether the predictor's error is symmetric: yes, good

# parameters
# Country code; used as the prefix of the input CSV name and of the output figure names.
country = 'FR'
# Wind-data columns to use. Alternative: tech = 'onshore'  (other column names noted by
# the author: offshore, national)
tech = ["onshore", "offshore"]
# Only wind-data rows whose index label starts with this string are used.
year = '2019'
# One capacity value per entry of `tech`; each multiplies the matching wind-data column.
# Current values are labelled 2023 by the author; the 2028 alternative is
# installed_res_cap = [34100 - 13610, 4700].
# NOTE(review): units (presumably MW) and the meaning of the subtracted 13610 are not
# stated anywhere in this file - TODO confirm.
installed_res_cap = [24600 - 13610, 2400]
# importing data points
# Market data of the modelled country; the first CSV column becomes the index.
country_data = pd.read_csv(country + '_data_missing_data_handeled.csv', index_col=0)
# country_data = country_data.iloc[int((5/12)*8760):int((9/12)*8760), :]  # uncomment to run for a given period only

# Neighbouring countries are read off the second '_'-separated token of the
# 'Imported_*' and 'Exported_*' column names.
import_countries = [col.split('_')[1] for col in country_data.columns if col.startswith('Imported_')]
export_countries = [col.split('_')[1] for col in country_data.columns if col.startswith('Exported_')]

# sorted() instead of list(set(...)): iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which made the order of the appended columns -
# and therefore the feature-column order of X - differ from run to run.
neighbours = sorted(set(import_countries + export_countries))
for neighbour in neighbours:
    neighbour_data = pd.read_csv(neighbour + '_data_missing_data_handeled.csv', index_col=0)
    # Assignment aligns on the index; a new column is appended after the existing ones.
    country_data[neighbour + '_Residual_Demand'] = neighbour_data.loc[:, 'Residual_Demand']

# Feature columns: every column whose name ends in 'Residual_Demand'.
# Alternatives tried by the author:
#   residual_demand_columns = [i for i in country_data.columns if i.endswith('Residual_Demand')]
#   columns_to_consider = [i for i in country_data.columns if i.endswith('Demand')]
#   columns_to_consider.append("Demand")
columns_to_consider = [col for col in country_data.columns if col.endswith('Residual_Demand')]
# import wind data
# Other input files used by the author:
#   ninja_wind_country_FR_near-termfuture-merra-2_corrected.csv
#   ninja_wind_country_FR_long-termfuture-merra-2_corrected.csv
wind_data = pd.read_csv(
    'French Data/ninja_wind_country_FR_current-merra-2_corrected.csv',
    header=2,
    index_col=0,
)

# Keep only the rows whose index label starts with the selected year.
indices = [label for label in wind_data.index if label.startswith(year)]
wind_data_year_tech = wind_data.loc[indices, tech].values
# wind_data_year_tech[:] = 0.5  # do the simulation for a given average availability factor

if wind_data_year_tech.ndim > 1:
    # Several technologies: weight each column by its capacity and add them up per row.
    res_gen = (installed_res_cap * wind_data_year_tech).sum(axis=1)
else:
    # Single technology: one capacity times one column.
    res_gen = installed_res_cap * wind_data_year_tech
    # wind_data_year_tech = wind_data_year_tech[int((5/12)*8760):int((9/12)*8760)]  # uncomment to run for a given period only
# outlier detection
# Two-column matrix [Residual_Demand, Price]; it is only used to flag outlying rows.
X_FR = country_data['Residual_Demand'].values.reshape(-1, 1)
# X_FR = np.delete(X_FR, [1, 2, 3, 4, 5, 6], 1)
Y_Pr = country_data[['Price']].values.reshape(-1, 1)
X_Y = np.concatenate((X_FR, Y_Pr), axis=1)

# Unsupervised Outlier Detection using Local Outlier Factor (LOF)
# Author's tuning notes: played with n neighbours 20 50 30 cont 0.2 10:6.7 5:6.2
# NOTE(review): LOF is fitted on the raw, unscaled columns, so the column with the larger
# numeric range dominates the neighbour distances - confirm this is intended.
clf = LocalOutlierFactor(contamination=0.20, n_neighbors=5)
inliers = clf.fit_predict(X_Y)  # the code below treats label 1 as "inlier"
outlier_score = clf.negative_outlier_factor_

# Alternative detectors tried by the author:
# # isolation forest
# clf = IsolationForest(random_state=0, contamination="auto").fit(X_Y)
# inliers = clf.predict(X_Y)
# # covariance
# cov = EllipticEnvelope(random_state=0, contamination=0.2).fit(X_Y)
# inliers = cov.predict(X_Y)

inlier_rows = inliers[inliers == 1]
print('Number of inliners are ', str(len(inlier_rows)))
# keeping only the inliers (all variables)
inlier_mask = inliers == 1
X = country_data.loc[:, columns_to_consider].values.reshape(-1, len(columns_to_consider))[inlier_mask, :]
Y = country_data.loc[:, ['Price', 'Demand']].values.reshape(-1, 2)[inlier_mask, :]

# scaling X and Y
# Both scalers are fitted on the inliers only and re-used later for the full data set.
scaler_Y = StandardScaler()
Y_scaled = scaler_Y.fit_transform(Y)
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
# Extra weight on the first feature column after standardisation.
FR_RD_scaler = 2
X_scaled[:, 0] *= FR_RD_scaler

# finding KNN design
# 10-fold grid search over the number of neighbours, scored by R2; the best candidate is
# refitted on all rows passed to fit(). (alternative: scoring='neg_mean_squared_error')
neigh = KNeighborsRegressor()
param_grid = [{'n_neighbors': [5, 15, 52, 168], 'weights': ['uniform']}]
clf = GridSearchCV(estimator=neigh, param_grid=param_grid, scoring='r2', cv=10, refit=True)
clf.fit(X_scaled, Y_scaled)
# loading all data points (not just the inliers) -------------------------------
X = country_data.loc[:, columns_to_consider].values.reshape(-1, len(columns_to_consider))  # comment to ignore outliers in the predictions
# Re-use the scalers fitted on the inliers, then re-apply the first-column weight.
X_scaled = scaler_X.transform(X)
X_scaled[:, 0] *= FR_RD_scaler
Y = country_data.loc[:, ['Price', 'Demand']].values.reshape(-1, 2)  # comment to ignore outliers in the predictions
Y_scaled = scaler_Y.transform(Y)
# wind_data_year_tech = wind_data_year_tech[inliers == 1]  # uncomment to ignore outliers in the predictions

# prediction and error calculation for base model
Y_pred = clf.predict(X_scaled)
Y_pred_org_scale = scaler_Y.inverse_transform(Y_pred)
# Errors are measured on column 0 (the 'Price' column) in the original units.
actual_price = Y[:, 0]
predicted_price = Y_pred_org_scale[:, 0]
mse = mean_squared_error(actual_price, predicted_price)
r2 = r2_score(actual_price, predicted_price)

# Row order by ascending (scaled) first feature; the figures further down use this order.
first_feature = X_scaled[:, 0].flatten()
orders = first_feature.argsort()
X_scaled_sorted = X_scaled[orders]
X_sorted_org_scale = X[orders]
Y_pred_ordered = clf.predict(X_scaled_sorted)
Y_pred_ordered_org_scale = scaler_Y.inverse_transform(Y_pred_ordered)
# prediction and error calculation for reduced residual demand
# Work on a copy so the original feature matrix X stays untouched.
X_reduced = np.array(X, copy=True)
# Subtract the additional renewable generation from the first feature column.
# NOTE(review): if X had an integer dtype, this assignment would cast the result back to
# integers - confirm the input columns are floating point.
X_reduced[:, 0] = X_reduced[:, 0] - res_gen

# Same preprocessing as for the base model: fitted scaler, then first-column weight.
X_reduced_scaled = scaler_X.transform(X_reduced)
X_reduced_scaled[:, 0] *= FR_RD_scaler

Y_pred_reduced = clf.predict(X_reduced_scaled)
# Back to original units, then reordered with `orders` to line up with the sorted base prediction.
Y_pred_reduced_org_scale = scaler_Y.inverse_transform(Y_pred_reduced)[orders]
# calculating differences in Y
# Column 0 of the predictions is 'Price', column 1 is 'Demand' (order of the Y columns).
base_price = Y_pred_ordered_org_scale[:, 0].flatten()
base_quantity = Y_pred_ordered_org_scale[:, 1].flatten()
reduced_price = Y_pred_reduced_org_scale[:, 0].flatten()
reduced_quantity = Y_pred_reduced_org_scale[:, 1].flatten()

# Base prediction minus reduced-residual-demand prediction; the *_pos variants keep
# only the non-negative differences.
price_dif = base_price - reduced_price
price_dif_pos = price_dif[price_dif >= 0]
quantity_dif = base_quantity - reduced_quantity
quantity_dif_pos = quantity_dif[quantity_dif >= 0]

# Quantity weights for the reduced case: base quantity, minus the predicted quantity
# difference, plus the additional renewable generation.
q_final_ordered = base_quantity - quantity_dif + res_gen[orders]

# Quantity-weighted average prices of the two cases and their difference.
# NOTE(review): "FIP"/"FIT" presumably stand for feed-in premium / feed-in tariff - TODO confirm.
price_weighted_FIP = np.average(base_price, weights=base_quantity)
price_weighted_FIT = np.average(reduced_price, weights=q_final_ordered)
price_dif_weighted_mean = price_weighted_FIP - price_weighted_FIT

price_dif_mean = price_dif.mean()
price_dif_pos_mean = price_dif_pos.mean()
# Histogram of the predicted price difference (base minus reduced residual demand).
price_delta_hist = Y_pred_ordered_org_scale[:, 0].flatten() - Y_pred_reduced_org_scale[:, 0].flatten()
fig = go.Figure(data=[go.Histogram(x=price_delta_hist)])
fig.update_layout(
    title="(Ignoring negatives) Average weighted price reduction is " + str(price_dif_weighted_mean)
          + " (not weighted is " + str(price_dif_pos_mean)
          + ") vs. original wa of " + str(price_weighted_FIP)
)
# write_html does not create missing directories; without this the script stopped with
# FileNotFoundError at its first figure whenever 'figures/' did not exist yet.
os.makedirs("figures", exist_ok=True)
fig.write_html("figures/" + country + "_histogram.html")
fig.show()

# Histogram of the predicted quantity difference; the averages are reported in the title.
mean_reduction_in_quantity = quantity_dif.mean()
mean_reduction_in_quantity_only_positive = quantity_dif[quantity_dif >= 0].mean()
quantity_delta_hist = Y_pred_ordered_org_scale[:, 1].flatten() - Y_pred_reduced_org_scale[:, 1].flatten()
fig = go.Figure(data=[go.Histogram(x=quantity_delta_hist)])
fig.update_layout(
    title="Average quantity reduction is " + str(int(mean_reduction_in_quantity))
          + " (" + str(int(mean_reduction_in_quantity_only_positive)) + " ignoring negatives)"
          + " while average additional RES generation is " + str(int(res_gen.mean()))
          + " making total demand increase of " + str(int(-mean_reduction_in_quantity + res_gen.mean()))
          + " (average original Demand is " + str(Y[:, 1].mean()) + ")"
)
fig.show()
# other figures
# One figure with: the raw points, the inliers, the fitted base curve, the sign of the
# price difference, the additional renewable generation, and the reduced-case curve.
sorted_rd = X_sorted_org_scale[:, 0].flatten()
base_price_sorted = Y_pred_ordered_org_scale[:, 0].flatten()
reduced_price_sorted = Y_pred_reduced_org_scale[:, 0].flatten()
best_k = str(clf.best_params_['n_neighbors'])

fig = go.Figure()
fig.add_trace(go.Scatter(x=X[:, 0].flatten(), y=Y[:, 0].flatten(), mode='markers', name='Original'))
fig.add_trace(go.Scatter(x=X[inliers == 1, 0].flatten(), y=Y[inliers == 1, 0].flatten(), mode='markers',
                         name='Inliers'))  # comment to ignore outliers in the predictions
fig.add_trace(go.Scatter(x=sorted_rd, y=base_price_sorted, mode='lines',
                         name='Fit:' + best_k + ' R2: ' + str(clf.best_score_)))
# Sign of the price difference, stretched to +/-10 so it is visible on the price axis.
fig.add_trace(go.Scatter(x=sorted_rd, y=10 * np.sign(base_price_sorted - reduced_price_sorted),
                         mode='lines', name='Error'))
# Additional renewable generation divided by 1000 to fit on the same axis.
fig.add_trace(go.Scatter(x=sorted_rd, y=res_gen[orders] / 1000, mode='lines', name='New RD'))
fig.add_trace(go.Scatter(x=sorted_rd, y=reduced_price_sorted, mode='lines',
                         name='Fit:' + best_k + ' Reduced'))

# Count hours with and without a strictly positive price reduction. Counting explicitly
# (rather than zipping np.unique output into a dict) keeps both keys present even when
# every hour falls on one side, which previously raised KeyError on a[False] / a[True].
price_reduced = (Y_pred_ordered_org_scale[:, 0] - Y_pred_reduced_org_scale[:, 0]) > 0
hours_reduced = int(np.count_nonzero(price_reduced))
hours_not_reduced = int(price_reduced.size) - hours_reduced
a = {False: hours_not_reduced, True: hours_reduced}
total_hours = a[False] + a[True]
share_not_reduced = 100 * a[False] / total_hours if total_hours else 0.0
fig.update_layout(
    title="Number of hours in which the price is not reduced: " + str(a[False]) + ' i.e. % ' + str(
        share_not_reduced)
)
fig.show()
# write_html does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
fig.write_html("figures/" + country + "_KKN_1000.html")
# Distribution of the base-model price error (actual price minus prediction, both in
# the `orders` row order) with a fitted normal curve.
price_error_sorted = Y[:, 0][orders] - Y_pred_ordered_org_scale[:, 0].flatten()
fig = ff.create_distplot([price_error_sorted], ["group_labels"], curve_type='normal')
fig.show()

# Price difference (base minus reduced prediction) shared by the three figures below.
dprice_sorted = Y_pred_ordered_org_scale[:, 0].flatten() - Y_pred_reduced_org_scale[:, 0].flatten()

# Price difference against the first feature column.
# fig = go.Figure(go.Histogram2d(x=X_sorted_org_scale[:, 0].flatten(), y=Y_pred_ordered_org_scale[:, 0].flatten(
# )-Y_pred_reduced_org_scale[:, 0].flatten()))
fig = go.Figure(go.Scatter(x=X_sorted_org_scale[:, 0].flatten(), y=dprice_sorted,
                           mode='markers', name='Original'))
fig.update_layout(title="Residual Demand vs. DPrice")
fig.show()

# Disabled experiment: the same scatter split into equally long "seasons".
# seasons_No = 6
# fig = go.Figure()
# for i in range(seasons_No):
#     start_hour = int((i/seasons_No)*8760)
#     end_hour = int(((i+1)/seasons_No)*8760)
#     fig.add_trace(go.Scatter(x=X[start_hour:end_hour, 0].flatten(),
#                              y=Y_pred_org_scale[start_hour:end_hour, 0].flatten() -
#                              scaler_Y.inverse_transform(Y_pred_reduced)[start_hour:end_hour, 0],
#                              mode='markers', name=str(i)))
# fig.update_layout(title="Residual Demand vs. DPrice in seasons")
# # fig.update_yaxes(range=[-2, 0])
# # fig.update_xaxes(range=[20000, 80000])
# fig.show()

# Price difference against the additional renewable generation (2-D histogram).
# fig = go.Figure(go.Scatter(x=wind_data_year_tech[orders],
#                            y=Y_pred_ordered_org_scale[:, 0].flatten()-Y_pred_reduced_org_scale[:, 0].flatten(),
#                            mode='markers', name='Original'))
fig = go.Figure(go.Histogram2d(x=res_gen[orders], y=dprice_sorted))
fig.update_layout(title="RES vs. DPrice")
fig.show()

# Price difference against the row-wise sum of all feature columns (2-D histogram).
# fig = go.Figure(go.Scatter(x=xx, y=Y_pred_ordered_org_scale[:, 0].flatten()-Y_pred_reduced_org_scale[:, 0].flatten(),
#                            mode='markers', name='Original'))
xx = X_sorted_org_scale.sum(axis=1)
fig = go.Figure(go.Histogram2d(x=xx, y=dprice_sorted))
fig.update_layout(title="Sum neighbours vs. DPrice")
fig.show()
# Dump the grid-search diagnostics, one attribute per print call, in this order:
# per-candidate CV results, refitted best estimator, best CV score, best parameters, scorer.
for report_attr in ('cv_results_', 'best_estimator_', 'best_score_', 'best_params_', 'scorer_'):
    print(getattr(clf, report_attr))