# IMITATE.py
# Forked from Machine-Learning-Auckland/Imitate.
import random
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import numpy as np
import copy
import Bias, Transformations, Confidence, Distributions
from sklearn import svm
# Main Class for Imitate
class IMITATE:
    """Experiment driver for the IMITATE fill-up procedure.

    For every class label of a biased data set the class estimates a
    per-dimension density, fits a distribution to it, samples points from
    the gap between the fitted curve and the estimate, and keeps the
    candidate set with the highest confidence score.

    Parameters
    ----------
    num_hist_int : sequence of int
        Bin counts to compare; each entry is passed to ``fill_up`` in ``run``.
    bias_gen, data_gen
        Forwarded unchanged to ``Bias.BIASme`` whenever a data set is drawn.
    DE : callable
        Called as ``DE(num_bins)``; the result must offer
        ``estimate(d, d_min, d_max)`` and the attributes ``mids``, ``values``
        and ``grid``.
    repeat : int
        Number of freshly drawn data sets ``run`` evaluates.
    trafo : callable
        Called without arguments; the result must offer ``transform`` and
        ``transform_back``.
    model : object, optional
        Model handed to ``Bias.BIASme``. When omitted, a new linear
        ``svm.SVC`` is created for this instance.
    """

    def __init__(self, num_hist_int, bias_gen, data_gen, DE, repeat=1, trafo=Transformations.trafo_keep_axes,
                 model=None):
        self.num_hist_int = num_hist_int
        self.bias_gen = bias_gen
        self.data_gen = data_gen
        self.repeat = repeat
        # Honour the caller's model; build the default per instance so that
        # no estimator object is shared between IMITATE instances.
        self.model = svm.SVC(kernel='linear') if model is None else model
        self.reset(new_data=True)
        self.density_func = Distributions.scaled_norm(ends_zero=True)
        self.DE = DE
        self.trafo = trafo
        self.colors = [plt.cm.viridis(0), 'teal', 'goldenrod', 'deepskyblue']

    def colormap(self, x):
        """Return one plot colour per class label in ``x``.

        Colours are looked up by the label's position in ``self.D.labels``
        and wrap around when there are more labels than colours.
        """
        known_labels = self.D.labels
        palette = self.colors
        return [palette[known_labels.index(xi) % len(palette)] for xi in x]

    def run(self, fill_up_plots=False, point_plots=False, result_plot=False, iterations=10, remove_outliers=True):
        """Compare all bin counts in ``self.num_hist_int`` over ``self.repeat`` data sets.

        Helper for experiments. Results are stored on the instance
        (``err_acc``, ``confidence``, ``num_added`` and their ``*_mean``
        counterparts) and plotted through ``plot_eval``. Index 0 along the
        bin-count axis holds ``self.D.acc_init``; index ``i + 1`` belongs to
        ``self.num_hist_int[i]``.

        Parameters
        ----------
        fill_up_plots, point_plots : bool
            Forwarded to ``fill_up``.
        result_plot : bool
            Call ``plot_result`` after every fill-up.
        iterations : int
            Forwarded to ``fill_up``.
        remove_outliers : bool
            Forwarded to ``fill_up`` as ``RO``.
        """
        num_settings = len(self.num_hist_int) + 1
        err_acc = np.zeros((num_settings, self.repeat))
        confidence = np.zeros((self.D.num_classes, num_settings, self.repeat))
        num_added = np.zeros((self.D.num_classes, num_settings, self.repeat))

        for it in range(self.repeat):
            # The first repetition reuses the data set drawn in __init__.
            if it > 0:
                self.reset(new_data=True)
            err_acc[0][it] += self.D.acc_init

            for i, num_bins in enumerate(self.num_hist_int):
                # Keep the data set, drop the points added for the previous bin count.
                self.reset(new_data=False)
                conf = self.fill_up(num_bins, fill_up_plots=fill_up_plots,
                                    point_plots=point_plots, iterations=iterations, RO=remove_outliers)
                if result_plot:
                    self.plot_result(str(num_bins))

                # evaluate result
                err_acc[i + 1][it] += self.D.accuracyBiased(self.added_points, self.added_labels)
                added = list(self.added_labels)
                for class_idx in range(self.D.num_classes):
                    confidence[class_idx][i + 1][it] += conf[class_idx]
                    num_added[class_idx][i + 1][it] += added.count(self.D.labels[class_idx])

        self.err_acc = err_acc
        self.confidence = confidence
        self.num_added = num_added
        self.err_acc_mean = err_acc.sum(axis=1) / self.repeat
        self.confidence_mean = confidence.sum(axis=2) / self.repeat
        self.num_added_mean = num_added.sum(axis=2) / self.repeat
        self.plot_eval()

    def fill_up(self, num_bins, iterations=10,
                fill_up_plots=False, point_plots=False, RO=True, t=1):
        """Fill up the distribution of every class for one bin count per dimension.

        This is IMITATE as presented in the paper. Accepted points are
        appended to ``self.added_points`` / ``self.added_labels``.

        Parameters
        ----------
        num_bins : int
            Number of bins handed to the density estimator ``self.DE``.
        iterations : int
            Number of candidate point sets generated per class; the one with
            the highest confidence is kept.
        fill_up_plots : bool
            Plot data, fill-up and fitted curve for every dimension.
        point_plots : bool
            Scatter-plot every candidate point set (uses dimensions 0 and 1,
            and additionally 0 and 2 if present).
        RO : bool
            Run ``Transformations.remove_outliers_lof`` on the class data first.
        t : number
            Multiplier applied to the kNN standard deviation passed to
            ``Confidence.confidence_kNN_rnd``.

        Returns
        -------
        list
            Best confidence per label, in the order of ``self.D.labels``;
            0 for labels where no points are to be added.
        """
        label_confidence = []

        # consider every label separately
        for label_idx, label in enumerate(self.D.labels):
            label_color = self.colors[label_idx % len(self.colors)]

            # collect training data
            data = self.D.X_b_train[self.D.Y_b_train == label]

            # remove outliers, rotate data
            if RO:
                data = Transformations.remove_outliers_lof(data)
            trafo = self.trafo()
            data = trafo.transform(data)
            num_dims = len(data[0])

            cdfs_scaled = np.empty((num_dims, num_bins))
            fitted_cdf = np.empty((num_dims, num_bins))
            fitted_ = np.empty((num_dims, num_bins))
            num_fill_up = 0
            DE_list = []

            if fill_up_plots:
                # squeeze=False keeps a 2-D array of axes even for one dimension.
                _, axes = plt.subplots(nrows=1, ncols=num_dims, figsize=(6, 2.5), squeeze=False)
                axes = axes[0]

            # consider every dimension
            for line in range(num_dims):
                # project onto line, determine borders
                d = data[:, line]
                d_min = np.min(d)
                d_max = np.max(d)

                # define Density Estimator here!
                estimator = self.DE(num_bins)
                DE_list.append(estimator)
                estimator.estimate(d, d_min, d_max)

                # estimate distribution
                fitted = self.density_func.fit(estimator.mids, estimator.values, d)
                fitted_[line] = fitted
                fitted_cdf[line] = np.cumsum(fitted)
                fitted_cdf[line] = fitted_cdf[line] / fitted_cdf[line][-1]

                # to be filled up: the differences between the distribution curve and the histogram
                diff = fitted - estimator.values
                diff_total = sum(diff)

                # number of points to add
                num_points_line = (len(d) / sum(estimator.values)) * diff_total
                num_fill_up = max(num_fill_up, num_points_line)

                # probability distribution for the fill-up
                if diff_total == 0:
                    cdfs_scaled[line] = 0
                else:
                    # NOTE(review): a negative diff_total flips the sign of every
                    # bin before the clamp below - confirm that this is intended.
                    diff = np.maximum(diff / diff_total, 0)
                    cdfs_scaled[line] = np.cumsum(diff)
                    cdfs_scaled[line] = (cdfs_scaled[line] / cdfs_scaled[line][-1]) * num_points_line

                if fill_up_plots:
                    self._plot_fill_up(axes[line], estimator, fitted_[line])

            if fill_up_plots:
                axes[-1].legend()
                plt.show()

            # determine the number of added points in total: max over dimensions
            num_fill_up = int(num_fill_up)
            if num_fill_up == 0:
                label_confidence.append(0)
                continue

            # best out of `iterations`: go for the result with the highest confidence
            best_conf = 0
            leftover_points = []
            kNN_rnd_dist, kNN_rnd_std = Confidence.confidence_kNN_train_sized_coeff(data, num_fill_up)

            # adjust cdf (in case there have to be more points added because of other lines);
            # this does not depend on the iteration, so it is computed once per dimension
            sampling_cdfs = []
            for line in range(num_dims):
                distr_scaled = fitted_cdf[line] * max((num_fill_up - cdfs_scaled[line][-1]), 0)
                cdf = cdfs_scaled[line] + distr_scaled
                sampling_cdfs.append(cdf / cdf[-1])  # normalize

            for it in range(iterations):
                points = np.empty((num_fill_up, 0))

                # generate points
                for line in range(num_dims):
                    # generate random values according to the cdf
                    values = np.random.rand(num_fill_up)
                    value_bins = np.searchsorted(sampling_cdfs[line], values)
                    grid = DE_list[line].grid
                    # draw each coordinate uniformly inside the bin it fell into
                    coords = np.array([random.uniform(grid[b], grid[b + 1])
                                       for b in value_bins]).reshape(num_fill_up, 1)
                    points = np.concatenate((points, coords), axis=1)

                # compute the confidence of the result
                if len(points) < 20:
                    conf_a, l_p = 0, [[]]
                else:
                    _, conf_a, l_p = Confidence.confidence_kNN_rnd(points, kNN_rnd_dist, t * kNN_rnd_std)

                # remember the best candidate set
                if conf_a > best_conf:
                    best_conf = conf_a
                    leftover_points = copy.deepcopy(l_p)

                if point_plots:
                    self._plot_candidates(it, data, points, l_p, label_color, 1)
                    if num_dims > 2:
                        self._plot_candidates(it * 100, data, points, l_p, label_color, 2)

            # Transform back the leftover points of the best candidate set
            # and add them to the data set.
            if len(leftover_points) > 0:
                add_me = trafo.transform_back(leftover_points)
                self.added_points = np.concatenate((self.added_points, add_me))
                self.added_labels = np.append(self.added_labels, [label] * len(add_me))
            label_confidence.append(best_conf)

            if point_plots:
                plt.show()

        return label_confidence

    @staticmethod
    def _plot_fill_up(axis, estimator, fitted):
        """Draw estimated density, fill-up and fitted curve of one dimension on ``axis``."""
        bar_width = estimator.mids[1] - estimator.mids[0]
        fill = fitted - estimator.values
        axis.bar(estimator.mids, estimator.values, label='data', color='teal',
                 width=bar_width)
        axis.bar(estimator.mids, np.maximum(fill, 0),
                 bottom=estimator.values, label='fill up', color='goldenrod', width=bar_width,
                 hatch="...", edgecolor="white")
        axis.plot(estimator.mids, fitted, label='fitted', c='mediumvioletred', linewidth=2)
        axis.get_xaxis().set_ticks([])
        axis.get_yaxis().set_ticks([])

    @staticmethod
    def _plot_candidates(fig_num, data, points, kept, color, y_dim):
        """Scatter data, generated points and kept points over dimensions 0 and ``y_dim``."""
        plt.figure(fig_num)
        plt.scatter(data[:, 0], data[:, y_dim], c=color, alpha=0.2, s=3)
        plt.scatter(points[:, 0], points[:, y_dim], c='red', alpha=0.8, s=8)
        # `kept` is the placeholder [[]] when no confidence was computed.
        if len(kept) > 0 and len(kept[0]) > 0:
            plt.scatter(kept[:, 0], kept[:, y_dim], c=color, alpha=0.8, s=8)
        plt.show()

    def reset(self, new_data=False):
        """Clear the added points; with ``new_data`` also draw a new data set.

        The data set is created through ``Bias.BIASme`` from ``self.bias_gen``,
        ``self.data_gen`` and ``self.model``.
        """
        if new_data:
            # Draw a new data set
            self.D = Bias.BIASme(self.bias_gen, self.data_gen, model=self.model)
        self.added_points = np.empty((0, self.D.dims))
        self.added_labels = np.empty(0)

    def plot_result(self, title=""):
        """Scatter the biased training data and the added points (dimensions 0 and 1).

        ``title`` is accepted for compatibility and currently not drawn.
        """
        plt.scatter(self.D.X_b_train[:, 0], self.D.X_b_train[:, 1], c=self.colormap(self.D.Y_b_train), alpha=0.2, s=3)
        # colormap expects labels, so the added labels are passed directly.
        plt.scatter(self.added_points[:, 0], self.added_points[:, 1],
                    c=self.colormap(self.added_labels),
                    alpha=0.8, s=8)
        plt.show()

    def plot_eval(self):
        """Plot accuracy, confidence and number of added points per bin count.

        Reads the result attributes written by ``run``. Line styles and
        markers wrap around when there are more classes than styles.
        """
        fig, acc = plt.subplots(nrows=1, ncols=1)
        x = np.concatenate(([0], self.num_hist_int))
        conf = acc.twinx()
        added = acc.twinx()

        acc.set_xticks(range(len(x)))
        acc.set_xticklabels(x)
        acc.set_xlabel("# Bins")
        acc.set_ylabel("Accuracy_Test")
        conf.set_ylabel("Confidence")
        added.set_ylabel("Added Points")

        color1 = plt.cm.viridis(0)
        color2 = plt.cm.viridis(0.5)
        color3 = plt.cm.viridis(.9)
        markers = ['o', '^', '*', 'x']
        line_styles = ['-', '--', '-.', ':']

        acc_line, = acc.plot(self.err_acc_mean, color=color1, label=acc.get_ylabel())
        for i in range(len(x)):
            acc.scatter([i] * self.repeat, self.err_acc[i], color=color1, alpha=0.2)

        # Proxy handles so that the legend shows one entry per quantity and per class.
        conf_line = mlines.Line2D([], [], color=color2, label=conf.get_ylabel())
        added_line = mlines.Line2D([], [], color=color3, label=added.get_ylabel())
        lns = [acc_line, conf_line, added_line]
        for i in range(self.D.num_classes):
            style = line_styles[i % len(line_styles)]
            marker = markers[i % len(markers)]
            conf.plot(self.confidence_mean[i], color=color2, linestyle=style, marker=marker)
            added.plot(self.num_added_mean[i], color=color3, linestyle=style, marker=marker)
            lns.append(mlines.Line2D([], [], color='black', linestyle=style, marker=marker,
                                     label=str(self.D.labels[i])))
        acc.legend(handles=lns, loc='best')

        # Move the third y-axis outward so that it does not overlap the second one.
        added.spines['right'].set_position(('outward', 60))
        acc.yaxis.label.set_color(acc_line.get_color())
        conf.yaxis.label.set_color(conf_line.get_color())
        added.yaxis.label.set_color(added_line.get_color())
        plt.show()