# Test script for reliably calibrated isotonic regression (RCIR).
# This script contains tests for three datasets.
# In the basic setting, we fit the isotonic regression model and the 'best'
# RCIR model selected by validation-set performance, and compare their
# testing-set performance to a Naeini model trained on both the training and
# validation sets. This way both models have access to the same data. RCIR
# could use the data more efficiently via cross-validation, but that is left
# out here.
# NOTE: the plain IR model is not directly comparable in this setting; since it
# requires no cross-validation, it should be trained on the same data as the
# Naeini model.
# The script can also be used to find the RCIR model that corresponds to the 'd'
# value of a Naeini model. In that case both models are trained on the same data,
# as RCIR does not need a validation set in that context. Performance is then
# compared on a separate testing set.
import isotonic
from sklearn.isotonic import IsotonicRegression
# from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from scipy.interpolate import interp1d
from oct2py import octave
from sklearn.metrics import roc_auc_score
# Enable octave to find Naeini's functions!
# (download from https://github.com/pakdaman/calibration)
octave.eval("addpath('./calibration/BBQ/')", verbose=False)
# Set test parameters:
dataset = int(input("Select dataset to run experiment on (1, 2, or 3): "))
n_iterations = int(input("Set number of iterations (30 used in paper): "))
# metric = input("Select metric ('mse' or 'auc_roc'): ") # Perhaps allow only auc-roc and exclude mse?
metric = 'auc_roc' # Can also be set to 'mse' to run the algorithm with mse-based bin merges.
# reshuffle = input("Shuffle data? (y/n): ") # Should be shuffled at least once. Perhaps remove option.
# It seems that there is no convenient way of estimating the credible intervals, let alone the
# maximum credible intervals, for the Naeini procedure. Hence the 'Naeini vs. RCIR with d set
# by Naeini vs. IR' comparison is pointless.
# model_comparison = input("Naeini vs. RCIR-CV ('Naeini vs. RCIR-CV') or Naeini vs. RCIR? ")
model_comparison = 'Naeini vs. RCIR-CV'  # If set to anything else, the script runs Naeini vs. RCIR with d set by the Naeini model.
# model_comparison = 'Naeini vs. RCIR with d set by Naeini vs. IR'
naeini_metrics = []
rcir_better_than_naeini_metrics = []
rcir_cv_best_model_metrics = []
s_isotonic_regression_metrics = []
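# Threshold for the maximum credible interval width ('d'): the first model during bin merging
# whose widest credible interval falls below this value has its testing set metrics recorded
# in t_isotonic_metrics.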
threshold = 0.10
t_isotonic_metrics = []
# Load dataset:
if dataset == 1:
# Read data for test 1:
test_description = "test1"
data_class = isotonic.load_pickle("./data/dataset_1_class.pickle")
data_scores = isotonic.load_pickle("./data/dataset_1_scores.pickle")
elif dataset == 2:
# Read data for test 2:
test_description = "test2"
data_class = isotonic.load_pickle("./data/dataset_2_class.pickle")
data_scores = isotonic.load_pickle("./data/dataset_2_scores.pickle")
elif dataset == 3:
test_description = "test3"
data_class = isotonic.load_pickle("./data/dataset_3_class.pickle")
data_scores = isotonic.load_pickle("./data/dataset_3_scores.pickle")
elif dataset == 4: # Hidden experiment. Does not really provide anything interesting.
test_description = "test4"
data_class = np.random.binomial(1, .5, 30000)
data_scores = np.random.uniform(low=0, high=1, size=30000)
elif dataset == 5: # Test mode
tmp_class = isotonic.load_pickle("./data/dataset_1_class.pickle")
tmp_scores = isotonic.load_pickle("./data/dataset_1_scores.pickle")
test_description = "test1"
data_class = tmp_class[:1000]
data_scores = tmp_scores[:1000]
else:
print("Not a valid dataset selection.")
import sys
sys.exit()
print("Dataset with " + str(data_class.shape[0]) + " samples loaded.")
if(metric == 'mse' or metric == 'auc_roc'):
pass # All in order.
else:
print("Not a valid metric selection.")
import sys
sys.exit()
for j in range(n_iterations):
# Shuffle samples:
n_rows = data_scores.shape[0]
idx = np.random.permutation(range(n_rows))
data_class = data_class[idx]
data_scores = data_scores[idx]
    # If we compare Naeini's model to RCIR with a validation set, Naeini's model should be
    # allowed to use the data RCIR reserves for validation as additional training data, since
    # Naeini's model needs no validation sample whereas RCIR with CV does.
    # If we compare Naeini's model to RCIR with d set by Naeini's model, then RCIR does not
    # need a validation set either, and the same data should be used for training both models.
# Data for RCIR with CV, or RCIR with d provided by Naeini's model
test_class = data_class[:n_rows * 1 // 4]
training_class = data_class[n_rows // 4: 3 * n_rows // 4]
validation_class = data_class[3 * n_rows // 4:]
test_scores = data_scores[:n_rows * 1 // 4]
training_scores = data_scores[n_rows // 4: 3 * n_rows // 4]
validation_scores = data_scores[3 * n_rows // 4:]
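    # Split: the first quarter of the shuffled data is the testing set, the middle half the
    # training set, and the last quarter the validation set.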
    # Give all models exactly the same training data (the validation set is only used for RCIR's model selection).
naeini_training_class = training_class
naeini_training_scores = training_scores
# if(model_comparison == 'Naeini vs. RCIR-CV'):
# # Use training set + validation set for Naeini model as training set.
# naeini_training_class = data_class[n_rows * 1 // 4:]
# naeini_training_scores = data_scores[n_rows * 1 // 4:]
# else:
# # Naeini vs. RCIR (d set by Naeini)
# naeini_training_scores = training_scores
# naeini_training_class = training_class
    # Standard isotonic regression. Needs the same data as the Naeini model to produce fair metrics.
s_isotonic_regression = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
s_isotonic_regression.fit(X=training_scores, y=training_class)
s_metrics = isotonic.estimate_performance(s_isotonic_regression, test_class, test_scores)
s_training_prob = s_isotonic_regression.predict(training_scores)
s_max_p = max(s_training_prob)
s_data = np.unique(s_training_prob, return_counts=True)
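    # isotonic.credible_interval(k, n) returns a dict with 'p_min' and 'p_max' for a bin with
    # k positive samples out of n. Presumably these are the bounds of a Bayesian credible
    # interval for the bin's probability; a rough sketch of the idea (an assumption, not the
    # project's actual implementation), using a uniform Beta(1, 1) prior, would be:
    #     from scipy.stats import beta
    #     p_min, p_max = beta.ppf([0.025, 0.975], k + 1, n - k + 1)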
s_credible_intervals = [isotonic.credible_interval(np.round(p * n), n) for
(p, n) in zip(s_data[0], s_data[1])]
s_width_of_intervals = np.array([row['p_max'] - row['p_min'] for row in s_credible_intervals])
s_max_width = max(s_width_of_intervals)
s_bins = len(np.unique(s_training_prob))
s_test_prob = s_isotonic_regression.predict(test_scores)
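    # isotonic.maximum_calibration_error and isotonic.expected_calibration_error presumably
    # follow the standard binned definitions (Naeini et al.): ECE is the count-weighted average
    # of |observed fraction of positives - mean predicted probability| over the bins, and MCE
    # is the largest such deviation.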
s_mce = isotonic.maximum_calibration_error(test_class, s_test_prob)
s_ece = isotonic.expected_calibration_error(test_class, s_test_prob)
s_acc = isotonic.accuracy(test_class, s_test_prob)
s_isotonic_regression_metrics.append({'mse': s_metrics['mse'],
'auc_roc': s_metrics['auc_roc'],
'max_width': s_max_width,
'bins': s_bins,
'max_p': s_max_p,
'mce': s_mce,
'ece': s_ece,
'acc': s_acc})
    # Train a separate isotonic regression model for RCIR. It needs a separate validation
    # set and must thus be kept distinct from the s_isotonic_regression above.
    # Usually I would use y_min=.0001 and y_max=.99; probabilities of exactly 0 or 1 are not
    # plausible and can cause problems downstream.
isotonic_regression_model = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
# Fit isotonic regression model
isotonic_regression_model.fit(X=training_scores, y=training_class)
training_probabilities = isotonic_regression_model.predict(T=training_scores)
print("Isotonic regression model trained.")
    # To evaluate goodness of fit, we predict the probabilities for the test scores
    # using our isotonic regression model (stored internally as an interpolation model) and
    # calculate test statistics for it; we use MSE and AUC-ROC.
    # The commented line below would correct for bin widths (i.e. not use interpolation
    # between bins). It produces equivalent results for training data but slightly
    # different results for testing data.
    # interpolation_model = isotonic.modify_model(model_tmp)  # This is not how IR usually works.
    # IsotonicRegression uses an interpolation model at its core; the 'x' and 'y' arrays define
    # this model completely. In some rare cases a bin ends up having zero width, and
    # IsotonicRegression then produces x and y items for the interpolation model that don't
    # come in pairs. The rest of our code assumes the pairs exist, hence we check and correct
    # if necessary.
tmp_x = isotonic_regression_model.f_.x
tmp_y = isotonic_regression_model.f_.y
tmp = isotonic.correct_for_point_bins(tmp_x, tmp_y)
x = tmp['x']
y = tmp['y']
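    # After this correction, each bin (one unique predicted probability) is assumed to be
    # represented by a consecutive pair of entries in x and y with equal y values; the
    # bin-merging code further below relies on this pairing when deleting and updating entries.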
    # The following arrays contain all information defining the IR transformation: data[0] holds
    # the unique predicted probabilities (one per bin), data[1] the corresponding sample counts.
data = np.unique(training_probabilities, return_counts=True)
credible_intervals = [isotonic.credible_interval(np.round(p * n), n) for (p, n) in zip(data[0], data[1])]
width_of_intervals = np.array([row['p_max'] - row['p_min'] for row in credible_intervals])
# print "Test- and validation set performance, (standard) isotonic regression" # All bins.
isotonic_regression_performance = isotonic.estimate_performance(isotonic_regression_model, test_class, test_scores)
# print(isotonic_regression_performance)
# print(isotonic.estimate_performance(interpolation_model, training_class, training_scores))
# print(isotonic.estimate_performance(isotonic_regression_model, validation_class, validation_scores))
isotonic.plot_intervals(data, credible_intervals,
file_name="./" + test_description + "/all_bins.png",
label="Credible intervals")
# Create reliability diagram:
isotonic.plot_reliability_diagram(isotonic_regression_model, test_scores, test_class,
file_name="./" + test_description + "/reliability_diagram" + ".png",
label="Reliability diagram")
# Naeini's model
    # Comparison to Naeini's model (a MATLAB model, run via Octave).
    print("Training Naeini model. This might take a while (~20 minutes on a laptop).")
octave.push('training_scores', naeini_training_scores, verbose=False)
octave.push('training_class', naeini_training_class, verbose=False)
octave.eval("options.N0 = 2", verbose=False)
octave.eval("BBQ_model = build(training_scores', training_class', options)", verbose=False)
    # In the following, '1' indicates model averaging, as done in the paper by Naeini et al.
octave.eval("training_bbq_prob = predict(BBQ_model, training_scores, 1)", verbose=False)
training_bbq_prob = octave.pull("training_bbq_prob", verbose=False)
training_bbq_prob = np.array([item[0] for item in training_bbq_prob])
octave.push('test_scores', test_scores, verbose=False)
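    # Note: unlike the training-set call above, the third argument below is 0, i.e. presumably
    # without model averaging.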
octave.eval("test_bbq_prob = predict(BBQ_model, test_scores, 0)", verbose=False)
test_bbq_prob = octave.pull("test_bbq_prob", verbose=False)
test_bbq_prob = np.array([item[0] for item in test_bbq_prob])
naeini_bins = len(np.unique(training_bbq_prob))
naeini_mse = sum((test_bbq_prob - test_class) ** 2) / len(test_bbq_prob)
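    # naeini_mse is the mean squared error (Brier score) of the BBQ predictions on the testing set.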
naeini_auc_roc = roc_auc_score(test_class, test_bbq_prob)
samples_per_bin = int(len(naeini_training_class) / naeini_bins)
naeini_p = max(training_bbq_prob) # Highest predicted value.
    # Estimate credible intervals. np.unique() returns a tuple of two arrays: the unique
    # probabilities and their counts. Their product corresponds to 'k', whereas the counts
    # correspond to 'n'.
    naeini_data = np.unique(training_bbq_prob, return_counts=True)
    # The BBQ predictions occasionally contain small negative values (this turned out to be
    # an error on our side and does not affect the results); clip them to zero here.
    naeini_corrected_data = [[0 if item < 0 else item for item in naeini_data[0]], naeini_data[1]]
naeini_credible_intervals = [isotonic.credible_interval(np.round(p * n), n) for
(p, n) in zip(naeini_corrected_data[0], naeini_corrected_data[1])]
naeini_width_of_intervals = np.array([row['p_max'] - row['p_min'] for row in naeini_credible_intervals])
naeini_max_width = max(naeini_width_of_intervals)
# naeini_mce = isotonic.maximum_calibration_error(test_bbq_prob, test_class)
# naeini_ece = isotonic.expected_calibration_error(test_bbq_prob, test_class)
naeini_mce = isotonic.maximum_calibration_error(test_class, test_bbq_prob)
naeini_ece = isotonic.expected_calibration_error(test_class, test_bbq_prob)
naeini_acc = isotonic.accuracy(test_class, test_bbq_prob)
naeini_metrics.append({'mse': naeini_mse, 'auc_roc': naeini_auc_roc, 'max_width': naeini_max_width,
'bins': naeini_bins, 'samples_per_bin': samples_per_bin, 'max_p': naeini_p,
'mce': naeini_mce, 'ece': naeini_ece, 'acc': naeini_acc})
naeini_flag = True # Flag for tracking whether the new RCIR model is the first one beating the naeini model.
print("Naeini model trained.")
print(naeini_metrics[j])
# RCIR experiment
i = 0 # Tracker for number of bin merges.
best_score = {'mse': 1.0, 'auc_roc': 0.0} # Set to worst possible, used in comparison later.
isotonic_regression_max_width = max(width_of_intervals) # Store largest 'd'
first_model_below_threshold = True # Reset flag.
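    # Each iteration of the loop below finds the bin with the widest credible interval and merges
    # it into a neighboring bin. The merged bin's probability is the count-weighted average of the
    # two bins: new_prob = (p1 * n1 + p2 * n2) / (n1 + n2). For example, merging a bin with p=0.2,
    # n=100 into a neighbor with p=0.3, n=300 gives (0.2 * 100 + 0.3 * 300) / 400 = 0.275.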
    while(len(width_of_intervals) > 2):  # More than two bins remain, so bins can still be merged.
i += 1 # Add one to counter.
max_interval = max(width_of_intervals)
# print "Max width: " + str(max_interval)
drop_idx = width_of_intervals.tolist().index(max_interval)
        if drop_idx == 0:  # Exception handling: the first bin has the largest credible interval.
# remove first two elements in x and y, update new first elements,
# remove first element from width_of_intervals
# and remove first elements from data[0] and data[1]
y = np.delete(y, [0, 1]) # Drop first and second items.
x = np.delete(x, [0, 1])
new_prob = (data[0][0] * data[1][0] + data[0][1] * data[1][1]) / (data[1][0] + data[1][1])
y[0] = new_prob
try: # y[1] doesn't exist if this is also the last bin.
y[1] = new_prob
except IndexError:
pass
# Leave x as is. data and width_of_intervals handled at end of loop.
int_mod = interp1d(x, y, bounds_error=False)
int_mod._fill_value_below = 0
int_mod._fill_value_above = 1
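            # Note: _fill_value_below and _fill_value_above are private interp1d attributes; the
            # public equivalent (SciPy >= 0.17) would be
            # interp1d(x, y, bounds_error=False, fill_value=(0, 1)).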
# print("Test-, and training performance, " + str(i) + " bins removed.d")
# print(isotonic.estimate_performance(int_mod, test_class, test_scores))
# print(isotonic.estimate_performance(int_mod, training_class, training_scores))
tmp = isotonic.credible_interval(k=round(data[0][0] * data[1][0] + data[0][1] * data[1][1]),
n=(data[1][0] + data[1][1]))
width_of_intervals[0] = tmp['p_max'] - tmp['p_min']
credible_intervals.pop(drop_idx) # Remove line from credible intervals
credible_intervals[0] = tmp
data[0][1] = new_prob
data[1][1] = data[1][0] + data[1][1]
validation_score = isotonic.estimate_performance(int_mod, validation_class, validation_scores)
# print(isotonic.estimate_performance(int_mod, test_class, test_scores))
# print(isotonic.estimate_performance(int_mod, training_class, training_scores))
        elif drop_idx == len(width_of_intervals) - 1:
            # More exception handling: the last bin has the largest credible interval.
            # Remove the x and y entries belonging to the last bin:
two_y_end = False
if(y[-1] == y[-2]):
two_y_end = True
y = np.delete(y, drop_idx * 2)
y = np.delete(y, drop_idx * 2 - 1)
x = np.delete(x, drop_idx * 2)
x = np.delete(x, drop_idx * 2 - 1)
new_prob = (data[0][-1] * data[1][-1] + data[0][-2] * data[1][-2]) / (data[1][-1] + data[1][-2])
# Hmm, there might be two bins for this y
if(two_y_end):
y[-2] = new_prob
y[-1] = new_prob
tmp = isotonic.credible_interval(k=round(data[0][-1] * data[1][-1] + data[0][-2] * data[1][-2]),
n=(data[1][-1] + data[1][-2]))
width_of_intervals[-2] = tmp['p_max'] - tmp['p_min']
credible_intervals.pop(-1) # Drop last.
credible_intervals[-1] = tmp
data[0][-2] = new_prob
data[1][-2] = data[1][-1] + data[1][-2]
            # if((drop_idx != len(width_of_intervals)) or (drop_idx != 0)):  # Main handling
int_mod = interp1d(x, y, bounds_error=False)
int_mod._fill_value_below = 0
int_mod._fill_value_above = 1
# print("Testing set performance, " + str(i) + " bins removed.c")
# print(isotonic.estimate_performance(int_mod, test_class, test_scores))
validation_score = isotonic.estimate_performance(int_mod, validation_class, validation_scores)
else:
# Main method, i.e. when we are not dealing with the first or last bin.
# y contains the probability to be dropped twice
y = np.delete(y, drop_idx * 2 + 1)
y = np.delete(y, drop_idx * 2)
# Test lower:
x_tmp_lower = np.array(x) # Create NEW array!!
x_tmp_lower = np.delete(x_tmp_lower, drop_idx * 2) # Lower boundary of *this bin
x_tmp_lower = np.delete(x_tmp_lower, drop_idx * 2 - 1) # Upper boundary of smaller bin
y_tmp_lower = np.array(y) # Creates _new_ array!!!
new_prob_lower = ((data[1][drop_idx] * data[0][drop_idx] +
data[1][drop_idx - 1] * data[0][drop_idx - 1]) /
(data[1][drop_idx] + data[1][drop_idx - 1]))
y_tmp_lower[drop_idx * 2 - 1] = new_prob_lower # New value
y_tmp_lower[drop_idx * 2 - 2] = new_prob_lower # Same value
# Test upper:
x_tmp_upper = np.array(x)
x_tmp_upper = np.delete(x, drop_idx * 2 + 2) # Lower boundary of larger bin
x_tmp_upper = np.delete(x_tmp_upper, drop_idx * 2 + 1) # Upper boundary of *this bin
y_tmp_upper = np.array(y)
new_prob_upper = ((data[1][drop_idx] * data[0][drop_idx] +
data[1][drop_idx + 1] * data[0][drop_idx + 1]) /
(data[1][drop_idx] + data[1][drop_idx + 1]))
y_tmp_upper[drop_idx * 2] = new_prob_upper # New value, bin guaranteed to exist.
try: # Bin doesn't exist if it is last
y_tmp_upper[drop_idx * 2 + 1] = new_prob_upper # New value
except IndexError:
pass
# Now, which bin to add it to?
# Compare the two:
int_mod_lower = interp1d(x_tmp_lower, y_tmp_lower, bounds_error=False)
int_mod_upper = interp1d(x_tmp_upper, y_tmp_upper, bounds_error=False)
int_mod_lower._fill_value_below = 0
int_mod_lower._fill_value_above = 1
int_mod_upper._fill_value_below = 0
int_mod_upper._fill_value_above = 1
            # Left (smaller) neighbor bin is at drop_idx - 1, right (larger) neighbor at drop_idx + 1.
score_lower = isotonic.estimate_performance(int_mod_lower, validation_class, validation_scores)
score_upper = isotonic.estimate_performance(int_mod_upper, validation_class, validation_scores)
            if((score_lower['auc_roc'] > score_upper['auc_roc'] and metric == 'auc_roc') or
               (score_lower['mse'] < score_upper['mse'] and metric == 'mse')):  # Keep the merge direction that scores better on the chosen metric.
x = x_tmp_lower
y = y_tmp_lower
data[1][drop_idx - 1] = data[1][drop_idx] + data[1][drop_idx - 1]
data[0][drop_idx - 1] = new_prob_lower
tmp = isotonic.credible_interval(k=round(data[0][drop_idx - 1] * data[1][drop_idx - 1]),
n=(data[1][drop_idx - 1]))
width_of_intervals[drop_idx - 1] = tmp['p_max'] - tmp['p_min']
credible_intervals.pop(drop_idx)
credible_intervals[drop_idx - 1] = tmp
int_mod = int_mod_lower
# print("Testing set performance, " + str(i) + " bins removed.b")
# print(isotonic.estimate_performance(int_mod_lower, test_class, test_scores))
else:
x = x_tmp_upper
y = y_tmp_upper
data[1][drop_idx + 1] = data[1][drop_idx] + data[1][drop_idx + 1]
data[0][drop_idx + 1] = new_prob_upper
tmp = isotonic.credible_interval(k=round(data[0][drop_idx + 1] * data[1][drop_idx + 1]),
n=(data[1][drop_idx + 1]))
width_of_intervals[drop_idx + 1] = tmp['p_max'] - tmp['p_min']
credible_intervals[drop_idx + 1] = tmp
credible_intervals.pop(drop_idx)
int_mod = int_mod_upper
# print("Testing set performance, " + str(i) + " bins removed.a")
# print(isotonic.estimate_performance(int_mod_upper, test_class, test_scores))
# print(isotonic.estimate_performance(int_mod_upper, training_class, training_scores))
validation_score = isotonic.estimate_performance(int_mod, validation_class, validation_scores)
width_of_intervals = np.delete(width_of_intervals, drop_idx)
# Drop samples from data[1][drop_idx]. The samples are previously added to adequate new bin.
data = (np.delete(data[0], drop_idx), np.delete(data[1], drop_idx))
isotonic.plot_intervals(data, credible_intervals,
file_name="./" + test_description + "/" + str(i) + "_bins_dropped.png",
label="Credible intervals, d=" +
str(round(max(width_of_intervals), 3)))
isotonic.plot_reliability_diagram(int_mod, test_scores, test_class,
label="Reliability diagram, " + str(i) + " bins dropped",
file_name="./" + test_description + "/reliability_diagram" + str(i) + ".png")
# Store best model:
# if(validation_score['auc_roc'] > best_score['auc_roc']):
        if((validation_score['auc_roc'] > best_score['auc_roc'] and metric == 'auc_roc') or
           (validation_score['mse'] < best_score['mse'] and metric == 'mse')):  # Keep the model that scores better on the chosen metric.
# This is the validation set control
best_model = int_mod # This is the RCIR model (technically an interpolation model)
best_model_bin_merges = i # Number of bin merges
best_model_max_width = max(width_of_intervals) # Maximum width of credible intervals
best_model_bins_left = len(width_of_intervals) # Number of bins left
best_model_max_p = max(isotonic.predict(best_model, training_scores))
best_score = validation_score # Note!! Best score on VALIDATION SET, not testing set.
best_model_test_prob = isotonic.predict(best_model, test_scores)
best_model_mce = isotonic.maximum_calibration_error(test_class, best_model_test_prob)
best_model_ece = isotonic.expected_calibration_error(test_class, best_model_test_prob)
best_model_acc = isotonic.accuracy(test_class, best_model_test_prob)
# If the current model is the first one to have smaller maximum credible interval (max_d)
# than the Naeini-model, then estimate and store metrics for that model.
        # Note: only RCIR-CV makes sense as a comparison for the Naeini model, as the Naeini
        # model does not really provide a valid number for 'd'.
# if(model_comparison != 'Naeini vs. RCIR-CV'):
# max_width = max(width_of_intervals)
# if(max_width < naeini_max_width and naeini_flag):
# naeini_flag = False # set flag to not enter this block again, store values.
# # Testing set metrics of _current_ model:
# test_score = isotonic.estimate_performance(int_mod, test_class, test_scores)
# max_p = max(isotonic.predict(int_mod, training_scores)) # Maximum value of prediction
# bins_left = len(width_of_intervals)
# bin_merges = i
# rcir_better_than_naeini_metrics.append({'mse': test_score['mse'],
# 'auc_roc': test_score['auc_roc'],
# 'max_width': max_width,
# 'bins': bins_left,
# 'bin_merges': bin_merges,
# 'max_p': max_p})
if(max(width_of_intervals) < threshold and first_model_below_threshold): # I.e. if this is the first time we fall below the threshold
first_model_below_threshold = False # Set flag.
max_width = max(width_of_intervals)
# Testing set metrics of _current_ model:
test_score = isotonic.estimate_performance(int_mod, test_class, test_scores)
t_test_prob = isotonic.predict(int_mod, test_scores)
t_max_p = max(isotonic.predict(int_mod, training_scores)) # Maximum value of prediction
t_bins_left = len(width_of_intervals)
t_bin_merges = i
t_mce = isotonic.maximum_calibration_error(test_class, t_test_prob)
t_ece = isotonic.expected_calibration_error(test_class, t_test_prob)
t_acc = isotonic.accuracy(test_class, t_test_prob)
t_isotonic_metrics.append({'mse': test_score['mse'],
'auc_roc': test_score['auc_roc'],
'max_width': max_width,
'bins': t_bins_left,
'bin_merges': t_bin_merges,
'max_p': t_max_p,
'mce': t_mce,
'ece': t_ece,
'acc': t_acc})
if(model_comparison == 'Naeini vs. RCIR-CV'):
# At this point, the while-loop has finished. I.e. data and width_of_intervals correspond to a model with
# _two_ bins remaining.
best_model_performance = isotonic.estimate_performance(best_model, test_class, test_scores)
# All values below for the best rcir-model unless explicitly stated otherwise ('_ir').
rcir_cv_best_model_metrics.append({'mse': best_model_performance['mse'],
'auc_roc': best_model_performance['auc_roc'],
'max_width': best_model_max_width,
'bins': best_model_bins_left,
'bin_merges': best_model_bin_merges,
'max_p': best_model_max_p,
'mse_ir': isotonic_regression_performance['mse'],
'auc_roc_ir': isotonic_regression_performance['auc_roc'],
'max_width_ir': isotonic_regression_max_width,
'mce': best_model_mce,
'ece': best_model_ece,
'acc': best_model_acc})
print("Best model testing set scores")
print(rcir_cv_best_model_metrics[j])
print("Dropped bins " + str(best_model_bin_merges))
if(model_comparison != 'Naeini vs. RCIR-CV'):
print("First model to reach Naeini's d: ")
print(rcir_better_than_naeini_metrics[j])
print("\n") # Newline to separate runs