# -*- coding: utf-8 -*-
"""
@author: dgarreau
In this script, we show how Tabular LIME ignores unused coordinates for a
non-linear f. We take the example of a kernel ridge regressor trained on a
subset of the coordinates of the Wine dataset. This is Figure 15 in the paper.
"""
import numpy as np
import matplotlib.pyplot as plt
import lime
import lime.lime_tabular
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process.kernels import RBF
from utils.aux_functions import get_training_data_stats
from utils.aux_functions import format_coefs
from utils.plot_functions import plot_whisker_boxes
if __name__ == "__main__":
# number of experiments
n_exp = 100
# number of perturbed samples
n_samples = 1000
# for reproducibility
np.random.seed(1)
# non linear model, train some kernel ridge regressor on Wine
wine = load_wine()
X_orig = wine['data']
# removing the last n_removed coordinate
n_removed = 6
X = X_orig[:,:-n_removed]
Y = wine['target']
# scale evrything
X_scaled = preprocessing.scale(X)
X_orig_scaled = preprocessing.scale(X_orig)
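    # NOTE: preprocessing.scale standardizes each column independently, so the
    # first dim - n_removed columns of X_orig_scaled coincide with X_scaled;
    # the model trained on X_scaled can safely be fed slices of X_orig_scaled.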

    # train test split
    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y,
                                                        test_size=0.2,
                                                        random_state=5)

    # let us train the model
    kernel = RBF(length_scale=10)
    krr_model = KernelRidge(kernel=kernel, alpha=1)
    krr_model.fit(X_train, Y_train)

    # and restrict to the first coordinates
    def my_model(array):
        return krr_model.predict(array[:,:-n_removed])
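    # NOTE: LIME perturbs all 13 coordinates of the instance, but my_model
    # only looks at the first 13 - n_removed of them; the paper predicts that
    # the interpretable coefficients of the ignored coordinates concentrate
    # around zero.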

    # dimension of the ambient space
    dim = X_orig.shape[1]

    # the example to explain
    xi = X_orig_scaled[0,:]

    # bandwidth parameter
    nu = 5

    # number of bins along each dimension
    p = 4

    # getting the stats
    my_stats = get_training_data_stats(X_orig_scaled, p)
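    # NOTE: providing training_data_stats makes the explainer use these
    # precomputed statistics (means, stds, and the p-bin discretization)
    # rather than recomputing them from the training data.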

    # creating the explainer
    explainer = lime.lime_tabular.LimeTabularExplainer(X_orig_scaled,
                                                       mode='regression',
                                                       feature_selection='none',
                                                       training_data_stats=my_stats,
                                                       kernel_width=nu)
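    # NOTE: feature_selection='none' forces LIME to report a coefficient for
    # every feature, so the unused coordinates appear explicitly in the
    # explanation instead of being filtered out beforehand.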

    # main loop
    beta_emp_store = np.zeros((n_exp, dim + 1))
    for i_exp in range(n_exp):

        if np.mod(i_exp + 1, 10) == 0:
            s_exp = "Experiment {} / {} is running...".format(i_exp + 1, n_exp)
            print(s_exp)

        # getting the explanation
        exp = explainer.explain_instance(xi,
                                         my_model,
                                         num_samples=n_samples)

        # getting the coefficients of the local model
        beta_emp_store[i_exp,:] = format_coefs(exp)
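    # NOTE: we assume format_coefs (from utils) returns dim + 1 values, the
    # intercept followed by the dim linear coefficients, matching the shape of
    # beta_emp_store.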

    ###########################################################################

    # getting nicer feature names
    #my_features = wine['feature_names']
    my_features = ['alcohol','Malic acid','ash','alcalinity','Mg',
                   'phenols','flavanoids','non-flavanoid phenol',
                   'proanthocyanins','color intensity','hue',
                   'part of diluted wines','proline']

    # plotting the result
    fig, ax = plt.subplots(figsize=(15,10))
    plot_whisker_boxes(beta_emp_store,
                       ax,
                       rotate=True,
                       feature_names=my_features)
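
    # make sure the output directory exists (savefig does not create it)
    os.makedirs("results", exist_ok=True)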
s_name = "results/ignore_non_linear_default_weights"
fig.savefig(s_name + '.pdf',format='pdf',bbox_inches = 'tight',pad_inches = 0)
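
    # optional sanity check, not part of Figure 15: if LIME indeed ignores the
    # unused coordinates, the coefficients of the last n_removed features
    # should be close to zero on average (we again assume the intercept sits
    # in column 0 of beta_emp_store).
    mean_coefs = np.mean(np.abs(beta_emp_store), axis=0)
    print("mean |coef|, used features:   {:.4f}".format(
        np.mean(mean_coefs[1:dim - n_removed + 1])))
    print("mean |coef|, unused features: {:.4f}".format(
        np.mean(mean_coefs[dim - n_removed + 1:])))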