# hands-on.py
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np
import sklearn.linear_model as skl
import sklearn.preprocessing as skp
from functools import reduce  # reduce is no longer a builtin in Python 3
# Our own plotting functions, somewhat specific to this hands-on session.
import plots
# Read the data (edit the path if necessary).
# We use the pandas DataFrame data structure. It is a 2-dimensional
# labeled data structure with columns of potentially
# different types.
# See more info: http://pandas.pydata.org/pandas-docs/dev/dsintro.html
# The data file has no row indices/names, hence index_col=False.
df = pd.read_csv("wine_quality_data.csv", index_col=False)
# What the data looks like... the first rows.
print(df.head())
# What columns/variables are there?
print(df.columns)
# Variables that could be used to predict the wine type.
def chemvars(df):
    return df[['fixed_acidity', 'volatile_acidity', 'citric_acid',
               'residual_sugar', 'chlorides', 'free_sulfur_dioxide',
               'total_sulfur_dioxide', 'density', 'pH', 'sulphates',
               'alcohol']]
# Yet another way to look at the data.
print(chemvars(df).describe())
# Histograms of all predictors.
plots.histograms(chemvars(df))
# The wine type is a string. Make a boolean type indicator.
def is_red(df): return np.array(df['wine_color']=="red", int)
# Calling the thing to be predicted 'y' is a convention.
y = is_red(df)
# Histograms by the wine type.
# Hint: This is very useful when you select or transform variables for a model
# later.
plots.class_histograms(y, chemvars(df), alpha=.3)
# Pandas data frames are handy for calculating class-wise summary statistics.
# Some examples...
# - Percentages of wine types in the data
print(100 * df.groupby(y).size() / float(len(df)))
# - Means (averages) by wine type (numeric columns only)
print(df.groupby(y).mean(numeric_only=True))
# - Standard deviations by wine type
print(df.groupby(y).std(numeric_only=True))
# We could use all the predictors in a model.
# But to keep things simple, let's start with two variables only.
# (Things are easier to plot if you have only two dimensions.)
# Choose variables that separate the target classes (wine type) well.
# For example...
# Hint: Try other predictors. Look at the histograms by wine type.
# Plot scatter plot of the chosen vars, colors specify wine type.
plots.scatter(y, chemvars(df), 'fixed_acidity', 'chlorides', alpha=0.2, ymax=0.4)
# Our function allows transformations; see the code in plots.py.
# A logarithm compresses high values and opens up the scale at the lower end.
plots.scatter(y, chemvars(df), 'fixed_acidity', 'chlorides', np.log, np.log, alpha=0.2, ymax=0.4)
# Notice: The second part starts here.
# Fit a logistic regression model using two variables.
# More info on logistic regression: http://en.wikipedia.org/wiki/Logistic_regression
# In a regression model, you should have a free constant in the linear combination.
# It is called the intercept.
# (Think for a while about a logistic regression with no variables, either with
# an intercept or without. The latter always gives p=0.5; the former adjusts to
# the prior class probabilities of the data.)
# The intercept is conveniently added by appending a constant variable to the data.
# This function returns the predictors for two chosen variables:
def preds2(df, v1, v2):
    X = df[[v1, v2]].copy()  # copy, so we do not modify a view of df
    X['intercept'] = 1.0
    return X
# For convenience, define a list of the variables we will use.
# (Note: a tuple does not work here, because this is used for indexing later.)
vars2d = ['fixed_acidity', 'chlorides']
# The magic happens here.
X = preds2(df, *vars2d)
model = sm.Logit(y, X)   # Create the model object.
mfit = model.fit()       # Train the model, i.e., optimize its parameters.
print(mfit.summary())    # It did something!
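# A quick numerical check of the intercept note above (a small sketch, using
# the y defined earlier): an intercept-only logistic regression learns exactly
# the prior class probability of the data.
m0 = sm.Logit(y, np.ones(len(y))).fit()
# The fitted coefficient equals the log-odds of the share of red wines:
print(m0.params[0], np.log(y.mean() / (1.0 - y.mean())))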
# Predictions: what the model "remembers" about the wines in the training data,
# i.e., the probabilities of each wine being red.
p = mfit.predict(X)
# How do those probabilities compare to what the wines actually were?
# There are many ways to evaluate predictions; here we just count how many
# guesses are correct when probabilities over 0.5 are guessed as red.
def pacc(y, p):
    return 100 * sum(y == (p > 0.5)) / float(len(y))
print(pacc(y, p))
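# A fuller picture than a single accuracy number (a sketch using pandas):
# cross-tabulate the true classes against the 0.5-thresholded guesses.
print(pd.crosstab(pd.Series(y, name='actual'),
                  pd.Series((p > 0.5).astype(int), name='predicted')))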
# From the model's point of view, a wine is only what we have shown it: vars2d.
# Because of the '2d', it is easy to plot what the model thinks about
# all possible wines, that is, all possible values of vars2d.
# (remember, 'intercept' is just a constant)
# Note the zoom tool in the fig!
plots.decision_surface(y, X, mfit.predict, 'fixed_acidity', 'chlorides')
# Shades of gray are probabilities.
# (You may want to look at the plotting code in plots.py, but it is not relevant
# to the big picture.)
# Note the linear decision boundary above. This comes from the linearity of the
# logistic regression model.
# If you want a more flexible decision boundary, you can add nonlinear terms
# to the model.
# Then the model is still linear with respect to the variables it sees, but not
# with respect to the original data!
def add_mterm(X, vars):
    """Add a multiplicative term to the model."""
    X = X.copy()
    X['*'.join(vars)] = np.multiply.reduce(X[list(vars)].values, 1)
    return X
# Here are some multiplicative terms.
# For example, (var1, var1) will add the term var1*var1 to the model,
# so the model would be var1 + var2 + var1*var1 + intercept.
def some_combinations(v1, v2):
    # A bunch of simple nonlinear multiplicative terms.
    # Hint: You may want to edit the terms or define a more general function,
    # as sketched right after this definition.
    return [(v1, v1), (v2, v2), (v1, v1, v1), (v2, v2, v2), (v1, v2)]
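# A more general alternative (a hypothetical helper, not used below): all
# monomials of the given variables up to a chosen degree, via itertools.
import itertools
def poly_combinations(variables, max_degree=3):
    return [c for d in range(2, max_degree + 1)
            for c in itertools.combinations_with_replacement(variables, d)]
# e.g. poly_combinations(['a', 'b'], 2) gives [('a','a'), ('a','b'), ('b','b')]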
# Let's make a data frame with all the nonlinear terms above!
v1, v2 = 'fixed_acidity', 'chlorides'
# add_nonlins() is needed later for plotting.
def add_nonlins(X, v1, v2): return reduce(add_mterm, some_combinations(v1, v2), X)
Xn = add_nonlins(preds2(df, v1, v2), v1, v2)
y = is_red(df)  # (just as a reminder)
# Fit the model with all the new nonlinear terms.
m_nl = sm.Logit(y, Xn)    # Model object with the expanded set of predictors.
mfit_nl = m_nl.fit()      # Fit it.
print(mfit_nl.summary())  # See how it looks.
# Probabilities the new model gives for the wines being red.
# Hint: you may plot p_nl against the predictions of the linear model (p),
# as sketched below.
p_nl = mfit_nl.predict(Xn)
# How many does it get right, in percentages?
print(pacc(y, p_nl))
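# Following the hint above, a quick comparison of the two models'
# probabilities (a sketch; pl is pylab, imported at the top):
pl.figure()
pl.scatter(p, p_nl, alpha=0.2)
pl.xlabel('p (linear model)')
pl.ylabel('p_nl (nonlinear model)')
pl.show()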
# Visualize the probability surface of the nonlinear model.
# Note that the plotter needs a function that maps a two-dimensional data frame
# onto the higher-dimensional training data.
plots.decision_surface(y, X, mfit_nl.predict, v1, v2, nlmap=add_nonlins)
# Quite a complex decision surface!
# Again, note the zoom tool in the plot window.
# Notice: The third part starts here.
# Evaluating models with held-out data.
# Let's make this a bit more functional. :)
# Split the data set randomly into two halves.
# (Tip: call np.random.seed(...) first if you want a reproducible split.)
df_train, df_test = [i[1] for i in df.groupby(np.random.random(len(df)) > .5)]
def my_preds2(df): return preds2(df, 'fixed_acidity', 'chlorides')
model = sm.Logit(is_red(df_train), my_preds2(df_train))
mfit = model.fit()
print(mfit.summary())
# Predict on the training data.
p_train = mfit.predict(my_preds2(df_train))
print(pacc(is_red(df_train), p_train))
# Predict on the *test* data. The model has never seen these samples.
p_test = mfit.predict(my_preds2(df_test))
print(pacc(is_red(df_test), p_test))
# ... or both at once:
print([pacc(is_red(d), mfit.predict(my_preds2(d))) for d in (df_train, df_test)])
# Hmm, how about using all the variables?
def preds_all(df):
    X = chemvars(df).copy()  # copy, so we do not modify a view of df
    X['intercept'] = 1.0
    return X
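# A sketch of fitting with all the chemical variables (reusing the helpers
# defined above), with train and test accuracies side by side:
mfit_all = sm.Logit(is_red(df_train), preds_all(df_train)).fit()
print([pacc(is_red(d), mfit_all.predict(preds_all(d))) for d in (df_train, df_test)])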
# You may experiment with more quadratic terms.
def diag_triangular(x): return [(i, j) for i in x for j in x if i <= j]
def triangular(x): return [(i, j) for i in x for j in x if i < j]
# The following command creates all the quadratic terms:
df_big = reduce(add_mterm, diag_triangular(chemvars(df).columns), chemvars(df))
# Note with big models:
# If your model blows up, that is, the fitter gives an error about a singular
# matrix, try the regularization approach (below).
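# A guarded attempt at the big quadratic model (a sketch; as noted above, the
# fit may fail with a singular-matrix error, hence the try/except):
try:
    X_big = df_big.copy()
    X_big['intercept'] = 1.0
    mfit_big = sm.Logit(is_red(df), X_big).fit()
    print(pacc(is_red(df), mfit_big.predict(X_big)))
except Exception as e:
    print("Big model failed to fit:", e)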
# As a side step, before regularization...
# Cross-validation does not waste half of the data on testing:
# it splits the data set into (e.g.) 10 pieces, trains on 9, tests on the
# remaining one, and loops through all ten pieces.
from crossvalidate import cv
n_success = cv(is_red(df), my_preds2(df), 10,
               # The function to build the model and predict:
               lambda ytr, Xtr, Xev: sm.Logit(ytr, Xtr).fit().predict(Xev),
               # The function to evaluate results: how many are right?
               lambda p, y: sum(y == (p > 0.5)))
print(100 * n_success / float(len(df)))
# Does your model blow up, complaining about a singular matrix?
# "numpy.linalg.linalg.LinAlgError: Singular matrix"
# This is because the maximum likelihood solution becomes unidentifiable:
# It is either not unique any more, or is numerically very unstable.
# Then you need a better estimation method, and that is Maximum a Posteriori (MAP).
# https://en.wikipedia.org/wiki/Maximum_a_posteriori_estimation
# http://en.wikipedia.org/wiki/Regularization_(mathematics)
# https://en.wikipedia.org/wiki/Bayesian_interpretation_of_regularization
# MAP or regularization makes the solution identifiable by making
# coefficients near zero more probable. To realize this near-zero preference,
# a new term is introduced to the cost function, with a weight 1/C.
# The smaller the C, the closer to zero the coefficients tend to be.
# If you use L1 regularization, some coefficients will be exactly zero, at
# least up to numerical (estimation) accuracy. That is, L1 regularization
# realizes automatic variable selection.
# In practice, one tries different values of C and evaluates the models
# with test data or cross validation.
# Regularization is our prior statement about the coefficients of the model.
# As the coefficients are (inversely) related to the scales of the variables,
# we need to rescale the data to some known and standard scale beforehand.
# The convention is to make the standard deviation of all variables equal to one.
# So here we go...
# Create a scaler object. This needs to be wrapped in an object, because
# the same scaling has to be reused multiple times.
# Notice: the sklearn package does not work well with pandas DataFrames;
# it expects the data as numpy arrays, hence the .values below.
scaler = skp.StandardScaler().fit(chemvars(df_train).values)
# Create a model object with L1 regularization and C=0.01.
# The data is not shown to the model yet. :)
# (Note: recent versions of scikit-learn require an L1-capable solver to be
# named explicitly, e.g. solver='liblinear'.)
m_l1 = skl.LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
# Fit the model, i.e., optimize the parameters.
m_l1.fit(scaler.transform(chemvars(df_train).values), is_red(df_train))
# The values of the parameters.
print(m_l1.coef_)
# The number of variables (columns) having a zero coefficient.
print(sum((abs(m_l1.coef_[0]) < 0.001).astype(int)))
# The names of the variables with non-zero coefficients.
print(chemvars(df_train).columns[abs(m_l1.coef_[0]) > 0.001])
# Compute predictions for the training data.
# This class gives probabilities for both classes, hence [:, 1].
p_train = m_l1.predict_proba(scaler.transform(chemvars(df_train).values))[:, 1]
# Training set accuracy.
print(pacc(is_red(df_train), p_train))
# Predictions (probabilities) for the test data.
p_test = m_l1.predict_proba(scaler.transform(chemvars(df_test).values))[:, 1]
# Test accuracy.
print(pacc(is_red(df_test), p_test))
# You may want to play with a larger model (consider training with df_big
# above) and different values of C.
# Use the test set or cross-validation to evaluate the results.
# You may also try to plot the decision surface with respect to two important
# variables in the model.
# (For that, you need to fix the values of the other variables, of course.)
# Hint: You may start with a value of C small enough to drive all the model
# coefficients to zero, and then gradually increase C, preferably on a
# logarithmic scale. See the sketch below.
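# A sketch of that C sweep (reusing the scaler and the helpers defined above):
# for each C, report the number of non-zero coefficients and the test accuracy.
for C in 10.0 ** np.arange(-4, 3):
    m = skl.LogisticRegression(C=C, penalty='l1', solver='liblinear')
    m.fit(scaler.transform(chemvars(df_train).values), is_red(df_train))
    p = m.predict_proba(scaler.transform(chemvars(df_test).values))[:, 1]
    print(C, sum(abs(m.coef_[0]) > 0.001), pacc(is_red(df_test), p))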