sparserbm.py
import numpy as np
from numpy import random
from scipy import sparse
import time
from sys import stdout
class SparseRBM(object):
"""
SparseRBM implements an RBM using a sparse subsampling procedure to
speed up runtime. The idea is to perform gibbs sampling on a subset of
the visible layer instead of the full visible layer. This is achieved by
passing along a list of indexes for the active features in each gibbs step.
Gibbs is run on the active features while all other features are assumed to
have an activation probability of 0.
    The current implementation only considers features that are active in at
    least one data point in the mini-batch. Other selection procedures can be
    used by implementing a different 'make_batches' procedure in the source code.
    Training is performed with stochastic gradient descent (SGD) with momentum,
    using fast persistent contrastive divergence (FPCD-n) to approximate the
    gradient; basic usage is shown in the example below.
Documentation on RBM: http://www.cs.toronto.edu/~hinton/absps/guideTR.pdf
Documentation on FPCD: http://www.cs.toronto.edu/~tijmen/fpcd/fpcd.pdf
Parameters:
visible_size: The size of the visible layer
hidden_size: The size of hidden layer
epochs: The number of training iterations over the dataset (default 10)
batch_size: The size of each individual mini-batch (default 100)
n: The number of gibbs steps in the negative phase of training (default 1)
learn_rate: The learning rate of SGD (default 0.1)
momentum: The momentum of gradient updates in SGD (default 0.9)
fw_decay: The decay rate of the fast-weights in FPCD
        (the default of 0.98 gives good empirical results)
l2: The magnitude of L2 regularization (default 0.0)
verbose: Display costs and runtime during training (default False)
Set attributes:
W: The weights of the RBM
vbias: The visible bias term
hbias: The hidden bias term
fW: The fast-weights used for FPCD
total_epochs: The total number of epochs this RBM has been trained
cost_hist: A list of cost histories of each mini-batch during training
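    Example (an illustrative sketch, assuming a binary scipy.sparse matrix X;
    the sizes and hyperparameters below are arbitrary example values):
        >>> X = sparse.random(1000, 5000, density=0.01, format='csr')
        >>> X.data[:] = 1.0
        >>> rbm = SparseRBM(visible_size=5000, hidden_size=64, epochs=5)
        >>> rbm = rbm.fit(X)
        >>> hidden_probs = rbm.transform(X.toarray())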
"""
def __init__(self, visible_size, hidden_size, epochs=10, batch_size=100,
n=1, learn_rate=0.1, momentum=0.9, fw_decay=0.98, l2=0.0,
verbose=False):
self.visible_size = visible_size
self.hidden_size = hidden_size
self.hbias = np.zeros(hidden_size)
self.vbias = np.zeros(visible_size)
self.W = random.randn(hidden_size, visible_size) / \
np.sqrt(hidden_size + visible_size)
self.batch_size = batch_size
self.epochs = epochs
self.learn_rate = learn_rate
self.n = n
self.l2 = l2
self.momentum = momentum
self.verbose = verbose
self.fw_decay = fw_decay
self.fW = np.zeros(self.W.shape)
self.flr = learn_rate * np.exp(1) # fast learn rate heuristic
self.p = np.zeros((self.batch_size, self.hidden_size))
self._prevgrad = {'W': np.zeros(self.W.shape),
'hbias': np.zeros(hidden_size),
'vbias': np.zeros(visible_size)}
self.total_epochs = 0
self.cost_hist = []
def __repr__(self):
return ('SparseRBM(visible_size=%i, hidden_size=%i, epochs=%i, '
'batch_size=%i, learn_rate=%.4g, momentum=%.4g, l2=%.4g, '
'fw_decay=%.4g, verbose=%s)') %(self.visible_size, self.hidden_size,
self.epochs, self.batch_size, self.learn_rate, self.momentum,
self.l2, self.fw_decay, self.verbose)
def propup(self, vis, subsample_ids=None, fw=False):
"""
Compute and sample P(h | v)
Note: Calling this function without subsample_ids can be used to perform
        a transformation of the full dataset for use in an ML pipeline
Args:
vis: The state of the visible layer - shape (m, len(subsample_ids))
subsample_ids: The feature indices used in this subsample
fw: Boolean controlling if fast weights should be used
Returns:
            a 3-tuple: (sample, probabilities, linear outputs before the nonlinearity)
"""
W = self.fW + self.W if fw else self.W
if subsample_ids is not None:
W = W[:, subsample_ids]
pre_non_lin = vis.dot(W.T) + self.hbias
non_lin = sigmoid(pre_non_lin)
sample = sample_bernoulli(non_lin)
return (sample, non_lin, pre_non_lin)
def propdown(self, hid, subsample_ids=None, fw=False):
"""
Compute and sample P(v | h)
        Note: Calling this function without subsample_ids when visible_size is
        large will be extremely slow and may exhaust memory, grinding your
        machine to a halt. You've been warned!
Args:
hid: The state of the hidden layer - shape: (m, hidden_size)
subsample_ids: The visible indices we want from this subsample
fw: Boolean controlling if fast weights should be used
Returns:
            a 3-tuple: (sample, probabilities, linear outputs before the nonlinearity)
"""
W = self.fW + self.W if fw else self.W
vbias = self.vbias
if subsample_ids is not None:
W = W[:, subsample_ids]
vbias = vbias[subsample_ids]
pre_non_lin = hid.dot(W) + vbias
non_lin = sigmoid(pre_non_lin)
sample = sample_bernoulli(non_lin)
return (sample, non_lin, pre_non_lin)
def gibbs_hvh(self, h, meanfield=False, **args):
"""
Performs one step of gibbs sampling given the hidden state
Args:
h: The hidden state
meanfield: Boolean controlling if we want to use the mean field values
during gibbs instead of samples
**args: arguments to pass to propup/propdown procedures
Returns:
a 2-tuple of 3-tuples (visible samples, hidden samples)
"""
v_samples = self.propdown(h, **args)
v = v_samples[1] if meanfield else v_samples[0]
h_samples = self.propup(v, **args)
return v_samples, h_samples
def gibbs_vhv(self, v, meanfield=False, **args):
"""
Performs one step of gibbs sampling given the visible state
Args:
v: The visible state
meanfield: Boolean controlling if we want to use the mean field values
during gibbs instead of samples
**args: arguments to pass to propup/propdown procedures
Returns:
a 2-tuple of 3-tuples (visible samples, hidden samples)
"""
h_samples = self.propup(v, **args)
        h = h_samples[1] if meanfield else h_samples[0]  # use the sample, as in gibbs_hvh
v_samples = self.propdown(h, **args)
return v_samples, h_samples
def cost(self, v, subsample_ids=None):
"""
Compute the 'cost' and gradient using FPCD.
        NOTE: The 'cost' is not the objective being optimized; it is only the
        approximate reconstruction error of the visible sample. What RBMs
        actually minimize is an energy function.
Args:
v: The visible state
subsample_ids: The visible indices we want to consider when computing
the reconstruction error and gradient
Returns:
cost: The reconstruction error
grad: A dict containing gradient approximations for W, vbias, hbias
"""
num_points = v.shape[0]
# positive phase
pos_h_samples = self.propup(v, subsample_ids)
# negative phase
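        # self.p holds the persistent hidden states of the negative-phase Markov
        # chains (PCD); gibbs_hvh is run with fw=True so the fast weights are
        # added to W while sampling, as in FPCD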
nh0 = self.p[:num_points]
        for i in range(self.n):
neg_v_samples, neg_h_samples = self.gibbs_hvh(nh0,
subsample_ids=subsample_ids,
fw=True)
nh0 = neg_h_samples[0]
# compute gradients
grad = self._grad(v, pos_h_samples, neg_v_samples, neg_h_samples)
self.p[:num_points] = nh0
# compute reconstruction error
reconstruction = self.propdown(pos_h_samples[0], subsample_ids)[1]
cost = np.abs(v - reconstruction).sum(1).mean(0)
return cost, grad
def _grad(self, pv0, pos_h, neg_v, neg_h):
"""
Helper to compute the gradient approximation
Args:
pv0: visible layer state from the positive phase
            pos_h: hidden layer state from the positive phase
neg_v: visible layer state from the negative phase
neg_h: hidden layer state from the negative phase
Returns:
The gradient dict required in the cost function
"""
grad = {}
num_points = pv0.shape[0]
E_v = neg_v[1]
E_h = neg_h[1]
E_hgv = pos_h[1]
E_vh = E_h.T.dot(E_v)
E_vhgv = E_hgv.T.dot(pv0)
grad['W'] = (E_vhgv - E_vh) / num_points
grad['vbias'] = (pv0 - E_v).mean(0)
grad['hbias'] = (E_hgv - E_h).mean(0)
return grad
def update(self, grad, subsample_ids=None):
"""
Update the RBM parameters W, vbias, hbias, fW using momentum
Args:
grad: The gradient dict returned from the cost function
subsample_ids: The subsample indices used when generating the gradient
Returns:
self
"""
prev_grad = self._prevgrad
dW0 = grad['W']
dv0 = grad['vbias']
dh0 = grad['hbias']
if subsample_ids is not None:
dv0 = np.zeros(self.vbias.shape)
dW0 = np.zeros(self.W.shape)
dv0[subsample_ids] = grad['vbias']
dW0[:, subsample_ids] = grad['W']
dW = self.momentum * prev_grad['W'] + \
self.learn_rate * (dW0 - self.l2 * self.W)
dh = self.momentum * prev_grad['hbias'] + self.learn_rate * dh0
dv = self.momentum * prev_grad['vbias'] + self.learn_rate * dv0
self.W += dW
self.hbias += dh
self.vbias += dv
        self.fW = self.fw_decay * self.fW + self.flr * dW0  # fast-weight update for FPCD
self._prevgrad['W'] = dW
self._prevgrad['hbias'] = dh
self._prevgrad['vbias'] = dv
return self
def transform(self, data):
"""
Perform a transformation of the data to activation probabilities of the
hidden layer
Args:
            data: the data to be transformed, interpreted as the visible layer
Returns:
The activation probabilities p(h | v)
"""
return self.propup(data)[1]
def fit(self, data):
"""
Trains the RBM using stochastic gradient descent for self.epochs
iterations over the dataset
        Note: contrary to scikit-learn conventions, calling fit will not
        reinitialize the weights of the model. Training continues from the
        RBM's current weights and biases.
Args:
data - the data to be interpreted as the visible states of the RBM
Returns:
self
"""
n, m = data.shape
        num_batches = n // self.batch_size
e = 0
if self.verbose:
            start_time = time.time()
while e < self.epochs:
e += 1
batches = make_batches(data, self.batch_size)
for i, (batch, subsample_ids) in enumerate(batches):
cost, grad = self.cost(batch, subsample_ids)
                self.update(grad, subsample_ids)
self.cost_hist.append(cost)
if self.verbose:
                    print('Batch %i - Cost %0.6f\r' % (i + 1, cost), end='')
stdout.flush()
if self.verbose:
                print('Training Epoch %i' % self.total_epochs, end=' ')
                print('Average Cost: %0.6f\t\t' % np.mean(self.cost_hist[-num_batches:]))
stdout.flush()
self.total_epochs += 1
if self.verbose:
            end_time = time.time()
            print('Runtime %0.2fs' % (end_time - start_time))
return self
def make_batches(data, batch_size=100):
"""
Split the data into minibatches of size batch_size
    This procedure generates subsample ids for each batch by considering only
features that are active in the minibatch
Args:
data - the data to be split into minibatches (must be rank 2)
batch_size - the size of the minibatches
Returns:
        batches - a list: [(batch, subsample_ids) for batch in minibatches]
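    Example (an illustrative sketch; row order within the batch may vary):
        >>> X = sparse.csr_matrix(np.array([[1, 0, 1, 0], [0, 0, 1, 0]]))
        >>> [(b.shape, ids.tolist()) for b, ids in make_batches(X, batch_size=2)]
        [((2, 2), [0, 2])]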
"""
n = data.shape[0]
perm = random.permutation(range(n))
i = 0
batches = []
while i < n:
batch = perm[i:i+batch_size]
i += batch_size
batches.append(data[batch])
    # keep only the features (columns) that are active somewhere in each batch
    try:
        ids = [np.flatnonzero((b.sum(0) != 0).A.flatten()) for b in batches]
    except AttributeError:
        # dense input: the boolean mask has no .A attribute
        ids = [np.flatnonzero((b.sum(0) != 0).flatten()) for b in batches]
    batches = [(b[:, i].toarray(), i) for b, i in zip(batches, ids)]
return batches
def sigmoid(X):
"""Compute sigmoid function"""
return 1 / (1 + np.exp(-X))
def sample_bernoulli(X):
"""
    All values of X must be probabilities of independent events, each occurring
    according to a Bernoulli distribution
    Returns an indicator array:
    output[i,j] = 1 iff X[i,j] >= uniform(0, 1)
"""
return (X >= random.uniform(size=X.shape)).astype('b')
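# The block below is an illustrative usage sketch rather than part of the original
# module: it trains the RBM on a small random binary sparse matrix and prints the
# shape of the resulting hidden representation. The dataset sizes and
# hyperparameters are arbitrary example values.
if __name__ == '__main__':
    random.seed(0)
    X = sparse.random(500, 2000, density=0.02, format='csr')
    X.data[:] = 1.0  # binarize the nonzero entries
    rbm = SparseRBM(visible_size=2000, hidden_size=64, epochs=2,
                    batch_size=100, verbose=True)
    rbm.fit(X)
    H = rbm.transform(X.toarray())
    print('Hidden representation shape:', H.shape)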