def fit(self, X, Y, lr=10e-5, mu=0.99, reg=10e-7, decay=0.99999, eps=10e-3, batch_sz=30, epochs=100, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    X, Y = X[:-1000], Y[:-1000]

    # initialize convpool layers
    N, c, width, height = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = (outw - fw + 1) // 2
        outh = (outh - fh + 1) // 2
        mi = mo

    # initialize mlp layers
    K = len(set(Y))
    self.hidden_layers = []
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0] * outw * outh
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_bias(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for c in self.convpool_layers:
        self.params += c.params
    for h in self.hidden_layers:
        self.params += h.params

    # for momentum
    dparams = [
        theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        for p in self.params
    ]

    # for rmsprop
    cache = [
        theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        for p in self.params
    ]

    # set up theano functions and variables
    thX = T.tensor4('X', dtype='float32')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.th_predict(thX)

    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

    # updates = [
    #     (c, decay*c + (np.float32(1)-decay)*T.grad(cost, p)*T.grad(cost, p)) for p, c in zip(self.params, cache)
    # ] + [
    #     (p, p + mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
    # ] + [
    #     (dp, mu*dp - lr*T.grad(cost, p)/T.sqrt(c + eps)) for p, c, dp in zip(self.params, cache, dparams)
    # ]

    # momentum only
    updates = [
        (p, p + mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ] + [
        (dp, mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ]

    train_op = theano.function(inputs=[thX, thY], updates=updates)

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

            train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
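# The commented-out block above combines RMSprop with momentum: `cache` keeps a running
# average of squared gradients, and the velocity term `dp` scales each gradient step by
# 1/sqrt(cache + eps). A minimal NumPy sketch of the same update for a single parameter
# follows; every name here (w, grad, cache, velocity, rmsprop_momentum_step) is
# illustrative and not part of the class above.

import numpy as np

def rmsprop_momentum_step(w, grad, cache, velocity,
                          lr=1e-4, mu=0.99, decay=0.999, eps=1e-8):
    # Mirror the Theano `updates` list: every right-hand side uses the
    # *previous* values of cache and velocity, just as Theano updates do.
    new_cache = decay * cache + (1 - decay) * grad * grad
    new_velocity = mu * velocity - lr * grad / np.sqrt(cache + eps)
    new_w = w + mu * velocity - lr * grad / np.sqrt(cache + eps)
    return new_w, new_cache, new_velocity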
def fit(self, X, Y, lr=10e-5, mu=0.99, reg=10e-7, decay=0.99999, eps=10e-3, batch_sz=30, epochs=100, show_fig=True):
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # ============= Prep Data =============
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    # Validation set - last 1000 entries
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    # Training set - everything except the last 1000 entries
    X, Y = X[:-1000], Y[:-1000]

    # ============= Prep ConvPool layers =============
    # initialize convpool layers
    N, c, width, height = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    # For each parameterised convpool layer
    conv_layer_count = 0
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh, self.pool_sz[conv_layer_count])
        # Add layer
        self.convpool_layers.append(layer)
        # Output width/height after the convolution + pooling layer
        outw = (outw - fw + 1) // self.pool_sz[conv_layer_count][0]
        outh = (outh - fh + 1) // self.pool_sz[conv_layer_count][1]
        # Set feature input to previous feature output for the next loop
        mi = mo
        conv_layer_count += 1

    # ============= Prep ANN layers =============
    # K = number of unique values of Y
    K = len(set(Y))
    # list to store all the hidden layers
    self.hidden_layers = []
    # Flatten the last convpool feature output as the input to the ANN;
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0] * outw * outh
    count = 0
    # Loop through the hidden layers in hidden_layer_sizes
    for M2 in self.hidden_layer_sizes:
        # Create hidden layer
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        # Set feature input to previous feature output for the next loop
        M1 = M2
        count += 1

    # ============= Prep Log Regression layer =============
    W, b = init_weight_and_bias(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # ============= Collect parameters for SGD =============
    self.params = [self.W, self.b]
    for c in self.convpool_layers:
        self.params += c.params
    for h in self.hidden_layers:
        self.params += h.params

    # momentum
    dparams = [
        theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        for p in self.params
    ]

    # rmsprop
    cache = [
        theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        for p in self.params
    ]

    # define theano variables - X and Y
    thX = T.tensor4('X', dtype='float32')
    thY = T.ivector('Y')
    # Probability of Y
    pY = self.forward(thX)

    # regularisation cost
    # rcost = reg_parameter * sum(each_parameter^2)
    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    # cost = -mean(log of the probabilities at the target indices) + rcost
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    # prediction
    prediction = self.th_predict(thX)

    # function to calculate the prediction cost without updates;
    # used to calculate cost of prediction for the validation set
    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

    # momentum only. Update params and dparams
    updates = [
        (p, p + mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ] + [
        (dp, mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ]

    train_op = theano.function(inputs=[thX, thY], updates=updates)

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Y[j * batch_sz:(j * batch_sz + batch_sz)]

            train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.savefig("cost.png")
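# As a sanity check on the outw/outh bookkeeping above: a 'valid' convolution shrinks the
# spatial size by (filter width - 1), and the pooling then divides it by the pool width.
# A quick worked example; the input size and layer shapes here are illustrative, not taken
# from any class in this file.

outw = 48  # illustrative input width
for fw, pool_w in [(5, 2), (5, 2)]:           # two (5, 5) filters with (2, 2) pooling
    outw = (outw - fw + 1) // pool_w
    print(outw)                               # prints 22, then 9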
def fit(self, X, Y, Xvalid, Yvalid, lr=1e-3, mu=0.99, reg=1e-3, decay=0.99999, eps=1e-10, batch_sz=30, epochs=3, show_fig=True):
    # downcast
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)
    X = X.astype(np.float32)
    Xvalid = Xvalid.astype(np.float32)
    Y = Y.astype(np.int32)
    Yvalid = Yvalid.astype(np.int32)

    # initialize convpool layers
    N, c, width, height = X.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.convpool_layer_sizes:
        layer = ConvPoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(layer)
        outw = (outw - fw + 1) // 2
        outh = (outh - fh + 1) // 2
        mi = mo

    # initialize mlp layers
    K = len(set(Y))
    self.hidden_layers = []
    # size must be same as output of last convpool layer
    M1 = self.convpool_layer_sizes[-1][0] * outw * outh
    count = 0
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        M1 = M2
        count += 1

    # logistic regression layer
    W, b = init_weight_and_bias(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect params for later use
    self.params = [self.W, self.b]
    for c in self.convpool_layers:
        self.params += c.params
    for h in self.hidden_layers:
        self.params += h.params

    # set up theano functions and variables
    thX = T.tensor4('X', dtype='float32')
    thY = T.ivector('Y')
    pY = self.forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thY.shape[0]), thY])) + rcost
    prediction = self.th_predict(thX)

    cost_predict_op = theano.function(inputs=[thX, thY], outputs=[cost, prediction])

    updates = rmsprop(cost, self.params, lr, mu, decay, eps)
    train_op = theano.function(
        inputs=[thX, thY],
        outputs=cost,
        updates=updates
    )

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        X, Y = shuffle(X, Y)
        for j in range(n_batches):
            Xbatch = X[j*batch_sz:(j*batch_sz+batch_sz)]
            Ybatch = Y[j*batch_sz:(j*batch_sz+batch_sz)]

            train_c = train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print(
                    "i:", i, "j:", j, "nb:", n_batches,
                    "train cost:", train_c, "cost:", c, "error rate:", e
                )

    if show_fig:
        plt.plot(costs)
        plt.show()
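# The `rmsprop` helper called above is not defined in this section. A plausible sketch,
# consistent with the commented-out RMSprop-with-momentum updates in the first version,
# is shown below; the signature matches the call site, but the real helper's exact
# behaviour (e.g. cache initialisation) may differ.

def rmsprop(cost, params, lr, mu, decay, eps):
    # Build a Theano updates list implementing RMSprop with momentum (sketch only).
    grads = T.grad(cost, params)
    updates = []
    for p, g in zip(params, grads):
        # per-parameter shared state for the squared-gradient cache and the velocity
        cache = theano.shared(np.ones_like(p.get_value(), dtype=np.float32))
        velocity = theano.shared(np.zeros_like(p.get_value(), dtype=np.float32))

        new_cache = decay * cache + (np.float32(1) - decay) * g * g
        new_velocity = mu * velocity - lr * g / T.sqrt(new_cache + eps)

        updates.append((cache, new_cache))
        updates.append((velocity, new_velocity))
        updates.append((p, p + new_velocity))
    return updates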
def fit(self, X, Y, lr=10e-5, mu=0.99, reg=10e-7, decay=0.99999, eps=10e-3, batch_sz=30, epochs=100, show_fig=True):
    # step 1: cast parameters to a suitable type and preprocess input data
    lr = np.float32(lr)
    mu = np.float32(mu)
    reg = np.float32(reg)
    decay = np.float32(decay)
    eps = np.float32(eps)

    # make a validation set
    X, Y = shuffle(X, Y)
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    Xvalid, Yvalid = X[-1000:], Y[-1000:]
    Xtrain, Ytrain = X[:-1000], Y[:-1000]

    # step 2: initialize weights in convpool layers and mlp layers
    # convpool uses padding='valid'; convpool initialization
    N, c, width, height = Xtrain.shape
    mi = c
    outw = width
    outh = height
    self.convpool_layers = []
    for mo, fw, fh in self.conv_pool_size:
        h = ConvpoolLayer(mi, mo, fw, fh)
        self.convpool_layers.append(h)
        outw = (outw - fw + 1) // 2
        outh = (outh - fh + 1) // 2
        mi = mo

    # mlp initialization
    K = len(set(Ytrain))  # number of classes
    M1 = self.conv_pool_size[-1][0] * outw * outh
    count = 0
    self.hidden_layers = []
    for M2 in self.hidden_layer_size:
        h = HiddenLayer(M1, M2, count)
        self.hidden_layers.append(h)
        count += 1
        M1 = M2

    # the output layer
    W, b = weights_and_bias_init(M1, K)
    self.W = theano.shared(W, 'W_logreg')
    self.b = theano.shared(b, 'b_logreg')

    # collect all parameter matrices as a list
    self.params = [self.W, self.b]
    for h in self.convpool_layers:
        self.params += h.params
    for h in self.hidden_layers:
        self.params += h.params

    # step 3: theano structure and cost, prediction, and updates expressions
    # initialize: (momentum and RMSprop)
    dparams = [
        theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        for p in self.params
    ]
    cache = [
        theano.shared(np.zeros(p.get_value().shape, dtype=np.float32))
        for p in self.params
    ]

    thX = T.tensor4('X', dtype='float32')
    thT = T.ivector('T')
    pY = self.th_forward(thX)

    rcost = reg * T.sum([(p * p).sum() for p in self.params])
    cost = -T.mean(T.log(pY[T.arange(thT.shape[0]), thT])) + rcost
    prediction = self.th_predict(thX)

    self.predict_op = theano.function(inputs=[thX], outputs=prediction)
    cost_predict_op = theano.function(inputs=[thX, thT], outputs=[cost, prediction])

    # momentum-only updates
    updates = [
        (p, p + mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ] + [
        (dp, mu * dp - lr * T.grad(cost, p)) for p, dp in zip(self.params, dparams)
    ]

    train_op = theano.function(inputs=[thX, thT], updates=updates)

    n_batches = N // batch_sz
    costs = []
    for i in range(epochs):
        Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
        for j in range(n_batches):
            Xbatch = Xtrain[j * batch_sz:(j * batch_sz + batch_sz)]
            Ybatch = Ytrain[j * batch_sz:(j * batch_sz + batch_sz)]

            train_op(Xbatch, Ybatch)

            if j % 20 == 0:
                c, p = cost_predict_op(Xvalid, Yvalid)
                costs.append(c)
                e = error_rate(Yvalid, p)
                print("i:", i, "j:", j, "nb:", n_batches, "cost:", c, "error rate:", e)

    if show_fig:
        plt.plot(costs)
        plt.show()
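# In every version above, the cost picks out each sample's predicted probability of its
# true class with pY[T.arange(thY.shape[0]), thY] and averages the negative log. A small
# NumPy illustration of that indexing; the probabilities and targets below are made up.

import numpy as np

pY = np.array([[0.7, 0.1, 0.1, 0.1],     # softmax outputs for 3 samples, 4 classes
               [0.2, 0.5, 0.2, 0.1],
               [0.1, 0.1, 0.1, 0.7]])
targets = np.array([0, 1, 3])

picked = pY[np.arange(len(targets)), targets]   # [0.7, 0.5, 0.7]
cost = -np.mean(np.log(picked))                 # ~0.469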