import copy

import numpy as np
import numpy.random as rand

# ConvLayer, PoolingLayer, FullyConnectedLayer, and the helpers J(),
# encode_label(), and load_mnist() are assumed to be defined elsewhere
# in this module.


def SGD_train(minibatch_size, data, labels, alpha, momentum, epochs):
    """Train the network with stochastic gradient descent

    :type minibatch_size: an integer
    :param minibatch_size: the size of the minibatches (usually something like 256)

    :type data: 3D matrix, height x width x num. training data pts.
    :param data: a 3D matrix that contains all of the training data points of the set

    :type labels: num. training data pts. x 1 vector
    :param labels: the labels for each image

    :type alpha: float
    :param alpha: the learning rate

    :type momentum: float
    :param momentum: the momentum

    :type epochs: an integer
    :param epochs: the number of epochs (i.e. full passes) through the training set
    """
    it = 0

    # convolutional layer, taking in a 28x28 image and using 2 9x9 filters;
    # output is 2 (28-9+1)x(28-9+1) = 2 20x20 feature maps in a (20, 20, 2) array
    layer0 = ConvLayer((28, 28, 1), (9, 9, 2))
    print "initialized convolutional layer"

    # pooling layer, taking in 2 20x20 feature maps;
    # output is 2 10x10 feature maps
    layer1 = PoolingLayer((20, 20, 2))
    print "initialized pooling layer"

    # fully-connected softmax layer, taking in 2 10x10 feature maps
    # (downsampled by 2) flattened into a 200-element input vector
    layer2 = FullyConnectedLayer(200, 10)
    print "initialized fully-connected layer"

    params = np.concatenate((layer0.W.flatten(), layer0.bias.flatten(),
                             layer2.W.flatten(), layer2.bias.flatten()))
    velocity = np.zeros(params.shape)

    for i in range(0, epochs):
        correct_class = 0
        cost = 0.0
        num_seen = 0  # number of training examples processed this epoch

        # shuffle the dataset--shuffle_vec will be used as indices
        shuffle_vec = rand.permutation(data.shape[2])

        # perform gradient descent with each minibatch
        for j in range(0, data.shape[2] - minibatch_size + 1, minibatch_size):
            it += 1
            if it == 20:
                # increase momentum after 20 iterations
                momentum = 0.9

            # gradient is an unrolled vector: the average of the minibatch's
            # per-example gradients from the forward pass and backprop
            for k in range(0, minibatch_size):
                layer0.forwardprop(data[:, :, shuffle_vec[j+k]].reshape((28, 28, 1)))
                layer1.downsample(layer0.output, (20, 20, 2))
                layer2_input = layer1.output.flatten()
                layer2.softmax_output(layer2_input.reshape((layer2_input.size, 1)))

                cost += J(layer2.output, labels[shuffle_vec[j+k]])
                num_seen += 1
                # print "%d %d" % (np.argmax(layer2.output), labels[shuffle_vec[j+k]])
                if np.argmax(layer2.output) == labels[shuffle_vec[j+k]]:
                    correct_class += 1

                # backprop
                layer2.backprop(0, 0, encode_label(labels[shuffle_vec[j+k]]))
                layer1.upsample(layer2, 0)
                layer0.backprop(layer1)

                # accumulate the flattened gradient vector
                if k == 0:
                    grad = np.concatenate((layer0.gradient_w.flatten(),
                                           layer0.gradient_b.flatten(),
                                           layer2.gradient_w.flatten(),
                                           layer2.gradient_b.flatten()))
                else:
                    grad += np.concatenate((layer0.gradient_w.flatten(),
                                            layer0.gradient_b.flatten(),
                                            layer2.gradient_w.flatten(),
                                            layer2.gradient_b.flatten()))

            grad /= minibatch_size

            # update the velocity vector, then take a step
            velocity = momentum*velocity + alpha*grad
            params = params - velocity

            # unpack the flat parameter vector back into the layers
            layer0.W = params[0:layer0.W.flatten().size].reshape(layer0.W.shape)
            next_begin = layer0.W.flatten().size
            layer0.bias = params[next_begin:next_begin+layer0.bias.flatten().size].reshape(layer0.bias.shape)
            next_begin += layer0.bias.flatten().size
            layer2.W = params[next_begin:next_begin+layer2.W.flatten().size].reshape(layer2.W.shape)
            next_begin += layer2.W.flatten().size
            layer2.bias = params[next_begin:].reshape(layer2.bias.shape)

        # reduce the learning rate by half after each epoch
        alpha /= 2.0
        print "%d correct classifications" % correct_class
        print "cost function is ", cost/num_seen  # average cost per example
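
# A minimal usage sketch (not from the original script): it assumes
# load_mnist() returns images as a 28 x 28 x N array and labels as a
# 1 x N row, the shapes implied by testGradient() below. The
# hyperparameter values here are illustrative, not tuned.
def example_train():
    images, labels = load_mnist()
    images /= 255.0  # scale pixel values into [0, 1], as testGradient() does
    # start with a modest momentum; SGD_train() raises it to 0.9 on its
    # own after 20 iterations
    SGD_train(256, images, labels[0], 0.1, 0.5, 3)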

def testGradient():
    """Test the backprop implementation by checking the gradients on a small network"""
    # load the training data
    images, labels = load_mnist()
    images /= 255.0
    grad_images = images[:, :, 0:10]  # use a 10-image subset for gradient checking
    grad_labels = labels[0, 0:10]     # the respective labels, to be one-hot encoded

    # create a small network: 1 conv layer + 1 pooling layer + 1 fully-connected softmax layer

    # convolutional layer, taking in a 28x28 image and using 2 9x9 filters;
    # output is 2 (28-9+1)x(28-9+1) = 2 20x20 feature maps in a (20, 20, 2) array
    layer0 = ConvLayer(grad_images[:, :, 0].reshape((28, 28, 1)), (28, 28, 1), (9, 9, 2, 1))
    print "initialized convolutional layer"
    layer0.forwardprop(grad_images[:, :, 0].reshape((28, 28, 1)))
    print "finished forward pass of convolutional layer"

    # pooling layer, taking in 2 20x20 feature maps;
    # output is 2 10x10 feature maps (though one may want to downsample 5x for gradient checking)
    layer1 = PoolingLayer(layer0.output, (20, 20, 2))
    print "initialized pooling layer"
    layer1.downsample(layer0.output, (20, 20, 2))
    print "finished forward pass of pooling layer"

    # fully-connected softmax layer, taking in 2 10x10 feature maps (if downsampled by 2)
    # or 2 4x4 feature maps (if downsampled by 5); either way, flattened into a long input vector
    full_conn_input = layer1.output.flatten()
    layer2 = FullyConnectedLayer(full_conn_input.reshape((full_conn_input.size, 1)),
                                 full_conn_input.size, 10)
    print "initialized fully-connected layer"
    layer2.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
    print "finished forward pass of fully-connected layer"

    # perform backpropagation against the one-hot encoding of the first label
    target = np.zeros((10, 1))
    target[int(grad_labels[0])] = 1
    layer2.backprop(0, 0, target)
    print "finished layer 2 backprop"
    layer1.upsample(layer2, 0)
    print "finished layer 1 backprop"
    layer0.backprop(layer1)
    print "finished layer 0 backprop"

    # # after initialization, finish training
    # for i in range(1, grad_labels.size):
    #     # forward propagation
    #     layer0.forwardprop(grad_images[:, :, i].reshape((28, 28, 1)))
    #     layer1.downsample(layer0.output, (20, 20, 2))
    #     full_conn_input = layer1.output.flatten()
    #     layer2.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
    #
    #     # backpropagation
    #     target = np.zeros((10, 1))
    #     target[int(grad_labels[i])] = 1
    #     layer2.backprop(0, 0, target)
    #     layer1.upsample(layer2, 0)
    #     layer0.backprop(layer1)

    # check the gradient numerically
    epsilon = 1.0e-4
    # work on deep copies so the perturbed forward passes below don't
    # clobber the layers' actual parameters
    layer0_check = copy.deepcopy(layer0)
    layer1_check = copy.deepcopy(layer1)
    layer2_check = copy.deepcopy(layer2)

    layer0_w_vec = layer0.W.flatten()
    layer0_bias_vec = layer0.bias.flatten()
    layer0_gradw = layer0.gradient_w.flatten()
    layer0_gradb = layer0.gradient_b.flatten()
    layer2_w_vec = layer2.W.flatten()
    layer2_bias_vec = layer2.bias.flatten()
    layer2_gradw = layer2.gradient_w.flatten()
    layer2_gradb = layer2.gradient_b.flatten()

    w_vec = np.concatenate((layer0_w_vec, layer0_bias_vec, layer2_w_vec, layer2_bias_vec))
    backprop_vec = np.concatenate((layer0_gradw, layer0_gradb, layer2_gradw, layer2_gradb))
    print layer0_gradw

    gradient_check = np.zeros(w_vec.size)
    for i in range(0, w_vec.size):
        # perturb one parameter at a time; copy() so w_vec itself is untouched
        pos = w_vec.copy()
        pos[i] += epsilon
        neg = w_vec.copy()
        neg[i] -= epsilon

        # feed forward to get J(w+e) and J(w-e), then estimate the gradient
        # J(w+e)
        layer0_check.W = pos[0:layer0_w_vec.size].reshape(layer0.filter_shape)
        layer0_check.bias = pos[layer0_w_vec.size : layer0_w_vec.size+layer0_bias_vec.size].reshape(layer0.bias_shape)
        layer2_check.W = pos[layer0_w_vec.size+layer0_bias_vec.size : layer0.W.size+layer0.bias.size+layer2_w_vec.size].reshape(layer2.W.shape)
        layer2_check.bias = pos[layer0.W.size+layer0.bias.size+layer2_w_vec.size:].reshape(layer2.bias.shape)
        layer0_check.forwardprop(grad_images[:, :, 0].reshape((28, 28, 1)))
        layer1_check.downsample(layer0_check.output, (20, 20, 2))
        full_conn_input = layer1_check.output.flatten()
        layer2_check.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
        pos_out = J(layer2_check.output, grad_labels[0])

        # J(w-e)
        layer0_check.W = neg[0:layer0_w_vec.size].reshape(layer0.filter_shape)
        layer0_check.bias = neg[layer0_w_vec.size : layer0_w_vec.size+layer0_bias_vec.size].reshape(layer0.bias_shape)
        layer2_check.W = neg[layer0_w_vec.size+layer0_bias_vec.size : layer0.W.size+layer0.bias.size+layer2_w_vec.size].reshape(layer2.W.shape)
        layer2_check.bias = neg[layer0.W.size+layer0.bias.size+layer2_w_vec.size:].reshape(layer2.bias.shape)
        layer0_check.forwardprop(grad_images[:, :, 0].reshape((28, 28, 1)))
        layer1_check.downsample(layer0_check.output, (20, 20, 2))
        full_conn_input = layer1_check.output.flatten()
        layer2_check.softmax_output(full_conn_input.reshape((full_conn_input.size, 1)))
        neg_out = J(layer2_check.output, grad_labels[0])

        # central-difference estimate of gradient component i
        gradient_check[i] = (pos_out - neg_out)/(2*epsilon)

    # print gradient_check
    print gradient_check[0:layer0_w_vec.size]
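
# testGradient() prints the two gradient vectors but stops short of
# comparing them. A common diagnostic (a sketch, not part of the original
# code) is the relative error between the backprop gradient and the
# central-difference estimate; with epsilon = 1e-4, values much smaller
# than about 1e-4 suggest the backprop implementation is correct.
# e.g., at the end of testGradient():
#     print relative_gradient_error(backprop_vec, gradient_check)
def relative_gradient_error(backprop_vec, gradient_check):
    # norm of the difference, normalized by the norms of the two vectors
    # so the result is scale-independent
    num = np.linalg.norm(backprop_vec - gradient_check)
    den = np.linalg.norm(backprop_vec) + np.linalg.norm(gradient_check)
    return num / den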