Example #1
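# Assumed imports for this snippet: `np` and `rand` are taken to be NumPy and
# numpy.random; ConvLayer, PoolingLayer, FullyConnectedLayer, J (the cost
# function), and encode_label are assumed to be defined elsewhere in the module.
import numpy as np
import numpy.random as rand
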
def SGD_train(minibatch_size, data, labels, alpha, momentum, epochs):
    """Train the network with stochastic gradient descent

    :type minibatch_size: an integer
    :param minibatch_size: the size of the minibatches (usually something like 256)

    :type data: 3D matrix height x width x num training data pts.
    :param data: A 3D matrix that contains all of the training data points of the set

    :type labels: num training data pts x 1 vector
    :param labels: the labels for each image

    :type alpha: float
    :param alpha: the learning rate

    :type momentum: float
    :param momentum: the momentum

    :type epochs: an integer
    :param epochs: the number of epochs (i.e., full passes) through the training set
    """

    it = 0
    # convolutional layer, taking in a 28x28 image and applying two 9x9 filters
    # output should be two (28-9+1)x(28-9+1) = 20x20 feature maps, stored as a (20, 20, 2) array
    layer0 = ConvLayer((28, 28, 1), (9, 9, 2))
    print("initialized convolutional layer")
    # pooling layer, taking in the two 20x20 feature maps
    # output should be two 10x10 feature maps after downsampling by 2
    layer1 = PoolingLayer((20, 20, 2))
    print("initialized pooling layer")
    # fully-connected softmax layer, taking in the two 10x10 downsampled feature maps
    # flattened into a single 200-element input vector, and producing 10 class scores
    layer2 = FullyConnectedLayer(200, 10)
    print("initialized fully-connected layer")
    params = np.concatenate((layer0.W.flatten(), layer0.bias.flatten(), layer2.W.flatten(), layer2.bias.flatten()))
    velocity = np.zeros(params.shape)

    for i in range(epochs):
        correct_class = 0
        cost = 0.0
        # shuffle the dataset--shuffle_vec will be used as indices
        shuffle_vec = rand.permutation(data.shape[2])

        for j in range(0, data.shape[2] - minibatch_size + 1, minibatch_size):
            # perform gradient descent w/each batch
            it += 1

            if it == 20:
                # increase momentum after 20 iterations
                momentum = 0.9

            # grad should be an unrolled vector of the average of the minibatch_size
            # gradients obtained from the forward pass and backprop below
            for k in range(0, minibatch_size):
                layer0.forwardprop(data[:,:,shuffle_vec[j+k]].reshape((28,28,1)))
                layer1.downsample(layer0.output, (20,20,2))
                layer2_input = layer1.output.flatten()
                layer2.softmax_output(layer2_input.reshape((layer2_input.size, 1)))
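                # forward pass done: layer2.output holds the predicted class scores for this example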
                cost += J(layer2.output, labels[shuffle_vec[j+k]])
                # print "%d %d" % (np.argmax(layer2.output), labels[shuffle_vec[j+k]])

                if np.argmax(layer2.output) == labels[shuffle_vec[j+k]]:
                    correct_class += 1

                # backprop: push the error from the softmax layer back through the pooling (upsample) and conv layers
                layer2.backprop(0, 0, encode_label(labels[shuffle_vec[j+k]]))
                layer1.upsample(layer2, 0)
                layer0.backprop(layer1)
                # flatten the gradient vector
                if k == 0:
                    grad = np.concatenate((layer0.gradient_w.flatten(), layer0.gradient_b.flatten(), layer2.gradient_w.flatten(), layer2.gradient_b.flatten()))
                else:
                    grad += np.concatenate((layer0.gradient_w.flatten(), layer0.gradient_b.flatten(), layer2.gradient_w.flatten(), layer2.gradient_b.flatten()))

            grad /= minibatch_size
            # classical momentum update: the velocity accumulates a decaying sum of
            # past gradients, and the parameters step in the opposite direction
            velocity = momentum*velocity + alpha*grad
            params = params - velocity

            # unpack the flat parameter vector back into each layer's weights and
            # biases, in the same order they were packed above
            layer0.W = params[0:layer0.W.flatten().size].reshape(layer0.W.shape)
            next_begin = layer0.W.flatten().size
            layer0.bias = params[next_begin:next_begin+layer0.bias.flatten().size].reshape(layer0.bias.shape)
            next_begin += layer0.bias.flatten().size
            layer2.W = params[next_begin:next_begin+layer2.W.flatten().size].reshape(layer2.W.shape)
            next_begin += layer2.W.flatten().size
            layer2.bias = params[next_begin:].reshape(layer2.bias.shape)

        # reduce learning rate by half after each epoch
        alpha /= 2.0
        print "%d correct classifications" % correct_class
        print "cost function is ", cost/(minibatch_size*(data.shape[2] - minibatch_size + 1))