def build_cnn_model(image_height, image_width, n_kernel, batch_size,
                    learning_rate, rng):
    print('... building the model')
    x = T.matrix('x', dtype=theano.config.floatX)
    y = T.ivector('y')
    layer_1_input = x.reshape((batch_size, 1, image_height, image_width))
    layer_1 = LeNetConvPoolLayer(rng,
                                 input=layer_1_input,
                                 image_shape=(batch_size, 1, image_height,
                                              image_width),
                                 filter_shape=(n_kernel[0], 1, 7, 7),
                                 poolsize=(2, 2))
    layer_2 = LeNetConvPoolLayer(rng,
                                 input=layer_1.output,
                                 image_shape=(batch_size, n_kernel[0], 57, 77),
                                 filter_shape=(n_kernel[1], n_kernel[0], 6, 6),
                                 poolsize=(2, 2))
    layer_3 = LeNetConvPoolLayer(rng,
                                 input=layer_2.output,
                                 image_shape=(batch_size, n_kernel[1], 26, 36),
                                 filter_shape=(n_kernel[2], n_kernel[1], 5, 5),
                                 poolsize=(2, 2))
    layer_4 = HiddenLayer(rng,
                          input=layer_3.output.flatten(2),
                          n_in=n_kernel[2] * 11 * 16,
                          n_out=batch_size,
                          activation=T.tanh)
    layer_5 = LogisticRegression(input=layer_4.output,
                                 input_dim=batch_size,
                                 output_dim=12)
    cost = layer_5.negative_log_likelihood(y)
    error = layer_5.errors(y)
    # all five layers are trained jointly
    params = layer_5.params + layer_4.params + layer_3.params + layer_2.params + layer_1.params
    grads = T.grad(cost, params)
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    train_model = theano.function([x, y], cost, updates=updates)
    validation_model = theano.function([x, y], error)

    return train_model, validation_model
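# --- Hypothetical usage sketch (not part of the original snippet) ---
# The hard-coded feature-map sizes above (57x77, 26x36, 11x16) only work out
# for 120x160 single-channel inputs: (120 - 7 + 1) / 2 = 57 and
# (160 - 7 + 1) / 2 = 77 after the first conv/pool stage, and so on.
# LeNetConvPoolLayer, HiddenLayer and LogisticRegression are assumed to be
# provided by the surrounding module; the kernel counts are illustrative.
if __name__ == '__main__':
    import numpy
    rng = numpy.random.RandomState(23455)
    train_model, validation_model = build_cnn_model(
        image_height=120, image_width=160,
        n_kernel=[10, 20, 30],   # one entry per conv/pool layer
        batch_size=20,
        learning_rate=0.05,
        rng=rng)
    # x_batch: (20, 120 * 160) matrix of floatX, y_batch: (20,) int32 vector
    # cost = train_model(x_batch, y_batch)
    # error_rate = validation_model(x_batch, y_batch)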
Example #2
class LRTest:
    def __init__(self):
        import theano
        import util
        from theano import tensor as T
        from logistic_regression import LogisticRegression
        self.BATCH_SIZE = 100
        self.LEARNING_RATE = 0.12
        self.dataSets = util.loadMnistData("mnist.pkl.gz")
        self.x = T.dmatrix('x')
        self.y = T.ivector('y')
        self.index = T.iscalar('index')
        self.classifier = LogisticRegression(input=self.x, nIn=28 * 28, nOut=10)
        self.cost = self.classifier.negativeLogLikelihood(self.y)
        self.gW = T.grad(cost=self.cost, wrt=self.classifier.W)
        self.gB = T.grad(cost=self.cost, wrt=self.classifier.b)
        self.trainSet, self.validSet, self.testSet = self.dataSets
        self.nTrainSet, self.nValidSet, self.nTestSet = map(self.numBatches, self.dataSets)
        updates = [
            (self.classifier.W, self.classifier.W - self.LEARNING_RATE * self.gW),
            (self.classifier.b, self.classifier.b - self.LEARNING_RATE * self.gB)
        ]

        def makeGivens(data):
            return {
                self.x: data[0][self.index * self.BATCH_SIZE:(self.index + 1) * self.BATCH_SIZE],
                self.y: data[1][self.index * self.BATCH_SIZE:(self.index + 1) * self.BATCH_SIZE]
            }

        self.testModel = theano.function(
            inputs=[self.index],
            outputs=self.classifier.errors(self.y),
            givens=makeGivens(self.dataSets[2])
        )
        self.validationModel = theano.function(
            inputs=[self.index],
            outputs=self.classifier.errors(self.y),
            givens=makeGivens(self.dataSets[1])
        )
        self.trainModel = theano.function(
            inputs=[self.index],
            outputs=self.cost,
            updates=updates,
            givens=makeGivens(self.dataSets[0])
        )

    def numBatches(self, dataSet):
        return dataSet[0].get_value(borrow=True).shape[0] / self.BATCH_SIZE

    def printValid(self, epoch, batchIndex, loss):
        return 'epoch %i, minibatch %i/%i, validation error %f %%' % (
            epoch,
            batchIndex + 1,
            self.nTrainSet,
            loss * 100.
        )

    def printTestScore(self, epoch, batchIndex, score):
        return (
            '     epoch %i, minibatch %i/%i, test error of'
            ' best model %f %%'
        ) % (
            epoch,
            batchIndex + 1,
            self.nTrainSet,
            score * 100.
        )

    def resultString(self, best, test):
        return ('Optimization complete with best validation score of %f %%, '
                'with test performance %f %%') % (best * 100., test * 100.)

    def train(self):
        import numpy

        patience = 5000
        patienceIncrease = 2
        MAX_EPOCH = 1000
        improveThresh = 0.995
        validationFreq = min(self.nTrainSet, patience / 2)
        bestValidationLoss = numpy.inf
        epoch = 0
        done = False
        testLoss = 0

        while (epoch < MAX_EPOCH) and not done:
            epoch += 1
            for batchIndex in xrange(self.nTrainSet):
                avgCost = self.trainModel(batchIndex)
                iter = (epoch - 1) * self.nTrainSet + batchIndex
                if (iter + 1) % validationFreq == 0:
                    loss = numpy.mean(map(self.validationModel, xrange(self.nValidSet)))
                    if loss < bestValidationLoss:
                        if loss < bestValidationLoss * improveThresh:
                            patience = max(patience, iter * patienceIncrease)
                        bestValidationLoss = loss
                        testLoss = numpy.mean(map(self.testModel, xrange(self.nTestSet)))
                    yield epoch, batchIndex, loss, testLoss, bestValidationLoss
                if patience <= iter:
                    done = True
                    break

    def doTrain(self):
        for epoch, batchIndex, loss, testScore, bestScore in self.train():
            msg = self.printValid(epoch, batchIndex, loss)
            msg += self.printTestScore(epoch, batchIndex, testScore)
            print(msg)

        print(self.resultString(bestScore, testScore))
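# Hypothetical entry point (not in the original snippet): run the
# early-stopping loop, assuming mnist.pkl.gz plus the util and
# logistic_regression helper modules are available on the path.
if __name__ == '__main__':
    trainer = LRTest()
    trainer.doTrain()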
Example #3
def fit(n_windows, win_width, rand_state, data_set, data_labels, filename="LR_weights.pkl"):
    # Permuting data
    rng = np.random.RandomState(8000)
    indices = rng.permutation(len(data_set))
    data_set = np.array(data_set)
    data_labels = np.array(data_labels)
    data_set, data_labels = data_set[indices], data_labels[indices]
    print str(len(data_set)) + " all samples"

    train_len = int(len(data_set) * 9.0 / 10.0)
    valid_len = len(data_set) - train_len
    print "Train: " + str(train_len)
    print "Validate: " + str(valid_len)

    # Splitting fs
    train_dir = fs.File("LR_training.hdf5", "a")
    train_data = train_dir.create_dataset("LR_train_data", shape=((train_len + 1) * n_windows, 41, 41), dtype="i")
    train_labels = train_dir.create_dataset("LR_train_labels", shape=((train_len + 1) * n_windows,), dtype="i")

    valid_dir = fs.File("LR_validating.hdf5", "a")
    valid_data = valid_dir.create_dataset("LR_valid_data", shape=((valid_len + 1) * n_windows, 41, 41), dtype="i")
    valid_labels = valid_dir.create_dataset("LR_valid_labels", shape=((valid_len + 1) * n_windows,), dtype="i")
    counter = 0
    next_counter = 0
    for iter, data_sample in enumerate(data_set):
        if iter % 10000 == 0:
            print iter
        windows = WinExt.get_windows(data_sample, n_windows, win_width, rand_state)
        for window in windows:
            # First windows part for training
            # Second part for validation
            if iter < train_len:
                train_data[counter] = window
                train_labels[counter] = data_labels[iter]
                counter += 1
            else:
                valid_data[next_counter] = window
                valid_labels[next_counter] = data_labels[iter]
                next_counter += 1
    # Setting real length
    train_len = counter
    valid_len = next_counter
    print "Size of train is " + str(train_len)
    print "Size of valid is " + str(valid_len)
    print "Extracting has finished its work..."

    batch_size = 500

    if train_len % batch_size != 0:  # if the last batch is not full, just don't use the remainder
        whole = (train_len / batch_size) * batch_size
        train_len = whole
    if valid_len % batch_size != 0:
        whole = (valid_len / batch_size) * batch_size
        valid_len = whole

    n_train_batches = train_len / batch_size
    n_valid_batches = valid_len / batch_size

    data_tr = theano.shared(
        np.asarray(np.zeros((batch_size, 41, 41), dtype=np.int), dtype=theano.config.floatX), borrow=True
    )
    labels_tr = theano.shared(np.asarray(np.zeros(batch_size, dtype=np.int), dtype="int32"), borrow=True)
    data_val = theano.shared(
        np.asarray(np.zeros((batch_size, 41, 41), dtype=np.int), dtype=theano.config.floatX), borrow=True
    )
    labels_val = theano.shared(np.asarray(np.zeros(batch_size, dtype=np.int), dtype="int32"), borrow=True)

    print "Building logistic regression classifier..."
    x = T.dtensor3("x")  # dtensor3 for 3d array
    y = T.ivector("y")  # the labels are presented as 1D vector of [int] labels
    rng = np.random.RandomState(8000)

    classifier = LogisticRegression(input=x.flatten(2), n_in=41 * 41, n_out=2)

    cost = classifier.negative_log_likelihood(y)
    learning_rate = 0.03  # 0.3 / float(n_train_batches)

    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # start-snippet-3
    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)]

    validate_model = theano.function(inputs=[], outputs=classifier.errors(y), givens={x: data_val, y: labels_val})

    # indices - for random shuffle
    train_model = theano.function(
        inputs=[], outputs=classifier.errors(y), updates=updates, givens={x: data_tr, y: labels_tr}
    )

    print "Training..."
    # GDM with batches
    epoch = 0
    n_epochs = 30
    min_error = 100.0
    errors = []
    indices = rng.permutation(train_len)
    while epoch < n_epochs:
        print "================= " + str(epoch + 1) + " epoch =============== "
        for minibatch_index in range(n_train_batches):
            if minibatch_index % 50 == 0:
                print str(minibatch_index) + " batch"
            data_tr.set_value(
                np.array([train_data[indices[minibatch_index * batch_size + i]] for i in range(batch_size)]),
                borrow=True,
            )
            labels_tr.set_value(
                np.array([train_labels[indices[minibatch_index * batch_size + i]] for i in range(batch_size)]),
                borrow=True,
            )
            train_model()
        # compute zero-one loss on validation set
        validation_losses = []
        for i in range(n_valid_batches):
            data_val.set_value(np.array(valid_data[i * batch_size : (i + 1) * batch_size]), borrow=True)
            labels_val.set_value(np.array(valid_labels[i * batch_size : (i + 1) * batch_size]), borrow=True)
            validation_losses.append(validate_model())
        this_validation_loss = np.mean(validation_losses) * 100
        errors.append(this_validation_loss)
        if this_validation_loss < min_error:
            print str(this_validation_loss) + "% error"
            min_error = this_validation_loss
            save_parameters(classifier, filename)
        epoch += 1
        print "Shuffling..."
        indices = rng.permutation(train_len)

    show_errors(errors, "LogReg: 4 windows, h=41")

    # Cleaning data
    train_dir.clear()
    valid_dir.clear()
    train_dir.close()
    valid_dir.close()
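# --- Hypothetical call sketch (not part of the original module) ---
# n_windows=4 and 41x41 windows match the plot title above; the placeholder
# samples, their 100x100 shape and the label split are made up here, and
# WinExt, fs (an h5py-like module), save_parameters and show_errors must all
# come from the surrounding project.
if __name__ == '__main__':
    rand_state = np.random.RandomState(8000)
    samples = [np.zeros((100, 100), dtype=np.int8) for _ in range(1000)]
    labels = [0] * 500 + [1] * 500   # two classes, matching n_out=2
    fit(n_windows=4, win_width=41, rand_state=rand_state,
        data_set=samples, data_labels=labels)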
def evaluate_convnet(learning_rate=0.1, n_epochs=1,
                     dataset='mnist.pkl.gz',
                     nkerns=[20, 50], batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = ConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 28, 28),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2)
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4)
    layer1 = ConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 12, 12),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(
        rng,
        input=layer2_input,
        n_in=nkerns[1] * 4 * 4,
        n_out=500,
        activation=T.tanh
    )

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,  # the negative log-likelihood of the LogisticRegression layer
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)
            
            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
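# Hypothetical driver (not in the original): a single-epoch smoke test with
# the default MNIST pickle. load_data, ConvPoolLayer, HiddenLayer and
# LogisticRegression are assumed to be importable in this module's context.
if __name__ == '__main__':
    evaluate_convnet(learning_rate=0.1, n_epochs=1,
                     dataset='mnist.pkl.gz',
                     nkerns=[20, 50], batch_size=500)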
Example #5
def train_CNN_mini_batch(learning_rate,
                         n_epochs,
                         num_kernels,
                         batch_size,
                         filter_size,
                         is_multi_scale,
                         num_of_classes,
                         height,
                         width,
                         use_interpolation,
                         use_hidden_layer):
    train_set_x_by_1, train_set_y, valid_set_x_by_1, valid_set_y, test_set_x_by_1, test_set_y, train_set_x_by_2, \
    train_set_x_by_4, valid_set_x_by_2, valid_set_x_by_4, test_set_x_by_2, test_set_x_by_4 \
        = load_processed_img_data()

    n_train_batches = train_set_x_by_1.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x_by_1.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x_by_1.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    index = theano.tensor.lscalar()
    x_by_1 = theano.tensor.ftensor4('x_by_1')
    x_by_2 = theano.tensor.ftensor4('x_by_2')
    x_by_4 = theano.tensor.ftensor4('x_by_4')

    y = theano.tensor.ivector('y')

    print '... initialize the model'

    cnn_dir = 'models/CNN_'
    if is_multi_scale is True:
        cnn_dir += 'M_'
    else:
        cnn_dir += 'S_'

    if use_hidden_layer is True:
        cnn_dir += 'H_'
    else:
        cnn_dir += 'L_'

    if use_interpolation is True:
        cnn_dir += 'I_'
    else:
        cnn_dir += 'N_'

    cnn_dir = cnn_dir + str(num_kernels[0]) + '_' + str(num_kernels[1]) + '_' + str(num_kernels[2]) + '_' + str(
        batch_size) + '_'
    curr_date = str(datetime.date.today())
    curr_date = curr_date.replace('-', '_')
    cnn_dir = cnn_dir + curr_date + str(time.strftime('_%H_%M_%S'))

    print 'CNN model is ', cnn_dir

    if not os.path.exists(cnn_dir):
        os.makedirs(cnn_dir)

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(cnn_dir + '/log.txt', 'w')

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

    sys.stdout = Logger()

    layer0 = CNN_Layer(
        name='Layer_0',
        W=None,
        b=None,
        filter_shape=(num_kernels[0], 3, filter_size, filter_size),
    )

    layer1 = CNN_Layer(
        name='Layer_1',
        W=None,
        b=None,
        filter_shape=(num_kernels[1], num_kernels[0], filter_size, filter_size),
    )

    layer2 = CNN_Layer(
        name='Layer_2',
        W=None,
        b=None,
        filter_shape=(num_kernels[2], num_kernels[1], filter_size, filter_size),
    )

    layer3 = HiddenLayer(
        name='Layer_3',
        W=None,
        b=None,
        n_in=num_kernels[2] * 3 if is_multi_scale is True else num_kernels[2],
        n_out=num_kernels[2] * 4 if is_multi_scale is True else num_kernels[2] * 2,
        activation=theano.tensor.tanh
    )

    if is_multi_scale and use_hidden_layer:
        layer4_in = num_kernels[2] * 4
    elif is_multi_scale and not use_hidden_layer:
        layer4_in = num_kernels[2] * 3
    elif not is_multi_scale and use_hidden_layer:
        layer4_in = num_kernels[2] * 2
    else:
        layer4_in = num_kernels[2]

    layer4 = LogisticRegression(
        name='Layer_4',
        W=None,
        b=None,
        n_in=layer4_in,
        n_out=num_of_classes,
    )

    forward_propagation(
        layer0=layer0,
        layer1=layer1,
        layer2=layer2,
        layer3=layer3,
        layer4=layer4,
        x_by_1=x_by_1,
        x_by_2=x_by_2,
        x_by_4=x_by_4,
        num_kernels=num_kernels,
        batch_size=batch_size,
        filter_size=filter_size,
        is_multi_scale=is_multi_scale,
        height=height,
        width=width,
        use_interpolation=use_interpolation,
        use_hidden_layer=use_hidden_layer
    )

    if use_hidden_layer is True:
        L2_norm = ((layer4.W ** 2).sum() + (layer3.W ** 2).sum() + (layer2.W ** 2).sum()
                   + (layer1.W ** 2).sum() + (layer0.W ** 2).sum())
    else:
        L2_norm = (layer4.W ** 2).sum() + (layer2.W ** 2).sum() + (layer1.W ** 2).sum() + (layer0.W ** 2).sum()

    regularization = 0.00001
    cost = layer4.negative_log_likelihood(y) + (regularization * L2_norm)

    if is_multi_scale is True:
        test_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1: test_set_x_by_1[index * batch_size: (index + 1) * batch_size],
                x_by_2: test_set_x_by_2[index * batch_size: (index + 1) * batch_size],
                x_by_4: test_set_x_by_4[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width]
            }
        )
    else:
        test_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1: test_set_x_by_1[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width]
            }
        )

    if is_multi_scale is True:
        validate_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1: valid_set_x_by_1[index * batch_size: (index + 1) * batch_size],
                x_by_2: valid_set_x_by_2[index * batch_size: (index + 1) * batch_size],
                x_by_4: valid_set_x_by_4[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width]
            }
        )
    else:
        validate_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1: valid_set_x_by_1[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size * height * width: (index + 1) * batch_size * height * width]
            }
        )

    if use_hidden_layer is True:
        params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params
    else:
        params = layer4.params + layer2.params + layer1.params + layer0.params

    grads = theano.tensor.grad(cost, params)

    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    if is_multi_scale is True:
        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x_by_1: train_set_x_by_1[index * batch_size: (index + 1) * batch_size],
                x_by_2: train_set_x_by_2[index * batch_size: (index + 1) * batch_size],
                x_by_4: train_set_x_by_4[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size * width * height: (index + 1) * batch_size * width * height]
            }
        )
    else:
        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x_by_1: train_set_x_by_1[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size * width * height: (index + 1) * batch_size * width * height]
            }
        )

    print '... training the model'
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)

    best_layer_0_W = numpy.zeros_like(layer0.W.get_value())
    best_layer_0_b = numpy.zeros_like(layer0.b.get_value())
    best_layer_1_W = numpy.zeros_like(layer1.W.get_value())
    best_layer_1_b = numpy.zeros_like(layer1.b.get_value())
    best_layer_2_W = numpy.zeros_like(layer2.W.get_value())
    best_layer_2_b = numpy.zeros_like(layer2.b.get_value())
    best_layer_3_W = numpy.zeros_like(layer3.W.get_value())
    best_layer_3_b = numpy.zeros_like(layer3.b.get_value())
    best_layer_4_W = numpy.zeros_like(layer4.W.get_value())
    best_layer_4_b = numpy.zeros_like(layer4.b.get_value())

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for mini_batch_index in xrange(n_train_batches):

            start = time.clock()
            iter = (epoch - 1) * n_train_batches + mini_batch_index
            cost_ij = train_model(mini_batch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, mini-batch %i/%i, validation error %f %%' %
                      (epoch, mini_batch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # save best filters
                    best_layer_0_W = layer0.W.get_value()
                    best_layer_0_b = layer0.b.get_value()
                    best_layer_1_W = layer1.W.get_value()
                    best_layer_1_b = layer1.b.get_value()
                    best_layer_2_W = layer2.W.get_value()
                    best_layer_2_b = layer2.b.get_value()
                    best_layer_3_W = layer3.W.get_value()
                    best_layer_3_b = layer3.b.get_value()
                    best_layer_4_W = layer4.W.get_value()
                    best_layer_4_b = layer4.b.get_value()

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in xrange(n_test_batches)
                    ]

                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, mini-batch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, mini_batch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

            print 'training @ iter = %d, time taken = %f' % (iter, (time.clock() - start))

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    if not os.path.exists(cnn_dir + '/params'):
        os.makedirs(cnn_dir + '/params')

    numpy.save(cnn_dir + '/params/layer_0_W.npy', best_layer_0_W)
    numpy.save(cnn_dir + '/params/layer_0_b.npy', best_layer_0_b)
    numpy.save(cnn_dir + '/params/layer_1_W.npy', best_layer_1_W)
    numpy.save(cnn_dir + '/params/layer_1_b.npy', best_layer_1_b)
    numpy.save(cnn_dir + '/params/layer_2_W.npy', best_layer_2_W)
    numpy.save(cnn_dir + '/params/layer_2_b.npy', best_layer_2_b)
    numpy.save(cnn_dir + '/params/layer_3_W.npy', best_layer_3_W)
    numpy.save(cnn_dir + '/params/layer_3_b.npy', best_layer_3_b)
    numpy.save(cnn_dir + '/params/layer_4_W.npy', best_layer_4_W)
    numpy.save(cnn_dir + '/params/layer_4_b.npy', best_layer_4_b)
    numpy.save(cnn_dir + '/params/filer_kernels.npy', num_kernels)
    numpy.save(cnn_dir + '/params/filter_size.npy', filter_size)

    return cnn_dir
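# --- Hypothetical call sketch (not part of the original module) ---
# Every value below is illustrative: num_kernels needs exactly three entries,
# and filter_size, num_of_classes, height and width must match whatever
# load_processed_img_data / forward_propagation expect in the real project.
if __name__ == '__main__':
    model_dir = train_CNN_mini_batch(learning_rate=0.1,
                                     n_epochs=10,
                                     num_kernels=[16, 32, 64],
                                     batch_size=10,
                                     filter_size=5,
                                     is_multi_scale=True,
                                     num_of_classes=8,
                                     height=240,
                                     width=320,
                                     use_interpolation=True,
                                     use_hidden_layer=True)
    print 'trained CNN saved under', model_dir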
Example #6
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset='mnist.pkl.gz',
                    n_kernels=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type n_kernels: list of ints
    :param n_kernels: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, n_kernels[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(n_kernels[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, n_kernels[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, n_kernels[0], 12, 12),
                                filter_shape=(n_kernels[1], n_kernels[0], 5,
                                              5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, n_kernels[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=n_kernels[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 1  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
            print(('     patience %i') % (patience))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)
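# Entry point in the style of the original LeNet tutorial (not shown in the
# snippet above): runs the full training with the default hyper-parameters,
# loading mnist.pkl.gz through load_data.
if __name__ == '__main__':
    evaluate_lenet5()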
class RRNN(object):
    """Recurrent ReLU Neural Network
    """
    def __init__(
            self,
            numpy_rng,
            theano_rng=None,
            n_ins=N_FEATURES * N_FRAMES,
            relu_layers_sizes=[1024, 1024, 1024],
            recurrent_connections=[2],  # layer(s), can only be i^t -> i^{t+1}
            n_outs=62 * 3,
            rho=0.9,
            eps=1.E-6):
        """ TODO 
        """

        self.relu_layers = []
        self.params = []
        self.n_layers = len(relu_layers_sizes)
        self._rho = rho  # ``momentum'' for adadelta
        self._eps = eps  # epsilon for adadelta
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        self.n_outs = n_outs

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        self.x = T.fmatrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = relu_layers_sizes[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.relu_layers[-1].output

            if i in recurrent_connections:
                inputr_size = relu_layers_sizes[i]
                previous_output = T.fmatrix('previous_output')
                relu_layer = RecurrentReLU(rng=numpy_rng,
                                           input=layer_input,
                                           in_stack=previous_output,
                                           n_in=input_size,
                                           n_in_stack=inputr_size,
                                           n_out=inputr_size)
                #relu_layer.in_stack = relu_layer.output # TODO TODO TODO

                self.params.extend(relu_layer.params)
                self._accugrads.extend([
                    shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]),
                                             dtype='float32'),
                           name='accugrad_W',
                           borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[0], ),
                                             dtype='float32'),
                           name='accugrad_b',
                           borrow=True),
                    shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]),
                                             dtype='float32'),
                           name='accugrad_Ws',
                           borrow=True)
                ])
                self._accudeltas.extend([
                    shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]),
                                             dtype='float32'),
                           name='accudelta_W',
                           borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[0], ),
                                             dtype='float32'),
                           name='accudelta_b',
                           borrow=True),
                    shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]),
                                             dtype='float32'),
                           name='accudelta_Ws',
                           borrow=True)
                ])
            else:
                relu_layer = ReLU(rng=numpy_rng,
                                  input=layer_input,
                                  n_in=input_size,
                                  n_out=relu_layers_sizes[i])

                self.params.extend(relu_layer.params)
                self._accugrads.extend([
                    shared(value=numpy.zeros(
                        (input_size, relu_layers_sizes[i]), dtype='float32'),
                           name='accugrad_W',
                           borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[i], ),
                                             dtype='float32'),
                           name='accugrad_b',
                           borrow=True)
                ])
                self._accudeltas.extend([
                    shared(value=numpy.zeros(
                        (input_size, relu_layers_sizes[i]), dtype='float32'),
                           name='accudelta_W',
                           borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[i], ),
                                             dtype='float32'),
                           name='accudelta_b',
                           borrow=True)
                ])

            self.relu_layers.append(relu_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(input=self.relu_layers[-1].output,
                                           n_in=relu_layers_sizes[-1],
                                           n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self._accugrads.extend([
            shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs),
                                     dtype='float32'),
                   name='accugrad_W',
                   borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accugrad_b',
                   borrow=True)
        ])
        self._accudeltas.extend([
            shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs),
                                     dtype='float32'),
                   name='accudelta_W',
                   borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accudelta_b',
                   borrow=True)
        ])

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(
            self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs=[
            theano.Param(batch_x),
            theano.Param(batch_y),
            theano.Param(learning_rate)
        ],
                                   outputs=cost,
                                   updates=updates,
                                   givens={
                                       self.x: batch_x,
                                       self.y: batch_y
                                   })

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                                                      self._accudeltas,
                                                      self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = -T.sqrt(
                (accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 -
                                                          self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(
            inputs=[theano.Param(batch_x),
                    theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={
                self.x: batch_x,
                self.y: batch_y
            })

        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params,
                                           gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = accugrad + gparam * gparam
            dx = -(learning_rate / T.sqrt(agrad + self._eps)) * gparam
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[
            theano.Param(batch_x),
            theano.Param(batch_y),
            theano.Param(learning_rate)
        ],
                                   outputs=cost,
                                   updates=updates,
                                   givens={
                                       self.x: batch_x,
                                       self.y: batch_y
                                   })

        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(
            inputs=[theano.Param(batch_x),
                    theano.Param(batch_y)],
            outputs=self.errors,
            givens={
                self.x: batch_x,
                self.y: batch_y
            })

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref
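# --- Hypothetical smoke test (not part of the original class) ---
# Two random minibatches stand in for real data. recurrent_connections is
# left empty because the 'previous_output' input of RecurrentReLU is not yet
# wired into the compiled functions (see the TODO above); with a recurrent
# layer, theano.function would complain about a missing input.
if __name__ == '__main__':
    numpy_rng = numpy.random.RandomState(123)
    net = RRNN(numpy_rng, n_ins=400,
               relu_layers_sizes=[256, 256],
               recurrent_connections=[],
               n_outs=62 * 3)
    train_fn = net.get_adadelta_trainer()
    fake_x = numpy.random.rand(32, 400).astype('float32')
    fake_y = numpy.random.randint(0, 62 * 3, size=32).astype('int32')
    for epoch in range(3):
        print(train_fn(fake_x, fake_y))        # summed NLL on the fake batch
    score_fn = net.score_classif([(fake_x, fake_y)])
    print(numpy.mean(score_fn()))              # error rate on the same batch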
Example #8
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
        # of [int] labels
        # end-snippet-1
        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well) During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question... but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shared weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size, k):
        '''Generates a list of functions for performing one step of
        gradient descent at a given layer. The function will require
        as input the minibatch index, and to train an RBM you just
        need to iterate, calling the corresponding function on all
        minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared var. that contains all datapoints used
                            for training the RBM
        :type batch_size: int
        :param batch_size: size of a [mini]batch
        :param k: number of Gibbs steps to do in CD-k / PCD-k

        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        learning_rate = T.scalar('lr')  # learning rate to use

        # number of batches
        n_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        # begining of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=None,
                                                 k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[index, theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
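    # Usage sketch for the functions returned above (hypothetical names: `dbn`
    # is an instance of this class, `train_set_x` a shared training matrix and
    # `pretraining_epochs` / `n_train_batches` plain ints):
    #
    #     pretrain_fns = dbn.pretraining_functions(train_set_x, batch_size=10, k=1)
    #     for layer_fn in pretrain_fns:                # one function per RBM layer
    #         for epoch in xrange(pretraining_epochs):
    #             costs = [layer_fn(index=i, lr=0.01)  # `lr` names the theano.Param above
    #                      for i in xrange(n_train_batches)]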

    def build_finetune_functions(self, datasets, batch_size):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on a
        batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: It is a list that contains all the datasets;
                        it has to contain three pairs, `train`,
                        `valid`, `test` in this order, where each pair
                        is formed of two Theano variables, one for the
                        datapoints, the other for the labels
        :type batch_size: int
        :param batch_size: size of a minibatch
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches /= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches /= batch_size

        index = T.lscalar('index')  # index to a [mini]batch
        learning_rate = T.scalar('lr')  # learning rate to use

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = []
        for param, gparam in zip(self.params, gparams):
            updates.append(
                (param, param -
                 gparam * T.cast(learning_rate, dtype=theano.config.floatX)))

        train_fn = theano.function(
            inputs=[index, theano.Param(learning_rate, default=0.1)],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            })

        test_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            })

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_fn, valid_score, test_score
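# Finetuning usage sketch (assumptions: the enclosing class above is the
# tutorial's DBN, `load_data` returns three (data, label) shared-variable
# pairs, and numpy has been imported):
#
#     datasets = load_data('mnist.pkl.gz')
#     dbn = DBN(numpy_rng=numpy.random.RandomState(123), n_ins=28 * 28,
#               hidden_layers_sizes=[500, 500], n_outs=10)
#     train_fn, valid_score, test_score = dbn.build_finetune_functions(
#         datasets, batch_size=10)
#     n_train_batches = datasets[0][0].get_value(borrow=True).shape[0] / 10
#     for epoch in xrange(10):
#         for i in xrange(n_train_batches):
#             train_fn(i, lr=0.1)          # `lr` names the theano.Param above
#         print 'validation error:', numpy.mean(valid_score())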
Example #9
class StackedDenoisingAutoencoder:
    def __init__(self,
                 numpyRng,
                 theanoRng=None,
                 nIn=28*28,
                 hiddenLayerSizes=[500,500],
                 nOut=10):
        self.nLayers = len(hiddenLayerSizes)
        if not theanoRng:
            theanoRng = theano.tensor.shared_randomstreams.RandomStreams(numpyRng.randint(2 ** 30))
        self.x = T.matrix('x')
        self.y = T.ivector('y')
        def makeSigmoidLayer(lastLayer,lastLayerSize,size):
            return Layer(rng=numpyRng,input=lastLayer,nIn=lastLayerSize,nOut=size,activation=T.nnet.sigmoid)
        def makeDALayer(lastLayer,lastLayerSize,size,sigmoidLayer):
            return DenoisingAutoEncoder(
                numpyRng=numpyRng,theanoRng=theanoRng,input=lastLayer,
                nVisible=lastLayerSize,
                nHidden=size,
                W=sigmoidLayer.W,
                bHidden=sigmoidLayer.b)
        def makeLayers(lastLayer,lastInputSize,nextLayerSizes):
            if nextLayerSizes:
                newList = list(nextLayerSizes)
                size = newList.pop()
                sigmoidLayer = makeSigmoidLayer(lastLayer,lastInputSize,size)
                daLayer = makeDALayer(lastLayer,lastInputSize,size,sigmoidLayer)
                yield (sigmoidLayer,daLayer)
                for layer in makeLayers(sigmoidLayer.output,size,newList):
                    yield layer
        self.sigmoidLayers,self.dALayers = zip(*makeLayers(self.x,nIn,reversed(hiddenLayerSizes)))
        print "created sda with layer shapes below."
        for da in self.dALayers:
            
            print "layersize:", da.W.get_value().shape
        self.logLayer = LogisticRegression(self.sigmoidLayers[-1].output,hiddenLayerSizes[-1],nOut)
        self.params = [p for l in self.sigmoidLayers for p in l.params] + self.logLayer.params
        self.fineTuneCost = self.logLayer.negativeLogLikelihood(self.y)
        self.errors = self.logLayer.errors(self.y)

    def pretrainingFunctions(self,trainSetX,batchSize):
        index = T.lscalar("index")
        corruptionLevel = T.scalar('corruption')
        learningRate = T.scalar("learning")
        batchBegin = batchSize * index
        batchEnd = batchBegin + batchSize
        for dA in self.dALayers:
            cost,updates = dA.costFunctionAndUpdates(corruptionLevel,learningRate)
            f = theano.function(
                inputs=[
                    index,
                    theano.Param(corruptionLevel,default=0.2),
                    theano.Param(learningRate,default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x:trainSetX[batchBegin:batchEnd]},
            )
            yield f
            
    def pretrainingFunctionsWithOptimizer(self,trainSetX,batchSize,optimizer):
        """
        with optimizer.
        optimizer(params,grads)
        """
        index = T.lscalar("index")
        corruptionLevel = T.scalar('corruption')
        learningRate = T.scalar("learning")
        batchBegin = batchSize * index
        batchEnd = batchBegin + batchSize
        for dA in self.dALayers:
            #cost,updates = dA.costFunctionAndUpdates(corruptionLevel,learningRate)
            cost, param, grads = dA.costParamGrads(corruptionLevel)
            updates = optimizer(param,grads)
            f = theano.function(
                inputs=[
                    index,
                    theano.Param(corruptionLevel,default=0.2),
                ],
                outputs=cost,
                updates=updates,
                givens={self.x:trainSetX[batchBegin:batchEnd]},
            )
            yield f
            
    def fineTuneFunctions(self,datasets,batchSize,learningRate):
        index = T.lscalar('i')
        trainSetX,trainSetY = datasets[0]
        validSetX,validSetY = datasets[1]
        testSetX,testSetY = datasets[2]
        gparams = T.grad(self.fineTuneCost,self.params)
        updates = [
            (param,param-gparam*learningRate)
            for param,gparam in zip(self.params,gparams)
        ]
        def makeGivens(x,y):
            return {self.x:x[index*batchSize:(index+1)*batchSize],
                    self.y:y[index*batchSize:(index+1)*batchSize]}
        trainer = theano.function(
            inputs=[index],
            outputs=self.fineTuneCost,
            updates=updates,
            givens=makeGivens(trainSetX,trainSetY),
            name='train'
        )
        testScoreI=theano.function(
            inputs=[index],
            outputs=self.errors,
            givens=makeGivens(testSetX,testSetY),
            name='test'
        )
        validScoreI=theano.function(
            inputs=[index],
            outputs=self.errors,
            givens=makeGivens(validSetX,validSetY),
            name='valid'
        )

        def validationScore():
            return [validScoreI(i) for i in xrange(validSetX.get_value(borrow=True).shape[0]/batchSize)]

        def testScore():
            return [testScoreI(i) for i in xrange(testSetX.get_value(borrow=True).shape[0]/batchSize)]

        return trainer,validationScore,testScore

    def preTrain(self,
                 data,
                 batchSize=20,
                 preLearningRate=0.1,
                 corruptionLevels=(.1,.2,.3)):
        import numpy,util
        preTrainer = list(self.pretrainingFunctions(data,batchSize=batchSize))
        assert len(corruptionLevels) == len(preTrainer) , "given corruption levels do not correspond to the layers!!!"
        for i,(trainer,corruptionLevel) in enumerate(zip(preTrainer,corruptionLevels)):
            for epoch in xrange(15):
                print 'Pre-training layer %i, epoch %d start' % (i,epoch)
                trainScores = [trainer(batchIndex,corruptionLevel,preLearningRate) for batchIndex in xrange(data.get_value(borrow=True).shape[0]/batchSize)]
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),numpy.mean(trainScores)
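# A minimal optimizer callback for pretrainingFunctionsWithOptimizer above
# (sketch; it assumes costParamGrads returns parallel lists of shared
# parameters and gradient expressions, and that the callback must return an
# updates list usable by theano.function):
def plainSgdOptimizer(params, grads, learningRate=0.1):
    """Plain SGD: one (param, param - learningRate * grad) pair per parameter."""
    return [(param, param - learningRate * grad)
            for param, grad in zip(params, grads)]
# e.g. fns = list(sda.pretrainingFunctionsWithOptimizer(trainSetX, 20, plainSgdOptimizer))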
Example #10
class StackedDenoisingAutoEncoder(object):
    """ Stacked Denoising Auto-Encoder

    A stacked denoising autoencoder is obtained by stacking several
    denoising autoencoders.

    The hidden layer of the denoising autoencoder at layer `i` becomes
    the input of the layer `i+1`.
    """
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """
        This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw
                          initial weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given, one
                           is generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each layer
        """

        self.sigmoid_layers = []
        self.da_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        # the data is presented as rasterized images
        self.x = T.matrix('x')
        # the labels are presented as 1D vector of int labels
        self.y = T.ivector('y')

        # SDA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoder.
        # We will first construct the SDA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer.
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well).
        # During finetuning we will finish training the SDA by doing
        # stochastic gradient descent on the MLP

        for i in range(self.n_layers):
            # construct the sigmoidal layer
            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SDA if you are on the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question ...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDA
            # the visible biases in the DA are parameters of those DA
            # but not the SDA
            self.params.extend(sigmoid_layer.params)

            # construct a denoising autoencoder that shares weights with this
            # layer
            da_layer = DenoisingAutoEncoder(numpy_rng=numpy_rng,
                                            theano_rng=theano_rng,
                                            input=layer_input,
                                            n_visible=input_size,
                                            n_hidden=hidden_layers_sizes[i],
                                            W=sigmoid_layer.W,
                                            bhid=sigmoid_layer.b)
            self.da_layers.append(da_layer)

        # we now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)

        self.params.extend(self.logLayer.params)

        # construct a function that implements one step of finetunining
        # compute the cost for second phase of training
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y.
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_x, batch_size):
        """
        Generates a list of functions, each of them implementing one step
        in training the DA corresponding to the layer with same index.

        The function will require as input the minibatch index, and to train
        a DA you just need to iterate, calling the corresponding function on
        all minibatch indexes.

        :type train_x: theano.tensor.TensorType
        :param train_x: shared variable that contains all datapoints used
                        for training the DA

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the DA layers
        """

        # index to a minibatch
        index = T.lscalar('index')
        # % of corruption to use
        corruption_level = T.scalar('corruption')
        # learning rate to use
        learning_rate = T.scalar('lr')
        # begining of a batch, given `index`
        batch_begin = index * batch_size
        # ending of a batch given `index`
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for da in self.da_layers:
            # get the cost and the updates list
            cost, updates = da.get_cost_updates(corruption_level,
                                                learning_rate)

            # compile the theano function
            fn = theano.function(
                inputs=[
                    index,
                    theano.In(corruption_level, value=0.2),
                    theano.In(learning_rate, value=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: train_x[batch_begin:batch_end]})

            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns
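    # Usage sketch for the functions returned above (hypothetical names: `sda`
    # is an instance of this class, `train_x` a shared training matrix and
    # `n_train_batches` a plain int):
    #
    #     pretrain_fns = sda.pretraining_functions(train_x, batch_size=20)
    #     corruption_levels = [0.1, 0.2, 0.3]        # one level per layer
    #     for i, fn in enumerate(pretrain_fns):
    #         for epoch in range(15):
    #             costs = [fn(index=b, corruption=corruption_levels[i], lr=0.001)
    #                      for b in range(n_train_batches)]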

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        """
        Generates a function `train` that implements one step of finetuning,
        a function `validate` that computes the error on a batch from the
        validation set, and a function `test` that computes the error on a 
        batch from the testing set.

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: it is a list that contains all the datasets;
                         it has to contain three pairs, `train`, `valid`,
                         `test` in this order, where each pair is formed
                         of two theano variables, one for the datapoints,
                         the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch
  
        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        """

        (train_x, train_y) = datasets[0]
        (valid_x, valid_y) = datasets[1]
        (test_x, test_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        # index to a minibatch
        index = T.lscalar('index')

        # compute the gradients with respect to the model parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        givens = {
            self.x: train_x[index * batch_size:(index + 1) * batch_size],
            self.y: train_y[index * batch_size:(index + 1) * batch_size]
        }
        train_fn = theano.function(inputs=[index],
                                   outputs=self.finetune_cost,
                                   updates=updates,
                                   givens=givens,
                                   name='train')

        givens = {
            self.x: test_x[index * batch_size:(index + 1) * batch_size],
            self.y: test_y[index * batch_size:(index + 1) * batch_size]
        }
        test_score_i = theano.function([index],
                                       self.errors,
                                       givens=givens,
                                       name='test')

        givens = {
            self.x: valid_x[index * batch_size:(index + 1) * batch_size],
            self.y: valid_y[index * batch_size:(index + 1) * batch_size]
        }
        valid_score_i = theano.function([index],
                                        self.errors,
                                        givens=givens,
                                        name='valid')

        # create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
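# Finetuning usage sketch (assumptions: `datasets` holds three (data, label)
# shared-variable pairs, numpy has been imported, and `n_train_batches` is
# derived from the training set as above):
#
#     sda = StackedDenoisingAutoEncoder(numpy_rng=numpy.random.RandomState(89677),
#                                       n_ins=28 * 28,
#                                       hidden_layers_sizes=[1000, 1000, 1000],
#                                       n_outs=10)
#     train_fn, valid_score, test_score = sda.build_finetune_functions(
#         datasets, batch_size=1, learning_rate=0.1)
#     for epoch in range(36):
#         for i in range(n_train_batches):
#             train_fn(i)
#         print('validation error: %f %%' % (numpy.mean(valid_score()) * 100.))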
Example #11
def optimize_lenet(learning_rate=0.01,
                   n_epochs=200,
                   dataset='data/mnist.pkl.gz',
                   batch_size=500,
                   n_hidden=500,
                   nkerns=[20, 50],
                   rng=np.random.RandomState(23455)):
    print '... load training set'
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # minibatch index
    index = T.lscalar()

    # symbolic variable for the data
    x = T.matrix('x')
    # symbolic variable for the labels
    y = T.ivector('y')

    print '... building the model'
    # reshape the (batch_size, 28*28) matrix of rasterized images into a 4D tensor
    # so that it is consistent with LeNetConvPoolLayer
    # the added 1 is the number of channels; grayscale images are used here, so it is 1
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # layer0
    # nkerns[0] = 20 filters
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # layer1
    # nkerns[1] = 50 filters
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # layer2_input
    # layer1 outputs a 4D tensor of 50 channels of 4x4-pixel images, which cannot be
    # fed to the multilayer perceptron as is, so flatten it into 4x4x50 = 800-dimensional
    # vectors, i.e. reshape (batch_size, 50, 4, 4) into (batch_size, 800)
    layer2_input = layer1.output.flatten(2)

    # layer2
    # hidden layer with 500 units
    # n_in is the size of the input vectors built as layer2_input
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=n_hidden,
                         activation=T.tanh)

    # layer3
    # logistic regression layer: 500 input units, 10 output classes
    layer3 = LogisticRegression(input=layer2.output, n_in=n_hidden, n_out=10)

    # cost (an ordinary multilayer perceptron needs a regularization term, but a CNN's structure itself already acts as a regularizer)
    cost = layer3.negative_log_likelihood(y)

    # test model
    # computes layer3.errors(y) on the minibatch selected via givens from the input index
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # validation model
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # parameters to differentiate with respect to
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # gradients of the cost with respect to the parameters
    grads = T.grad(cost, params)

    # parameter updates
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    # train model
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # optimize
    print "train model ..."
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    fp1 = open('log/lenet_validation_error.txt', 'w')
    fp2 = open('log/lenet_test_error.txt', 'w')

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                ## pass each validation index to the function that computes the validation error rate and collect the results in a list
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                # average them into a score
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f ' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))
                fp1.write("%d\t%f\n" % (epoch, this_validation_loss * 100))

                if this_validation_loss < best_validation_loss:
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    ## pass each test index to the function that computes the test error rate and collect the results in a list
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]

                    ## average them into a score
                    test_score = np.mean(test_losses)
                    ##
                    print('epoch %i, minibatch %i/%i, test error %f ' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))
                    fp2.write("%d\t%f\n" % (epoch, test_score * 100))
            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print((
        'optimization complete. Best validation score of %f obtained at iteration %i, with test performance %f'
    ) % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('This code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    fp1.close()
    fp2.close()

    import cPickle
    cPickle.dump(layer0, open("model/lenet_layer0.pkl", "wb"))
    cPickle.dump(layer1, open("model/lenet_layer1.pkl", "wb"))
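# Loading sketch for the two layers pickled above (assumption: the
# LeNetConvPoolLayer instances, including their W/b shared variables,
# unpickle cleanly with cPickle):
#
#     layer0 = cPickle.load(open("model/lenet_layer0.pkl", "rb"))
#     layer1 = cPickle.load(open("model/lenet_layer1.pkl", "rb"))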
Example #12
class SRNN(object):
    """Stacking ReLU Neural Network
    """

    def __init__(self, numpy_rng, theano_rng=None, 
            n_ins=N_FEATURES * N_FRAMES,
            relu_layers_sizes=[1024, 1024, 1024],
            n_outs=62 * 3,
            rho=0.90, eps=1.E-6):
        """ TODO WRITEME
        """

        self.relu_layers = []
        self.params = []
        self.n_layers = len(relu_layers_sizes)
        self._rho = rho  # ``momentum'' for adadelta
        self._eps = eps  # epsilon for adadelta
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        self.n_outs = n_outs

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.fmatrix('x')
        self.y = T.ivector('y')
        self.p_y_in = T.fmatrix('p_y')

        input_relu_layer = StackReLU(rng=numpy_rng,
                input=self.x, in_stack=self.p_y_in,
                n_in=n_ins, n_in_stack=n_outs, n_out=relu_layers_sizes[0])
        self.relu_layers.append(input_relu_layer)
        self.params.extend(input_relu_layer.params)
        self._accugrads.extend([shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accugrad_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accugrad_Ws', borrow=True)])
        self._accudeltas.extend([shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'), name='accudelta_b', borrow=True), shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'), name='accudelta_Ws', borrow=True)])

        for i in xrange(1, self.n_layers):
            input_size = relu_layers_sizes[i-1]
            layer_input = self.relu_layers[-1].output

            relu_layer = ReLU(rng=numpy_rng,
                    input=layer_input,
                    n_in=input_size,
                    n_out=relu_layers_sizes[i])

            self.relu_layers.append(relu_layer)

            self.params.extend(relu_layer.params)
            self._accugrads.extend([shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)])
            self._accudeltas.extend([shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)])


        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.relu_layers[-1].output,
            n_in=relu_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self._accugrads.extend([shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accugrad_b', borrow=True)])
        self._accudeltas.extend([shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accudelta_b', borrow=True)])

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(self.y)
        self.p_y_out = self.logLayer.p_y_given_x

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        FIXME TODO
        """
        # TODO
        return -1

    def get_stacked_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and
        self._eps params, that works on stacks, that is first a classification
        step to get the output probabilities, and then a step taking these
        outputs into account.
        """
        batch_x = T.fmatrix('batch_x')
        batch_p_y = T.fmatrix('batch_p_y')
        batch_y = T.ivector('batch_y')

        first_pass = theano.function(inputs=[theano.Param(batch_x),
            theano.Param(batch_p_y)],
            outputs=self.p_y_out,
            givens={self.x: batch_x,
                    self.p_y_in: batch_p_y})

        cost = self.finetune_cost_sum
        gparams = T.grad(cost, self.params)
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_p_y), theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x,
                self.p_y_in: batch_p_y,
                self.y: batch_y})

        return first_pass, train_fn
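    # Usage sketch for the two functions returned above (hypothetical names:
    # `srnn` is an instance of this class, `batch_x`/`batch_y` float32/int32
    # numpy arrays, and numpy is imported):
    #
    #     first_pass, train_fn = srnn.get_stacked_adadelta_trainer()
    #     p_y = numpy.ones((batch_x.shape[0], srnn.n_outs),
    #                      dtype='float32') / srnn.n_outs   # uniform prior
    #     p_y = first_pass(batch_x, p_y)                    # classification pass
    #     cost = train_fn(batch_x, p_y, batch_y)            # training step on the stack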


    def get_bptt_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad
            
        #p_y_given_x_init = shared(numpy.asarray(numpy.random.uniform((1, self.n_outs)), dtype='float32'))
        p_y_init = T.zeros((1, self.n_outs)) + 1./self.n_outs

        def one_step(x_t, p_y):
            self.x = x_t
            self.p_y_in = p_y
            return [x_t, self.p_y_out]
        
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        #batch_p_y = T.fmatrix('batch_p_y')
        [x, p_y_out], _ = theano.scan(lambda x_t, p_y_g_x_m1, *_: one_step(x_t, p_y_g_x_m1),
                sequences=batch_x[:-1],
                outputs_info=[None, p_y_init],)
                

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={self.y: batch_y,
                self.p_y_in: T.concatenate([p_y_init, 
                    p_y_out], axis=0),
                self.x: batch_x
                })

        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        FIXME TODO
        """
        # TODO
        return -1

    def score_stacked_classif(self, given_set):
        """ Returns functions to get current stacked-based (not RNN)
        classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_p_y = T.fmatrix('batch_p_y')
        batch_y = T.ivector('batch_y')
        p_y_init = T.zeros((batch_x.shape[0], self.n_outs)) + 1./self.n_outs  # TODO try = 0

        first_pass = theano.function(inputs=[theano.Param(batch_x)],
            outputs=self.p_y_out,
            givens={self.x: batch_x,
                self.p_y_in: p_y_init})

        score = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_p_y), theano.Param(batch_y)],
                outputs=self.errors,
                givens={self.x: batch_x, 
                    self.p_y_in: batch_p_y,
                    self.y: batch_y})

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, first_pass(batch_x), batch_y) for batch_x, batch_y in given_set]

        return scoref

    def score_rnn_classif(self, given_set):
        """ Returns functions to get current RNN classification scores. """
        # TODO
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)],
                outputs=self.errors,
                givens={self.x: batch_x, self.y: batch_y})

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref

    def score_rnn_PER(self, given_set):
        """ Returns functions to get reccurrent PER. 
        FIXME TODO"""
        # TODO 
        return -1
Example #13
class SdA(object):
    """ stacked autoencoder """
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        # define the layer lists we need
        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        # number of hidden layers
        self.n_layers = len(hidden_layers_sizes)

        # there must be at least one hidden layer
        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # image data
        self.x = T.matrix('x')

        # ground-truth labels as an int vector
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            if i == 0:
                # the first hidden layer takes as many inputs as there are input units
                input_size = n_ins
            else:
                # later hidden layers take as many inputs as the previous hidden layer has units
                input_size = hidden_layers_sizes[i - 1]
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            # hidden layer
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add it to the list of hidden layers
            self.sigmoid_layers.append(sigmoid_layer)

            # W and b of the hidden layer
            self.params.extend(sigmoid_layer.params)

            # AutoEncoder
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)

            # add it to the list of autoencoders
            self.dA_layers.append(dA_layer)

        # take the last sigmoid layer's output as input; the last entry of hidden_layers_sizes is the number of input units, and n_outs is the number of output units
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)

        # W and b of the LogisticRegression layer
        self.params.extend(self.logLayer.params)

        # no regularization term is needed
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # use the errors of the LogisticRegression layer
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        """ 各レイヤーのAutoEncoderによる学習 """
        # minibatchのindex
        index = T.lscalar('index')

        # ノイズ率
        corruption_level = T.scalar('corruption')
        # 学習率
        learning_rate = T.scalar('lr')

        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        # list of pretraining functions
        pretrain_functions = []
        for dA in self.dA_layers:
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)

            fn = theano.function(
                inputs=[
                    index,
                    theano.Param(corruption_level,
                                 default=0.2),  # with Param, values can be passed by the Tensor's name
                    theano.Param(learning_rate, default=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            # append each layer's function that computes the autoencoder cost and applies the parameter updates
            pretrain_functions.append(fn)
        return pretrain_functions
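    # Usage note (sketch): thanks to theano.Param, the returned functions also
    # accept the Tensors' names as keyword arguments, e.g. fn(0, corruption=0.3, lr=0.05)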

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        """ ネットワーク全体でfinetuning """
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]

        # number of minibatches for validation and testing
        n_valid_batches = valid_set_x.get_value(
            borrow=True).shape[0] / batch_size
        n_test_batches = test_set_x.get_value(
            borrow=True).shape[0] / batch_size

        index = T.lscalar('index')

        # gradients of the finetuning cost with respect to the parameters
        gparams = T.grad(self.finetune_cost, self.params)

        # updates for the network parameters
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        train_model = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='train')

        # function computing the test error score for minibatch index i
        test_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x:
                test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='test')

        # function computing the validation error score for minibatch index i
        valid_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='validate')

        def valid_score():
            return [valid_score_i(i) for i in xrange(n_valid_batches)]

        def test_score():
            return [test_score_i(i) for i in xrange(n_test_batches)]

        return train_model, valid_score, test_score
Example #14
def main():
    rng = np.random.RandomState(23455)
    datasets = load_data()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = 500
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    nkerns = [20, 50]

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels

    layer0_input = x.reshape((batch_size, 1, 28, 28))

    layer0 = LeNetConvPoolLayer(rng, layer0_input,
            filter_shape=(nkerns[0], 1, 5, 5),
            image_shape=(batch_size, 1, 28, 28), poolsize=(2, 2))

    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            filter_shape=(nkerns[1], nkerns[0], 5, 5),
            image_shape=(batch_size, nkerns[0], 12, 12), poolsize=(2, 2))

    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(rng, layer2_input, n_in=nkerns[1] * 4 * 4,
            n_out=500)

    layer3 = LogisticRegression(layer2.output, n_in=500, n_out=10)
    cost = layer3.negative_log_likelihood(y)

    test_model = theano.function([index], layer3.errors(y),
            givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]
            })
    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size]
            })
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    learning_rate = 0.1

    updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i
            in zip(params, grads)]
    train_model = theano.function([index], cost, updates=updates,
            givens={
                x: train_set_x[index * batch_size: (index + 1) * batch_size],
                y: train_set_y[index * batch_size: (index + 1) * batch_size]
            })

    print "Start training..."
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    n_epochs = 200
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_score = 0.

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)  # NOQA

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break
Example #15
def evaluate_model(learning_rate=0.001,
                   n_epochs=100,
                   nkerns=[16, 40, 50, 60],
                   batch_size=20):
    """ 
    Network for classification of MNIST database

    :type learning_rate: float
    :param learning_rate: this is the initial learning rate used
                            (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: the batch size for training
    """

    print("Evaluating model")

    rng = numpy.random.RandomState(23455)

    # loading the data
    datasets = load_test_data()

    valid_set_x, valid_set_y = datasets[0]
    test_set_x, test_set_y = datasets[1]

    # compute number of minibatches for training, validation and testing
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    loaded_params = numpy.load('../saved_models/model.npy')
    layer4_W, layer4_b, layer3_W, layer3_b, layer2_W, layer2_b, layer1_W, layer1_b, layer0_W, layer0_b = loaded_params

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('Building the model...')

    chosen_height = 64
    chosen_width = 64

    # Reshape matrix of rasterized images of shape (batch_size, 3 * 64 * 64)
    # to a 4D tensor, compatible with our MyConvPoolLayer
    # (64, 64) is the size of the input images used here.
    layer0_input = x.reshape((batch_size, 3, chosen_height, chosen_width))

    # Construct the first convolutional pooling layer:
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (64/2, 64/2) = (32, 32)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 32, 32)
    layer0 = MyConvPoolLayer(rng,
                             input=layer0_input,
                             image_shape=(batch_size, 3, chosen_height,
                                          chosen_width),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[0], 3, 5, 5),
                             poolsize=(2, 2))

    # Construct the second convolutional pooling layer:
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (32/2, 32/2) = (16, 16)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 16, 16)
    layer1 = MyConvPoolLayer(rng,
                             input=layer0.output,
                             image_shape=(batch_size, nkerns[0],
                                          chosen_height / 2, chosen_width / 2),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[1], nkerns[0], 5, 5),
                             poolsize=(2, 2))

    # Construct the third convolutional pooling layer
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (16/2, 16/2) = (8, 8)
    # 4D output tensor is thus of shape (batch_size, nkerns[2], 8, 8)
    layer2 = MyConvPoolLayer(rng,
                             input=layer1.output,
                             image_shape=(batch_size, nkerns[1],
                                          chosen_height / 4, chosen_width / 4),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[2], nkerns[1], 5, 5),
                             poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[2] * 8 * 8),
    # or (20, 50 * 8 * 8) = (20, 3200) with the default values.
    layer3_input = layer2.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer3 = HiddenLayer(rng,
                         input=layer3_input,
                         n_in=nkerns[2] * (chosen_height / 8) *
                         (chosen_width / 8),
                         n_out=800,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer4 = LogisticRegression(input=layer3.output, n_in=800, n_out=6)

    cost = layer4.negative_log_likelihood(y)

    predicted_output = layer4.y_pred

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    #Loading the model
    # f = file('../saved_models/model317.save.npy', 'r')
    # params = cPickle.load(f)
    # print(params)
    # f.close()
    # # layer4.params, layer3.params, layer2.params, layer1.params, layer0.params = params
    # # layer4.W, layer4.b = layer4.params
    # # layer3.W, layer3.b = layer3.params
    # # layer2.W, layer2.b = layer2.params
    # # layer1.W, layer1.b = layer1.params
    # # layer0.W, layer0.b = layer0.params
    # layer4.W, layer4.b, layer3.W, layer3.b, layer2.W, layer2.b, layer1.W, layer1.b, layer0.W, layer0.b = params
    # layer4.params = [layer4.W, layer4.b]
    # layer3.params = [layer3.W, layer3.b]
    # layer2.params = [layer2.W, layer2.b]
    # layer1.params = [layer1.W, layer1.b]
    # layer0.params = [layer0.W, layer0.b]

    # x = cPickle.load(f)
    # layer4.params = [layer4.W, layer4.b]
    # layer3.params = [layer3.W, layer3.b]
    # layer2.params = [layer2.W, layer2.b]
    # layer1.params = [layer1.W, layer1.b]
    # layer0.params = [layer0.W, layer0.b]
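    # A minimal restoration sketch (assumptions: loaded_params holds plain
    # numpy arrays in the order unpacked above, and each layer exposes its
    # W/b as Theano shared variables):
    #
    #     for layer, W_val, b_val in [(layer4, layer4_W, layer4_b),
    #                                 (layer3, layer3_W, layer3_b),
    #                                 (layer2, layer2_W, layer2_b),
    #                                 (layer1, layer1_W, layer1_b),
    #                                 (layer0, layer0_W, layer0_b)]:
    #         layer.W.set_value(W_val)
    #         layer.b.set_value(b_val)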

    # test it on the test set
    test_losses = [test_model(i) for i in range(n_test_batches)]
    validation_losses = [validate_model(i) for i in range(n_valid_batches)]

    test_score = numpy.mean(test_losses)
    validation_score = numpy.mean(validation_losses)
    print((' Validation error is %f %%') % (validation_score * 100.))
    print((' Test error is %f %%') % (test_score * 100.))
def stochastic_gradient_descent_mnist(
        learning_rate=0.13,
        n_epochs=1000,
        path='/home/tao/Projects/machine-learning/data/mnist.pkl.gz',
        batch_size=600):

    datasets = load_data(path)

    train_set_data, train_set_label = datasets[0]
    validation_set_data, validation_set_label = datasets[1]
    test_set_data, test_set_label = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_data.get_value(
        borrow=True).shape[0] // batch_size
    n_valid_batches = validation_set_data.get_value(
        borrow=True).shape[0] // batch_size
    n_test_batches = test_set_data.get_value(
        borrow=True).shape[0] // batch_size

    print('... building the model')

    index = T.lscalar()  # index to a [mini]batch

    data = T.matrix('x')  # data, presented as rasterized images
    label = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    classifier = LogisticRegression(input=data,
                                    input_dim=28 * 28,
                                    output_dim=10)

    objective_function = classifier.negative_log_likelihood(label)

    # testing model
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(label),
        givens={
            data: test_set_data[index * batch_size:(index + 1) * batch_size],
            label: test_set_label[index * batch_size:(index + 1) * batch_size]
        })
    # validation model
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(label),
        givens={
            data:
            validation_set_data[index * batch_size:(index + 1) * batch_size],
            label:
            validation_set_label[index * batch_size:(index + 1) * batch_size]
        })

    # gradients
    g_W = T.grad(cost=objective_function, wrt=classifier.W)
    g_b = T.grad(cost=objective_function, wrt=classifier.b)

    # update rule
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]
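    # i.e. one plain gradient-descent step per minibatch:
    #   W <- W - learning_rate * dNLL/dW,   b <- b - learning_rate * dNLL/db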

    # training model
    train_model = theano.function(
        inputs=[index],
        outputs=objective_function,
        updates=updates,
        givens={
            data: train_set_data[index * batch_size:(index + 1) * batch_size],
            label: train_set_label[index * batch_size:(index + 1) * batch_size]
        })

    print('... training the model')
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)
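    # Illustrative walkthrough of the early-stopping settings above (numbers
    # assume the standard mnist.pkl.gz split of 50,000 training examples and
    # the default batch_size=600, so n_train_batches = 50000 // 600 = 83):
    # validation_frequency = min(83, 5000 // 2) = 83, i.e. we validate once per
    # epoch.  Each time the validation loss drops by more than 0.5 %
    # (improvement_threshold = 0.995), patience is pushed out to
    # max(patience, iter * patience_increase); once `iter` reaches `patience`
    # without such an improvement, the loop below stops early.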

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]  # syntactic sugar
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)

                    print((
                        '     epoch %i, minibatch %i/%i, test error of best model %f %%'
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100.))

                    with open('best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        'Optimization complete with best validation score of %f %%, with test performance %f %%'
        % (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' %
         ((end_time - start_time))),
        file=sys.stderr)
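# A minimal usage sketch (not part of the original example): once
# stochastic_gradient_descent_mnist() has pickled the trained `classifier`
# to 'best_model.pkl', it can be reloaded and applied to held-out images.
# The helper name `predict_with_best_model` is made up here, and it assumes
# the LogisticRegression instance keeps references to its symbolic `input`
# and to `y_pred`, as the logistic-regression layers elsewhere in this file do.
def predict_with_best_model(path='/home/tao/Projects/machine-learning/data/mnist.pkl.gz',
                            n_examples=10):
    with open('best_model.pkl', 'rb') as f:
        classifier = pickle.load(f)
    # compile a predictor that maps raw images to arg-max class labels
    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)
    datasets = load_data(path)
    test_set_data, test_set_label = datasets[2]
    predicted = predict_model(test_set_data.get_value()[:n_examples])
    print('Predicted labels for the first %i test examples:' % n_examples)
    print(predicted)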
class CNN(object):
    '''
        Convolutional Neural Network with 2 convolutional pooling layers.
        The default parameters are for the MNIST dataset.
        NOTE: the dataset is required to be 28x28 images split into three
        sub-datasets (train, validation, test).
    '''
    def __init__(self,
                 datasets,
                 batch_size=500,
                 nkerns=[20, 50],
                 img_size=(28, 28),
                 learning_rate=0.1):

        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]

        self.batch_size = batch_size
        # compute number of minibatches for training, validation and testing
        self.n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        self.n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        self.n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        self.n_train_batches /= batch_size
        self.n_valid_batches /= batch_size
        self.n_test_batches /= batch_size

        # allocate symbolic variables for the data
        self.index = T.lscalar()  # index to a [mini]batch
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        rng = np.random.RandomState(23455)

        layer0_input = self.x.reshape(
            (batch_size, 1, img_size[0], img_size[1]))

        # Create the two convolutional layers that also perform downsampling using maxpooling
        self.layer0 = ConvPoolLayer(rng,
                                    input=layer0_input,
                                    image_shape=(batch_size, 1, img_size[0],
                                                 img_size[1]),
                                    filter_shape=(nkerns[0], 1, 5, 5),
                                    poolsize=(2, 2))

        self.layer1 = ConvPoolLayer(rng,
                                    input=self.layer0.output,
                                    image_shape=(batch_size, nkerns[0], 12,
                                                 12),
                                    filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                    poolsize=(2, 2))

        layer2_input = self.layer1.output.flatten(2)

        # Create the hidden layer of the MLP
        self.layer2 = HiddenLayer(rng,
                                  input=layer2_input,
                                  n_in=nkerns[1] * 4 * 4,
                                  n_out=500,
                                  activation=T.tanh)

        # Create the logistic regression layer for classifying the results
        self.layer3 = LogisticRegression(input=self.layer2.output,
                                         n_in=500,
                                         n_out=10)

        self.cost = self.layer3.negative_log_likelihood(self.y)

        self.params = self.layer3.params + self.layer2.params + self.layer1.params + self.layer0.params

        self.grads = T.grad(self.cost, self.params)

        # Update list for the parameters to be used when training the model
        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(self.params, self.grads)]

        # This function updates the model parameters using Stochastic Gradient
        # Descent; training minibatches are drawn from the training set
        self.train_model = th.function(
            [self.index],
            self.cost,  # the negative log-likelihood of the logistic regression layer
            updates=updates,
            givens={
                self.x:
                train_set_x[self.index * batch_size:(self.index + 1) *
                            batch_size],
                self.y:
                train_set_y[self.index * batch_size:(self.index + 1) *
                            batch_size]
            })

        # These are Theano functions for testing performance on our test and validation datasets
        self.test_model = th.function(
            [self.index],
            self.layer3.errors(self.y),
            givens={
                self.x:
                test_set_x[self.index * batch_size:(self.index + 1) *
                           batch_size],
                self.y:
                test_set_y[self.index * batch_size:(self.index + 1) *
                           batch_size]
            })

        self.validate_model = th.function(
            [self.index],
            self.layer3.errors(self.y),
            givens={
                self.x:
                valid_set_x[self.index * batch_size:(self.index + 1) *
                            batch_size],
                self.y:
                valid_set_y[self.index * batch_size:(self.index + 1) *
                            batch_size]
            })

    def train(self,
              n_epochs,
              patience=10000,
              patience_increase=2,
              improvement_threshold=0.995):
        ''' Train the CNN on the training data for a defined number of epochs '''
        # Setup the variables for training the model
        n_train_batches = self.n_train_batches
        n_valid_batches = self.n_valid_batches
        n_test_batches = self.n_test_batches
        validation_frequency = min(n_train_batches, patience / 2)
        best_validation_loss = np.inf
        best_iter = 0
        best_score = 0.
        epoch = 0
        done_looping = False
        # Train the CNN for a defined number of epochs
        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in xrange(n_train_batches):
                iter = (epoch - 1) * n_train_batches + minibatch_index
                # Report progress every 100 iterations
                if iter % 100 == 0:
                    print 'Training iteration ', iter
                cost_ij = self.train_model(minibatch_index)

                if (iter + 1) % validation_frequency == 0:
                    # Compute zero-one loss on validation set
                    validation_losses = [
                        self.validate_model(i) for i in xrange(n_valid_batches)
                    ]
                    this_validation_loss = np.mean(validation_losses)
                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                    # Check if current validation loss is best so far
                    if this_validation_loss < best_validation_loss:
                        # Improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        # Save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        best_iter = iter
                if patience <= iter:
                    done_looping = True
                    break
        print 'Optimization complete.'
        print('Best validation score of %f %% obtained at iteration %i' %
              (best_validation_loss * 100., best_iter + 1))

    def test(self, set_x, set_y):
        ''' Test data sets and return the test score '''
        # allocate symbolic variables for the data
        n_test_batches = set_x.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size
        test_model = th.function(
            inputs=[self.index],
            outputs=self.layer3.errors(self.y),
            givens={
                self.x:
                set_x[self.index * self.batch_size:(self.index + 1) *
                      self.batch_size],
                self.y:
                set_y[self.index * self.batch_size:(self.index + 1) *
                      self.batch_size]
            })
        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        test_score = np.mean(test_losses)
        return test_score

    def classify(self, set):
        ''' 
           Return the labels for the given set
           NOTE: The batch size must be the same as the training set  
        '''
        n_test_batches = set.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size
        classify_data = th.function(
            inputs=[self.index
                    ],  # Input to this function is a mini-batch at index
            outputs=self.layer3.y_pred,  # Output the y_predictions
            givens={
                self.x:
                set[self.index * self.batch_size:(self.index + 1) *
                    self.batch_size]
            })
        # Generate labels for the given data
        labels = [classify_data(i) for i in xrange(n_test_batches)]
        return np.array(labels)
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES,
                 hidden_layers_sizes=[1024, 1024], n_outs=62 * 3,
                 rho=0.90, eps=1.E-6):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers sizes, must contain
                                    at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        #self._rho = shared(numpy.cast['float32'](rho), name='rho')  # for adadelta
        #self._eps = shared(numpy.cast['float32'](eps), name='eps')  # for adadelta
        self._rho = rho
        self._eps = eps
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.fmatrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)
            self._accugrads.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO
            self._accudeltas.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO

            # Construct an RBM that shares weights with this layer
            if i == 0:
                rbm_layer = GRBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            else:
                rbm_layer = RBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self._accugrads.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO
        self._accudeltas.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_outs, ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, k):
        batch_x = T.fmatrix('batch_x')
        learning_rate = T.scalar('lr')  # learning rate to use

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persisent=None) for training each RBM.
            # TODO: change cost function to reconstruction error
            #markov_chain = shared(numpy.empty((batch_size, rbm.n_hidden), dtype='float32'), borrow=True)
            markov_chain = None
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=markov_chain, k=k)

            # compile the theano function
            fn = theano.function(inputs=[batch_x,
                            theano.Param(learning_rate, default=0.1)],
                                 outputs=cost,
                                 updates=updates,
                                 givens={self.x: batch_x})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate 

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
            # Adagrad accumulation (Duchi et al. 2010)
            agrad = accugrad + gparam * gparam
            dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_SAG_trainer(self):
        """ Returns a Stochastic Averaged Gradient (Bach & Moulines 2011) trainer.

        This is based on Bach 2013 slides:
        PR_avg(theta_n) = Polyak-Ruppert averaging = (1 + n)^{-1} * \sum_{k=0}^n theta_k
        theta_n = theta_{n-1} - gamma * [ f'_n(PR_avg(theta_{n-1}))
                  + f''_n(PR_avg(theta_{n-1})) * (theta_{n-1} - PR_avg(theta_{n-1})) ]

        This returns two trainers: one for the first epoch, one for subsequent epochs.
        We use self._accudeltas to store the Polyak-Ruppert averages,
        and self._accugrads for the number of iterations (updates).
        """
        print "UNFINISHED, see TODO in get_SAG_trainer()"
        sys.exit(-1)

        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum

        # First trainer:
        gparams = T.grad(cost, self.params)
        updates = OrderedDict()
        for accudelta, accugrad, param, gparam in zip(self._accudeltas, self._accugrads, self.params, gparams):
            theta = param - gparam * learning_rate 
            updates[accudelta] = (theta + accudelta * accugrad) / (accugrad + 1.)
            updates[param] = theta
            updates[accugrad] = accugrad + 1.

        train_fn_init = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        # Second trainer:
        gparams = T.grad(cost, self._accudeltas)  # TODO recreate the network with 
        # (TODO) self._accudeltas instead of self.params so that we can compute the cost
        hparams = T.grad(cost, gparams)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accudelta, accugrad, param, gparam, hparam in zip(self._accudeltas, self._accugrads, self.params, gparams, hparams):
            theta = param - learning_rate * (gparam + hparam * (param - accudelta))
            updates[accudelta] = (theta + accudelta * accugrad) / (accugrad + 1.)
            updates[param] = theta
            updates[accugrad] = accugrad + 1.

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn_init, train_fn

    def get_SGD_ld_trainer(self):
        """ Returns an SGD-ld trainer (Schaul et al. 2012).
        """
        print "UNFINISHED, see TODO in get_SGD_ld_trainer()"
        sys.exit(-1)

        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)
        # INIT TODO

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, accuhess, param, gparam in zip(self._accugrads, self._accudeltas, self._accuhess, self.params, gparams):
            pass  # TODO
            # TODO 
            # TODO 

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)],
                outputs=self.errors,
                givens={self.x: batch_x, self.y: batch_y})

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref
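
# Illustrative usage of DBN.score_classif (not part of the original example):
# `given_set` is assumed to be an iterable of (batch_x, batch_y) numpy arrays;
# averaging the per-batch error scores then gives the validation error, e.g.
#   scorer = dbn.score_classif(validation_iterator)
#   error_rate = numpy.mean(scorer())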
class CNN(object):
    '''
        Convolutional Neural Network with 2 convolutional pooling layers.
        The default parameters are for the MNIST dataset.
        NOTE: the dataset is required to be 28x28 images split into three
        sub-datasets (train, validation, test).
    '''
    def __init__(self, datasets, batch_size=500, nkerns=[20, 50], img_size=(28, 28), learning_rate=0.1):
        
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        
        self.batch_size = batch_size
        # compute number of minibatches for training, validation and testing
        self.n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        self.n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        self.n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        self.n_train_batches /= batch_size
        self.n_valid_batches /= batch_size
        self.n_test_batches /= batch_size

        # allocate symbolic variables for the data
        self.index = T.lscalar()  # index to a [mini]batch
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        rng = np.random.RandomState(23455)
        
        layer0_input = self.x.reshape((batch_size, 1, img_size[0], img_size[1]))
        
        # Create the two convolutional layers that also perform downsampling using maxpooling
        self.layer0 = ConvPoolLayer(rng,
                                    input=layer0_input,
                                    image_shape=(batch_size, 1, img_size[0], img_size[1]),
                                    filter_shape=(nkerns[0], 1, 5, 5), 
                                    poolsize=(2,2))

        self.layer1 = ConvPoolLayer(rng,
                                    input=self.layer0.output,
                                    image_shape=(batch_size, nkerns[0], 12, 12),
                                    filter_shape=(nkerns[1], nkerns[0], 5, 5), 
                                    poolsize=(2,2))

        layer2_input = self.layer1.output.flatten(2)
       
        # Create the hidden layer of the MLP
        self.layer2 = HiddenLayer(rng,
                                  input=layer2_input,
                                  n_in=nkerns[1] * 4 * 4,
                                  n_out=500,
                                  activation=T.tanh)

        # Create the logistic regression layer for classifying the results
        self.layer3 = LogisticRegression(input=self.layer2.output, n_in=500, n_out=10)

        self.cost = self.layer3.negative_log_likelihood(self.y)

        self.params = self.layer3.params + self.layer2.params + self.layer1.params + self.layer0.params

        self.grads = T.grad(self.cost, self.params)

        # Update list for the parameters to be used when training the model
        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(self.params, self.grads)]

        # This function updates the model parameters using Stochastic Gradient Descent
        self.train_model = th.function([self.index],
                                       self.cost, # This is the negative-log-likelihood of the Logistic Regression layer
                                       updates=updates,
                                       givens={self.x: train_set_x[self.index * batch_size: (self.index + 1) * batch_size],
                                               self.y: train_set_y[self.index * batch_size: (self.index + 1) * batch_size]})
                                     
        # These are Theano functions for testing performance on our test and validation datasets
        self.test_model = th.function([self.index],
                                      self.layer3.errors(self.y),
                                      givens={self.x: test_set_x[self.index * batch_size: (self.index + 1) * batch_size],
                                              self.y: test_set_y[self.index * batch_size: (self.index + 1) * batch_size]})

        self.validate_model = th.function([self.index],
                                          self.layer3.errors(self.y),
                                          givens={self.x: valid_set_x[self.index * batch_size: (self.index + 1) * batch_size],
                                                  self.y: valid_set_y[self.index * batch_size: (self.index + 1) * batch_size]})

    def train(self, n_epochs, patience=10000, patience_increase=2, improvement_threshold=0.995):
        ''' Train the CNN on the training data for a defined number of epochs '''
        # Setup the variables for training the model
        n_train_batches = self.n_train_batches
        n_valid_batches = self.n_valid_batches
        n_test_batches = self.n_test_batches
        validation_frequency = min(n_train_batches, patience / 2)
        best_validation_loss = np.inf
        best_iter = 0
        best_score = 0.
        epoch = 0
        done_looping = False
        # Train the CNN for a defined number of epochs
        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in xrange(n_train_batches):
                iter = (epoch - 1) * n_train_batches + minibatch_index
                # Report progress every 100 iterations
                if iter % 100 == 0:
                    print 'Training iteration ', iter
                cost_ij = self.train_model(minibatch_index)

                if (iter + 1) % validation_frequency == 0:
                    # Compute zero-one loss on validation set
                    validation_losses = [self.validate_model(i) for i
                                         in xrange(n_valid_batches)]
                    this_validation_loss = np.mean(validation_losses)
                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))

                    # Check if current validation loss is best so far
                    if this_validation_loss < best_validation_loss:
                        # Improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss * improvement_threshold:
                            patience = max(patience, iter * patience_increase)
                        # Save best validation score and iteration number
                        best_validation_loss = this_validation_loss
                        best_iter = iter
                if patience <= iter:
                    done_looping = True
                    break
        print 'Optimization complete.'
        print('Best validation score of %f %% obtained at iteration %i' %
              (best_validation_loss * 100., best_iter + 1))

    def test(self, set_x, set_y):
        ''' Test data sets and return the test score '''
        # allocate symbolic variables for the data
        n_test_batches = set_x.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size
        test_model = th.function(inputs=[self.index],
                                 outputs=self.layer3.errors(self.y),
                                 givens={self.x: set_x[self.index * self.batch_size: (self.index + 1) * self.batch_size],
                                         self.y: set_y[self.index * self.batch_size: (self.index + 1) * self.batch_size]})
        test_losses = [test_model(i)
                       for i in xrange(n_test_batches)]
        test_score = np.mean(test_losses)
        return test_score

    def classify(self, set):
        ''' 
           Return the labels for the given set
           NOTE: The batch size must be the same as the training set  
        '''
        n_test_batches = set.get_value(borrow=True).shape[0]
        n_test_batches /= self.batch_size
        classify_data = th.function(inputs=[self.index], # Input to this function is a mini-batch at index
                                    outputs=self.layer3.y_pred, # Output the y_predictions
                                    givens={self.x: set[self.index * self.batch_size: (self.index + 1) * self.batch_size]})
        # Generate labels for the given data
        labels = [classify_data(i)
                  for i in xrange(n_test_batches)]
        return np.array(labels)
Beispiel #20
0
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """

    def __init__(self, numpy_rng, theano_rng=None, n_ins=N_FEATURES * N_FRAMES,
                 hidden_layers_sizes=[1024, 1024], n_phn=62 * 3, n_spkr=1,
                 rho=0.90, eps=1.E-6):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers sizes, must contain
                                    at least one value

        :type n_phn: int
        :param n_phn: number of phoneme classes output by the network

        :type n_spkr: int
        :param n_spkr: number of speaker classes output by the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        #self._rho = shared(numpy.cast['float32'](rho), name='rho')  # for adadelta
        #self._eps = shared(numpy.cast['float32'](eps), name='eps')  # for adadelta
        self._rho = rho
        self._eps = eps
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        # allocate symbolic variables for the data
        self.x = T.fmatrix('x')  # the data is presented as rasterized images
        self.y_phn = T.ivector('y_phn')  # the labels are presented as 1D vector
                                 # of [int] labels
        self.y_spkr = T.ivector('y_spkr')  # the labels are presented as 1D vector
                                 # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)
            self._accugrads.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO
            self._accudeltas.extend([shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((hidden_layers_sizes[i], ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO

            # Construct an RBM that shares weights with this layer
            if i == 0:
                rbm_layer = GRBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            else:
                rbm_layer = RBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayerPhn = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_phn)
        self.params.extend(self.logLayerPhn.params)
        self._accugrads.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_phn), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_phn, ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO
        self._accudeltas.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_phn), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_phn, ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO
        self.logLayerSpkr = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_spkr)
        self.params.extend(self.logLayerSpkr.params)
        self._accugrads.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_spkr), dtype='float32'), name='accugrad_W', borrow=True), shared(value=numpy.zeros((n_spkr, ), dtype='float32'), name='accugrad_b', borrow=True)]) # TODO
        self._accudeltas.extend([shared(value=numpy.zeros((hidden_layers_sizes[-1], n_spkr), dtype='float32'), name='accudelta_W', borrow=True), shared(value=numpy.zeros((n_spkr, ), dtype='float32'), name='accudelta_b', borrow=True)]) # TODO

        self.finetune_cost_sum_phn = self.logLayerPhn.negative_log_likelihood_sum(self.y_phn)
        self.finetune_cost_sum_spkr = self.logLayerSpkr.negative_log_likelihood_sum(self.y_spkr)
        self.finetune_cost_phn = self.logLayerPhn.negative_log_likelihood(self.y_phn)
        self.finetune_cost_spkr = self.logLayerSpkr.negative_log_likelihood(self.y_spkr)

        self.errors_phn = self.logLayerPhn.errors(self.y_phn)
        self.errors_spkr = self.logLayerSpkr.errors(self.y_spkr)

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y_phn = T.ivector('batch_y_phn')
        learning_rate = T.fscalar('lr')  # learning rate to use
        # this class has no single finetune cost; train the phoneme head
        # (shared layers + logLayerPhn) with plain SGD
        cost = self.finetune_cost_sum_phn
        # compute the gradients with respect to the relevant model parameters
        gparams = T.grad(cost, self.params[:-2])

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for param, gparam in zip(self.params[:-2], gparams):
            updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs=[theano.Param(batch_x),
            theano.Param(batch_y_phn),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y_phn: batch_y_phn})

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y_phn = T.ivector('batch_y_phn')
        batch_y_spkr = T.ivector('batch_y_spkr')
        cost_phn = self.finetune_cost_sum_phn
        cost_spkr = self.finetune_cost_sum_spkr
        # compute the gradients with respect to the model parameters
        gparams_phn = T.grad(cost_phn, self.params[:-2])
        gparams_spkr = T.grad(cost_spkr, self.params[:-4] + self.params[-2:])
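        # self.params is laid out as [hidden layers..., logLayerPhn.W, logLayerPhn.b,
        # logLayerSpkr.W, logLayerSpkr.b], so params[:-2] is the shared stack plus the
        # phoneme head, while params[:-4] + params[-2:] is the shared stack plus the
        # speaker head; each cost only updates its own head plus the shared layers.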

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads[:-2],
                self._accudeltas[:-2], self.params[:-2], gparams_phn):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad
        for accugrad, accudelta, param, gparam in zip(self._accugrads[:-4] + self._accugrads[-2:], self._accudeltas[:-4] + self._accudeltas[-2:], self.params[:-4] + self.params[-2:], gparams_spkr):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y_phn),
            theano.Param(batch_y_spkr)],
            outputs=(cost_phn, cost_spkr),
            updates=updates,
            givens={self.x: batch_x, self.y_phn: batch_y_phn, self.y_spkr: batch_y_spkr})

        return train_fn

    def get_adadelta_trainers(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y_phn = T.ivector('batch_y_phn')
        batch_y_spkr = T.ivector('batch_y_spkr')
        #cost_phn = self.finetune_cost_sum_phn
        cost_phn = self.finetune_cost_phn
        #cost_spkr = self.finetune_cost_sum_spkr
        cost_spkr = self.finetune_cost_spkr
        # compute the gradients with respect to the model parameters
        gparams_phn = T.grad(cost_phn, self.params[:-2])
        gparams_spkr = T.grad(cost_spkr, self.params[:-4] + self.params[-2:])

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads[:-2],
                self._accudeltas[:-2], self.params[:-2], gparams_phn):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad
        train_fn_phn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y_phn)],
            outputs=cost_phn,
            updates=updates,
            givens={self.x: batch_x, self.y_phn: batch_y_phn})

        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads[:-4] + self._accugrads[-2:], self._accudeltas[:-4] + self._accudeltas[-2:], self.params[:-4] + self.params[-2:], gparams_spkr):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad
        train_fn_spkr = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y_spkr)],
            outputs=cost_spkr,
            updates=updates,
            #givens={self.x: batch_x[20:24,:], self.y_spkr: batch_y_spkr[20:24]})
            givens={self.x: batch_x, self.y_spkr: batch_y_spkr})

        return train_fn_phn, train_fn_spkr

    def train_only_classif(self):
        batch_x = T.fmatrix('batch_x')
        batch_y_phn = T.ivector('batch_y_phn')
        batch_y_spkr = T.ivector('batch_y_spkr')
        #cost_phn = self.finetune_cost_sum_phn
        cost_phn = self.finetune_cost_phn
        #cost_spkr = self.finetune_cost_sum_spkr
        cost_spkr = self.finetune_cost_spkr
        # compute the gradients with respect to the model parameters
        gparams_phn = T.grad(cost_phn, self.params[-4:-2])
        gparams_spkr = T.grad(cost_spkr, self.params[-2:])

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads[-4:-2],
                self._accudeltas[-4:-2], self.params[-4:-2], gparams_phn):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad
        train_fn_phn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y_phn)],
            outputs=cost_phn,
            updates=updates,
            givens={self.x: batch_x, self.y_phn: batch_y_phn})

        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads[-2:], self._accudeltas[-2:], self.params[-2:], gparams_spkr):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad
        train_fn_spkr = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y_spkr)],
            outputs=cost_spkr,
            updates=updates,
            #givens={self.x: batch_x[20:24,:], self.y_spkr: batch_y_spkr[20:24]})
            givens={self.x: batch_x, self.y_spkr: batch_y_spkr})

        return train_fn_phn, train_fn_spkr

    def score_classif(self, given_set):
        """ Returns functions to get current classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_y_phn = T.ivector('batch_y_phn')
        batch_y_spkr = T.ivector('batch_y_spkr')
        score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y_phn), theano.Param(batch_y_spkr)],
                outputs=(self.errors_phn, self.errors_spkr),
                givens={self.x: batch_x, self.y_phn: batch_y_phn, self.y_spkr: batch_y_spkr})

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y_phn, batch_y_spkr) for batch_x, batch_y_phn, batch_y_spkr in given_set]

        return scoref
Beispiel #21
0
def test_regression_model_mnist(dataset_name='mnist.pkl.gz',
                     learning_rate=0.13,
                     n_epochs=1000,
                     batch_size=600):
    # Set up the dataset
    dataset = load_data(dataset_name)
    # Split the data into a training, validation and test set
    train_data, train_labels = dataset[0]
    validation_data, validation_labels = dataset[1]
    test_data, test_labels = dataset[2]
    # Compute number of minibatches for each set
    n_train_batches = train_data.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = validation_data.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_data.get_value(borrow=True).shape[0] / batch_size
    data_dim = (28, 28) # The dimension of each image in the dataset
    data_classes = 10 # The number of classes within the data
    
    # Build the model
    # ---------------

    # Allocate symbolic variables for data
    index = T.lscalar() # This is the index to a minibatch
    x = T.matrix('x') # Data (rasterized images)
    y = T.ivector('y') # Labels (1d vector of ints)

    # Construct logistic regression class
    classifier = LogisticRegression(input=x, n_in=data_dim[0]*data_dim[1], n_out=data_classes)

    # Cost to minimize during training
    cost = classifier.negative_log_likelihood(y)

    # Compile a Theano function that computes mistakes made by the model on a minibatch
    test_model = th.function(inputs=[index], # This function is for the test data   
                             outputs=classifier.errors(y),
                             givens={x: test_data[index * batch_size: (index + 1) * batch_size],
                                     y: test_labels[index * batch_size: (index + 1) * batch_size]})
    validate_model = th.function(inputs=[index], # This function is for the validation data    
                                 outputs=classifier.errors(y),
                                 givens={x: validation_data[index * batch_size: (index + 1) * batch_size],
                                         y: validation_labels[index * batch_size: (index + 1) * batch_size]})
    # Compute the gradient of cost with respect to theta = (W,b)
    grad_W = T.grad(cost=cost, wrt=classifier.W)
    grad_b = T.grad(cost=cost, wrt=classifier.b)

    # Specify how to update model parameters as a list of (variable, update expression) pairs
    updates = [(classifier.W, classifier.W - learning_rate * grad_W),
               (classifier.b, classifier.b - learning_rate * grad_b)]

    # Compile Theano function that returns the cost and updates parameters of model based on update rules
    train_model = th.function(inputs=[index], # Index in minibatch that defines x with label y   
                             outputs=cost, # Cost/loss associated with x,y
                             updates=updates,
                             givens={x: train_data[index * batch_size: (index + 1) * batch_size],
                                     y: train_labels[index * batch_size: (index + 1) * batch_size]})

    # Train the model
    # ---------------

    # Setup the early-stopping parameters
    patience = 5000 # Look at this many minibatches regardless
    patience_increase = 2 # How much longer to wait once a new best is found
    improvement_threshold = 0.995 # A relative improvement of this size is considered significant
    validation_frequency = min(n_train_batches, patience / 2) # Number of minibatches before validating
    # (a standalone sketch of this patience rule appears after this function)
    best_validation_loss = np.inf
    test_score = 0
    start_time = time.clock()

    # Setup the training loop
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # Set the iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                # Compute the zero-one loss on the validation set
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch,
                                                                             minibatch_index + 1,
                                                                             n_train_batches,
                                                                             this_validation_loss * 100.))
                # Check if current validation score is the best
                if this_validation_loss < best_validation_loss:
                    # Improve the patience if the loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    # Test on test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print('epoch %i, minibatch %i/%i, test error of best model %f %%' % (epoch,
                                                                                         minibatch_index + 1,
                                                                                         n_train_batches,
                                                                                         test_score * 100.))
            # Stop the loop if we have exhausted our patience
            if patience <= iter:
                done_looping = True
                break
    # The loop has ended so record the time it took
    end_time = time.clock()
    # Print out results and timing information
    print('Optimization complete with best validation score of %f %%, with test performance %f %%' % (best_validation_loss * 100.,
                                                                                                      test_score * 100.)) 
    print 'The code ran for %d epochs with %f epochs/sec' % (epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.1fs' % ((end_time - start_time)))
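
# Hedged standalone sketch (not from the original source) of the patience-based
# early-stopping rule used in the training loop above: patience is only
# extended when the validation loss improves by more than the relative
# improvement_threshold, and looping stops once the iteration count catches up
# with the accumulated patience.
def update_patience(patience, iteration, this_loss, best_loss,
                    patience_increase=2, improvement_threshold=0.995):
    if this_loss < best_loss * improvement_threshold:
        patience = max(patience, iteration * patience_increase)
    done_looping = patience <= iteration
    return patience, done_looping
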
class SdA(object):
    """Stacked denoising auto-encoder class (SdA)

    A stacked denoising autoencoder model is obtained by stacking several
    dAs. The hidden layer of the dA at layer `i` becomes the input of
    the dA at layer `i+1`. The first layer dA gets as input the input of
    the SdA, and the hidden layer of the last dA represents the output.
    Note that after pretraining, the SdA is treated as a normal MLP;
    the dAs are only used to initialize the weights.
    """
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layers size, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        self.theano_rng = theano_rng
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        for i in range(self.n_layers):
            # n sigmoid layers and n dA layers

            # size of input is either hidden units of layer below, or input size for first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below, or the input to the SdA for the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            self.sigmoid_layers.append(sigmoid_layer)

            self.params.extend(sigmoid_layer.params)

            dA_layer = DenoisingAutoEncoder(numpy_rng=numpy_rng,
                                            theano_rng=theano_rng,
                                            input=layer_input,
                                            n_visible=input_size,
                                            n_hidden=hidden_layers_sizes[i],
                                            W=sigmoid_layer.W,
                                            bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)

        self.params.extend(self.logLayer.params)

        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, train_set_x, batch_size):
        ''' Generates a list of functions, each of them implementing one
        step in training the dA corresponding to the layer with same index.
        The function will require as input the minibatch index, and to train
        a dA you just need to iterate, calling the corresponding function on
        all minibatch indexes.

        :type train_set_x: theano.tensor.TensorType
        :param train_set_x: Shared variable that contains all datapoints used
                            for training the dA

        :type batch_size: int
        :param batch_size: size of a [mini]batch

        :type learning_rate: float
        :param learning_rate: learning rate used during training for any of
                              the dA layers
        '''

        # index to a [mini]batch
        index = T.lscalar('index')  # index to a minibatch
        corruption_level = T.scalar('corruption')
        learning_rate = T.scalar('lr')
        batch_begin = index * batch_size
        batch_end = batch_begin + batch_size

        pretrain_fns = []
        for dA in self.dA_layers:
            #get cost and updates list
            cost, updates = dA.get_cost_updates(corruption_level,
                                                learning_rate)

            # compile theano function
            fn = theano.function(
                inputs=[
                    index,
                    theano.In(corruption_level, value=0.2),
                    theano.In(learning_rate, value=0.1)
                ],
                outputs=cost,
                updates=updates,
                givens={self.x: train_set_x[batch_begin:batch_end]})
            pretrain_fns.append(fn)

        return pretrain_fns

    def build_finetune_functions(self, datasets, batch_size, learning_rate):
        '''Generates a function `train` that implements one step of
        finetuning, a function `validate` that computes the error on
        a batch from the validation set, and a function `test` that
        computes the error on a batch from the testing set
        (a usage sketch follows after this class)

        :type datasets: list of pairs of theano.tensor.TensorType
        :param datasets: a list that contains all the datasets;
                         it has to contain three pairs, `train`,
                         `valid`, `test` in this order, where each pair
                         is formed of two Theano variables, one for the
                         datapoints, the other for the labels

        :type batch_size: int
        :param batch_size: size of a minibatch

        :type learning_rate: float
        :param learning_rate: learning rate used during finetune stage
        '''

        (train_set_x, train_set_y) = datasets[0]
        (valid_set_x, valid_set_y) = datasets[1]
        (test_set_x, test_set_y) = datasets[2]

        # compute number of minibatches for training, validation and testing
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_valid_batches //= batch_size
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_test_batches //= batch_size

        index = T.lscalar('index')  # index to a [mini]batch

        # compute gradients with respect to all model parameters (T.grad builds the backpropagation graph symbolically)
        gparams = T.grad(self.finetune_cost, self.params)

        # compute list of fine-tuning updates
        updates = [(param, param - gparam * learning_rate)
                   for param, gparam in zip(self.params, gparams)]

        train_fn = theano.function(
            inputs=[index],
            outputs=self.finetune_cost,
            updates=updates,
            givens={
                self.x:
                train_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                train_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='train')

        test_score_i = theano.function(
            inputs=[index],
            outputs=self.errors,
            givens={
                self.x:
                test_set_x[index * batch_size:(index + 1) * batch_size],
                self.y: test_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='test')

        valid_score_i = theano.function(
            [index],
            self.errors,
            givens={
                self.x:
                valid_set_x[index * batch_size:(index + 1) * batch_size],
                self.y:
                valid_set_y[index * batch_size:(index + 1) * batch_size]
            },
            name='valid')

        # Create a function that scans the entire validation set
        def valid_score():
            return [valid_score_i(i) for i in range(n_valid_batches)]

        # Create a function that scans the entire test set
        def test_score():
            return [test_score_i(i) for i in range(n_test_batches)]

        return train_fn, valid_score, test_score
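
# Hedged usage sketch (not from the original source): wiring the SdA class above
# through unsupervised pretraining followed by supervised finetuning. `datasets`
# is assumed to follow the (train, valid, test) layout documented in
# build_finetune_functions, and all hyperparameters here are illustrative only.
def pretrain_and_finetune_sda(datasets, batch_size=20,
                              pretraining_epochs=15, finetune_epochs=30):
    import numpy
    train_set_x, train_set_y = datasets[0]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    sda = SdA(numpy_rng=numpy.random.RandomState(89677),
              n_ins=28 * 28, hidden_layers_sizes=[500, 500], n_outs=10)

    # layer-wise pretraining: one compiled function per dA, iterated over all
    # minibatch indexes (the keyword names match the Theano scalars above)
    pretraining_fns = sda.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size)
    for layer_idx, pretrain_fn in enumerate(pretraining_fns):
        for epoch in range(pretraining_epochs):
            costs = [pretrain_fn(index=i, corruption=0.2, lr=0.1)
                     for i in range(n_train_batches)]
            print('pretraining layer %i, epoch %i, cost %f' %
                  (layer_idx, epoch, numpy.mean(costs)))

    # supervised finetuning of the whole stack
    train_fn, valid_score, test_score = sda.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=0.1)
    for epoch in range(finetune_epochs):
        for minibatch_index in range(n_train_batches):
            train_fn(minibatch_index)
        print('finetuning epoch %i, validation error %f %%' %
              (epoch, numpy.mean(valid_score()) * 100.))
    return sda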
Beispiel #23
0
def optimize_cnn_lenet(learning_rate=0.01, n_epochs=200, dataset='data/mnist.pkl.gz', batch_size=500, n_hidden=500, nkerns=[20, 50], rng=np.random.RandomState(23455)):
    print '... load training set'
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    # index to a minibatch
    index = T.lscalar()

    # symbolic variable for the data
    x = T.matrix('x')
    # symbolic variable for the labels
    y = T.ivector('y')

    print '... building the model'
    # Reshape the rasterized (batch_size, 28*28) matrix into a 4D tensor so that it is consistent with LeNetConvPoolLayer
    # The added 1 is the number of channels
    # The images are grayscale here, so there is a single channel
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # nkerns[0] = 20 filters
    layer0 = ConvLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5))
    
    layer1 = PoolLayer(layer0.output, poolsize=(2, 2))

    # nkerns[1] = 50 filters
    layer2 = ConvLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5))

    layer3 = PoolLayer(layer2.output, poolsize=(2, 2))

    # layer4_input
    # layer3 outputs a 4D tensor of 50 channels of 4x4-pixel images, which cannot be fed to the multilayer perceptron as is
    # Flatten it into 4x4x50 = 800-dimensional vectors: from (batch_size, 50, 4, 4) to (batch_size, 800)
    layer4_input = layer3.output.flatten(2)

    # Hidden layer with 500 units
    # n_in is the size of the input vectors built as layer4_input above
    layer4 = HiddenLayer(rng, input=layer4_input, n_in=nkerns[1]*4*4, n_out=n_hidden, activation=T.tanh)

    # Logistic regression layer: 500 input units, 10 output classes
    layer5 = LogisticRegression(input=layer4.output, n_in=n_hidden, n_out=10)
    
    # cost (an ordinary multilayer perceptron needs a regularization term, but the CNN structure itself has a regularizing effect)
    cost = layer5.negative_log_likelihood(y)

    # test model
    # Compute layer5.errors(y) on the slice selected via givens from the input index
    test_model = theano.function([index], layer5.errors(y), givens={x:test_set_x[index*batch_size : (index + 1)*batch_size], y: test_set_y[index*batch_size : (index + 1)*batch_size]})
    
    # validation model
    validate_model = theano.function([index], layer5.errors(y), givens={x:valid_set_x[index*batch_size : (index + 1)*batch_size], y: valid_set_y[index*batch_size : (index + 1)*batch_size]})

    # Parameters for differentiation (the pooling layers have no parameters)
    params = layer5.params + layer4.params + layer2.params + layer0.params

    # Gradients of the cost with respect to the parameters
    grads = T.grad(cost, params)

    # Parameter updates
    updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)]

    # train model
    train_model = theano.function(inputs=[index], outputs=cost, updates=updates, givens={x: train_set_x[index*batch_size : (index + 1)*batch_size], y:train_set_y[index*batch_size : (index+1)*batch_size]})

    # optimize
    print "train model ..."
    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience/2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    fp1 = open('log/lenet_validation_error.txt', 'w')
    fp2 = open('log/lenet_test_error.txt', 'w')

    while(epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                ## Pass each validation index to the function computing the validation error rate and collect the results as a list
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                # Average them into a score
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f ' % (epoch, minibatch_index+1, n_train_batches, this_validation_loss*100.))
                fp1.write("%d\t%f\n" % (epoch, this_validation_loss*100))         

                if this_validation_loss < best_validation_loss:
                    if(this_validation_loss < best_validation_loss * improvement_threshold):
                        patience = max(patience, iter*patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    ## Pass each test index to the function computing the test error rate and collect the results as a list
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    
                    ## Average them into a score
                    test_score = np.mean(test_losses)
                    print('epoch %i, minibatch %i/%i, test error %f ' % (epoch, minibatch_index+1, n_train_batches, test_score*100.))
                    fp2.write("%d\t%f\n" % (epoch, test_score*100))
            if patience <= iter:
                done_looping = True
                break
    fp1.close()
    fp2.close()        
    end_time = timeit.default_timer()
    print(('optimization complete. Best validation score of %f obtained at iteration %i, with test performance %f') % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr,('This code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time)/60.))

    import cPickle
    cPickle.dump(layer0, open("model/cnn_layer0.pkl", "wb"))
    cPickle.dump(layer2, open("model/cnn_layer2.pkl", "wb"))
    cPickle.dump(layer4, open("model/cnn_layer4.pkl", "wb"))
    cPickle.dump(layer5, open("model/cnn_layer5.pkl", "wb"))
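
# Hedged sketch (not from the original source): reloading the layers pickled by
# optimize_cnn_lenet above. Only the parameterized layers were dumped (the
# pooling layers have no parameters), so this simply returns them in order.
def load_lenet_layers(model_dir='model'):
    import cPickle
    import os
    names = ['cnn_layer0.pkl', 'cnn_layer2.pkl', 'cnn_layer4.pkl', 'cnn_layer5.pkl']
    return [cPickle.load(open(os.path.join(model_dir, name), 'rb')) for name in names]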
Beispiel #24
0
def evaluate_model(learning_rate=0.001,
                   n_epochs=100,
                   nkerns=[16, 40, 50, 60],
                   batch_size=20):
    """ 
    Evaluation of a saved convolutional network for classification

    :type learning_rate: float
    :param learning_rate: this is the initial learning rate used
                            (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: the batch size for training
    """

    print("Evaluating model")

    rng = numpy.random.RandomState(23455)

    # loading test data set 1
    datasets = load_test_data(1)

    valid_set_x, valid_set_y = datasets[0]
    test_set_x, test_set_y = datasets[1]

    # compute number of minibatches for training, validation and testing
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    loaded_params = numpy.load('../saved_models/model1.npy')
    layer4_W, layer4_b, layer3_W, layer3_b, layer2_W, layer2_b, layer1_W, layer1_b, layer0_W, layer0_b = loaded_params

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('Building the model...')

    # Reshape matrix of rasterized images of shape (batch_size, 64 * 88)
    # to a 4D tensor, compatible with our MyConvPoolLayer
    # (64, 88) is the size of the input images; the shape bookkeeping below is
    # summarized by the pooled_shape helper sketched after this function.
    layer0_input = x.reshape((batch_size, 1, 64, 88))

    # Construct the first convolutional pooling layer:
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (64/2, 88/2) = (32, 44)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 32, 44)
    layer0 = MyConvPoolLayer(rng,
                             input=layer0_input,
                             image_shape=(batch_size, 1, 64, 88),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[0], 1, 5, 5),
                             poolsize=(2, 2),
                             W=layer0_W,
                             b=layer0_b)

    # Construct the second convolutional pooling layer:
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (32/2, 44/2) = (16, 22)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 16, 22)
    layer1 = MyConvPoolLayer(rng,
                             input=layer0.output,
                             image_shape=(batch_size, nkerns[0], 32, 44),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[1], nkerns[0], 5, 5),
                             poolsize=(2, 2),
                             W=layer1_W,
                             b=layer1_b)

    # Construct the third convolutional pooling layer
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (16/2, 22/2) = (8, 11)
    # 4D output tensor is thus of shape (batch_size, nkerns[2], 8, 11)
    layer2 = MyConvPoolLayer(rng,
                             input=layer1.output,
                             image_shape=(batch_size, nkerns[1], 16, 22),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[2], nkerns[1], 5, 5),
                             poolsize=(2, 2),
                             W=layer2_W,
                             b=layer2_b)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[2] * 8 * 11),
    # or (20, 50 * 8 * 11) = (20, 4400) with the default values.
    layer3_input = layer2.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer3 = HiddenLayer(rng,
                         input=layer3_input,
                         n_in=nkerns[2] * 8 * 11,
                         n_out=800,
                         activation=T.tanh,
                         W=layer3_W,
                         b=layer3_b)

    # classify the values of the fully-connected sigmoidal layer
    layer4 = LogisticRegression(input=layer3.output,
                                n_in=800,
                                n_out=6,
                                W=layer4_W,
                                b=layer4_b)

    cost = layer4.negative_log_likelihood(y)

    predicted_output = layer4.y_pred

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    val_model_preds = theano.function(
        [index],
        layer4.prediction(),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
        })

    validate_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    val_preds = [val_model_preds(i) for i in range(n_valid_batches)]

    #print(val_preds)
    #preds = numpy(val_preds)

    preds = []
    for pred in val_preds:
        for p in pred:
            preds.append(p)

    #preds = val_preds.reshape(valid_set_x.get_value(borrow=True).shape[0])

    actual_labels = load_test_data(1, 2)
    n = len(actual_labels)

    confusion_matrix = numpy.zeros((6, 6))

    for i in range(n):
        confusion_matrix[int(actual_labels[i])][preds[i]] += 1

    print(confusion_matrix)

    correct = 0.0
    for i in range(n):
        if (preds[i] == int(actual_labels[i])):
            correct += 1.0

    accuracy = correct / n
    print("Number of correctly classified : ", correct)
    print("Test accuracy is", accuracy * 100)
Beispiel #25
0
def train_CNN_mini_batch(learning_rate, n_epochs, num_kernels, batch_size,
                         filter_size, is_multi_scale, num_of_classes, height,
                         width, use_interpolation, use_hidden_layer):
    train_set_x_by_1, train_set_y, valid_set_x_by_1, valid_set_y, test_set_x_by_1, test_set_y, train_set_x_by_2, \
    train_set_x_by_4, valid_set_x_by_2, valid_set_x_by_4, test_set_x_by_2, test_set_x_by_4 \
        = load_processed_img_data()

    n_train_batches = train_set_x_by_1.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x_by_1.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x_by_1.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    index = theano.tensor.lscalar()
    x_by_1 = theano.tensor.ftensor4('x_by_1')
    x_by_2 = theano.tensor.ftensor4('x_by_2')
    x_by_4 = theano.tensor.ftensor4('x_by_4')

    y = theano.tensor.ivector('y')

    print '... initialize the model'

    cnn_dir = 'models/CNN_'
    if is_multi_scale is True:
        cnn_dir += 'M_'
    else:
        cnn_dir += 'S_'

    if use_hidden_layer is True:
        cnn_dir += 'H_'
    else:
        cnn_dir += 'L_'

    if use_interpolation is True:
        cnn_dir += 'I_'
    else:
        cnn_dir += 'N_'

    cnn_dir = cnn_dir + str(num_kernels[0]) + '_' + str(
        num_kernels[1]) + '_' + str(
            num_kernels[2]) + '_' + str(batch_size) + '_'
    curr_date = str(datetime.date.today())
    curr_date = curr_date.replace('-', '_')
    cnn_dir = cnn_dir + curr_date + str(time.strftime('_%H_%M_%S'))

    print 'CNN model is ', cnn_dir

    if not os.path.exists(cnn_dir):
        os.makedirs(cnn_dir)

    class Logger(object):
        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(cnn_dir + '/log.txt', 'w')

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

    sys.stdout = Logger()

    layer0 = CNN_Layer(
        name='Layer_0',
        W=None,
        b=None,
        filter_shape=(num_kernels[0], 3, filter_size, filter_size),
    )

    layer1 = CNN_Layer(
        name='Layer_1',
        W=None,
        b=None,
        filter_shape=(num_kernels[1], num_kernels[0], filter_size,
                      filter_size),
    )

    layer2 = CNN_Layer(
        name='Layer_2',
        W=None,
        b=None,
        filter_shape=(num_kernels[2], num_kernels[1], filter_size,
                      filter_size),
    )

    layer3 = HiddenLayer(name='Layer_3',
                         W=None,
                         b=None,
                         n_in=num_kernels[2] *
                         3 if is_multi_scale is True else num_kernels[2],
                         n_out=num_kernels[2] *
                         4 if is_multi_scale is True else num_kernels[2] * 2,
                         activation=theano.tensor.tanh)

    if is_multi_scale and use_hidden_layer:
        layer4_in = num_kernels[2] * 4
    elif is_multi_scale and not use_hidden_layer:
        layer4_in = num_kernels[2] * 3
    elif not is_multi_scale and use_hidden_layer:
        layer4_in = num_kernels[2] * 2
    else:
        layer4_in = num_kernels[2]

    layer4 = LogisticRegression(
        name='Layer_4',
        W=None,
        b=None,
        n_in=layer4_in,
        n_out=num_of_classes,
    )

    forward_propagation(layer0=layer0,
                        layer1=layer1,
                        layer2=layer2,
                        layer3=layer3,
                        layer4=layer4,
                        x_by_1=x_by_1,
                        x_by_2=x_by_2,
                        x_by_4=x_by_4,
                        num_kernels=num_kernels,
                        batch_size=batch_size,
                        filter_size=filter_size,
                        is_multi_scale=is_multi_scale,
                        height=height,
                        width=width,
                        use_interpolation=use_interpolation,
                        use_hidden_layer=use_hidden_layer)

    if use_hidden_layer is True:
        L2_norm = (layer4.W**2).sum() + (layer3.W**2).sum() + (
            layer2.W**2).sum() + (layer1.W**2).sum() + (layer0.W**2).sum()
    else:
        L2_norm = (layer4.W**2).sum() + (layer2.W**2).sum() + (
            layer1.W**2).sum() + (layer0.W**2).sum()

    regularization = 0.00001
    cost = layer4.negative_log_likelihood(y) + (regularization * L2_norm)

    if is_multi_scale is True:
        test_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1:
                test_set_x_by_1[index * batch_size:(index + 1) * batch_size],
                x_by_2:
                test_set_x_by_2[index * batch_size:(index + 1) * batch_size],
                x_by_4:
                test_set_x_by_4[index * batch_size:(index + 1) * batch_size],
                y:
                test_set_y[index * batch_size * height * width:(index + 1) *
                           batch_size * height * width]
            })
    else:
        test_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1:
                test_set_x_by_1[index * batch_size:(index + 1) * batch_size],
                y:
                test_set_y[index * batch_size * height * width:(index + 1) *
                           batch_size * height * width]
            })

    if is_multi_scale is True:
        validate_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1:
                valid_set_x_by_1[index * batch_size:(index + 1) * batch_size],
                x_by_2:
                valid_set_x_by_2[index * batch_size:(index + 1) * batch_size],
                x_by_4:
                valid_set_x_by_4[index * batch_size:(index + 1) * batch_size],
                y:
                valid_set_y[index * batch_size * height * width:(index + 1) *
                            batch_size * height * width]
            })
    else:
        validate_model = theano.function(
            [index],
            layer4.errors(y),
            givens={
                x_by_1:
                valid_set_x_by_1[index * batch_size:(index + 1) * batch_size],
                y:
                valid_set_y[index * batch_size * height * width:(index + 1) *
                            batch_size * height * width]
            })

    if use_hidden_layer is True:
        params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params
    else:
        params = layer4.params + layer2.params + layer1.params + layer0.params

    grads = theano.tensor.grad(cost, params)

    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    if is_multi_scale is True:
        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x_by_1:
                train_set_x_by_1[index * batch_size:(index + 1) * batch_size],
                x_by_2:
                train_set_x_by_2[index * batch_size:(index + 1) * batch_size],
                x_by_4:
                train_set_x_by_4[index * batch_size:(index + 1) * batch_size],
                y:
                train_set_y[index * batch_size * width * height:(index + 1) *
                            batch_size * width * height]
            })
    else:
        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x_by_1:
                train_set_x_by_1[index * batch_size:(index + 1) * batch_size],
                y:
                train_set_y[index * batch_size * width * height:(index + 1) *
                            batch_size * width * height]
            })

    print '... training the model'
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)

    best_layer_0_W = numpy.zeros_like(layer0.W.get_value())
    best_layer_0_b = numpy.zeros_like(layer0.b.get_value())
    best_layer_1_W = numpy.zeros_like(layer1.W.get_value())
    best_layer_1_b = numpy.zeros_like(layer1.b.get_value())
    best_layer_2_W = numpy.zeros_like(layer2.W.get_value())
    best_layer_2_b = numpy.zeros_like(layer2.b.get_value())
    best_layer_3_W = numpy.zeros_like(layer3.W.get_value())
    best_layer_3_b = numpy.zeros_like(layer3.b.get_value())
    best_layer_4_W = numpy.zeros_like(layer4.W.get_value())
    best_layer_4_b = numpy.zeros_like(layer4.b.get_value())

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for mini_batch_index in xrange(n_train_batches):

            start = time.clock()
            iter = (epoch - 1) * n_train_batches + mini_batch_index
            cost_ij = train_model(mini_batch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, mini-batch %i/%i, validation error %f %%' %
                      (epoch, mini_batch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # save best filters
                    best_layer_0_W = layer0.W.get_value()
                    best_layer_0_b = layer0.b.get_value()
                    best_layer_1_W = layer1.W.get_value()
                    best_layer_1_b = layer1.b.get_value()
                    best_layer_2_W = layer2.W.get_value()
                    best_layer_2_b = layer2.b.get_value()
                    best_layer_3_W = layer3.W.get_value()
                    best_layer_3_b = layer3.b.get_value()
                    best_layer_4_W = layer4.W.get_value()
                    best_layer_4_b = layer4.b.get_value()

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]

                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, mini-batch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, mini_batch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

            print 'training @ iter = %d, time taken = %f' % (iter,
                                                             (time.clock() -
                                                              start))

    end_time = time.clock()
    print('Optimization complete.')
    print(
        'Best validation score of %f %% obtained at iteration %i, '
        'with test performance %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    if not os.path.exists(cnn_dir + '/params'):
        os.makedirs(cnn_dir + '/params')

    numpy.save(cnn_dir + '/params/layer_0_W.npy', best_layer_0_W)
    numpy.save(cnn_dir + '/params/layer_0_b.npy', best_layer_0_b)
    numpy.save(cnn_dir + '/params/layer_1_W.npy', best_layer_1_W)
    numpy.save(cnn_dir + '/params/layer_1_b.npy', best_layer_1_b)
    numpy.save(cnn_dir + '/params/layer_2_W.npy', best_layer_2_W)
    numpy.save(cnn_dir + '/params/layer_2_b.npy', best_layer_2_b)
    numpy.save(cnn_dir + '/params/layer_3_W.npy', best_layer_3_W)
    numpy.save(cnn_dir + '/params/layer_3_b.npy', best_layer_3_b)
    numpy.save(cnn_dir + '/params/layer_4_W.npy', best_layer_4_W)
    numpy.save(cnn_dir + '/params/layer_4_b.npy', best_layer_4_b)
    numpy.save(cnn_dir + '/params/filer_kernels.npy', num_kernels)
    numpy.save(cnn_dir + '/params/filter_size.npy', filter_size)

    return cnn_dir
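
# Hedged sketch (not from the original source): restoring the best parameters
# saved by train_CNN_mini_batch above into layers rebuilt with the same
# architecture. Assigning through set_value() on the Theano shared variables is
# assumed here; `layers` is assumed to be [layer0, layer1, layer2, layer3,
# layer4] constructed exactly as in train_CNN_mini_batch.
def load_best_params(cnn_dir, layers):
    import numpy
    for i, layer in enumerate(layers):
        layer.W.set_value(numpy.load('%s/params/layer_%d_W.npy' % (cnn_dir, i)))
        layer.b.set_value(numpy.load('%s/params/layer_%d_b.npy' % (cnn_dir, i)))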
Beispiel #26
0
def evaluate_cifar(learning_rate=0.001,
                   n_epochs=100,
                   dataset_folder='cifar-10-batches-py',
                   nkerns=[16, 20, 20],
                   batch_size=32):
    """ 
    Network for classification of the CIFAR-10 database

    :type learning_rate: float
    :param learning_rate: this is the initial learning rate used
                            (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset_folder: string
    :param dataset_folder: the folder containing the batch files for cifar

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: the batch size for training
    """

    rng = numpy.random.RandomState(23455)

    # loading the cifar data
    datasets = load_cifar_data(dataset_folder)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('Building the model...')

    # Reshape matrix of rasterized images of shape (batch_size, 3 * 32 * 32)
    # to a 4D tensor, compatible with our MyConvPoolLayer
    # (32, 32) is the size of CIFAR-10 images, with 3 color channels.
    layer0_input = x.reshape((batch_size, 3, 32, 32))

    # Construct the first convolutional pooling layer:
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (32/2, 32/2) = (16, 16)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 16, 16)
    layer0 = MyConvPoolLayer(rng,
                             input=layer0_input,
                             image_shape=(batch_size, 3, 32, 32),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[0], 3, 5, 5),
                             poolsize=(2, 2))

    # Construct the second convolutional pooling layer:
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (16/2, 16/2) = (8, 8)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 8, 8)
    layer1 = MyConvPoolLayer(rng,
                             input=layer0.output,
                             image_shape=(batch_size, nkerns[0], 16, 16),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[1], nkerns[0], 5, 5),
                             poolsize=(2, 2))

    # Construct the third convolutional pooling layer
    # filtering does not reduce the layer size because we use padding
    # maxpooling reduces the size to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[2], 4, 4)
    layer2 = MyConvPoolLayer(rng,
                             input=layer1.output,
                             image_shape=(batch_size, nkerns[1], 8, 8),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[2], nkerns[1], 5, 5),
                             poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[2] * 4 * 4),
    # or (32, 20 * 4 * 4) = (32, 320) with the default values.
    layer3_input = layer2.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer3 = HiddenLayer(rng,
                         input=layer3_input,
                         n_in=nkerns[2] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer4 = LogisticRegression(input=layer3.output, n_in=500, n_out=5)

    # the cost we minimize during training is the NLL of the model
    cost = layer4.negative_log_likelihood(y)

    predicted_output = layer4.y_pred

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # the learning rate for batch SGD (adaptive learning rate)
    l_rate = T.scalar('l_rate', dtype=theano.config.floatX)
    # the momentum SGD
    momentum = T.scalar('momentum', dtype=theano.config.floatX)

    # train_model is a function that updates the model parameters by SGD with
    # momentum. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter, so we build the
    # updates list by looping over all (params[i], grads[i]) pairs.
    # (a standalone sketch of this momentum update follows after this function)
    updates = []
    for param, grad in zip(params, grads):
        # keep a per-parameter velocity that decays by `momentum` each step
        previous_step = theano.shared(param.get_value() * 0.,
                                      broadcastable=param.broadcastable)
        step = momentum * previous_step - l_rate * grad
        updates.append((previous_step, step))
        updates.append((param, param + step))

    train_model = theano.function(
        [index, l_rate, momentum],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('Training...')
    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # initializing the adaptive leaning rate
    adaptive_learning_rate = learning_rate
    # initializing the momentum
    momentum = 0.9

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        if epoch % 10 == 0:
            # decreasing the learning rate after every 10 epochs
            adaptive_learning_rate = 0.95 * adaptive_learning_rate
            # increasing the momentum after every 10 epochs
            momentum = 1.05 * momentum

        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, adaptive_learning_rate,
                                  momentum)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # increase the learning rate by small amount (adaptive)
                    adaptive_learning_rate = 1.01 * adaptive_learning_rate

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                else:
                    # decrease the learning rate by small amount (adaptive)
                    adaptive_learning_rate = 0.5 * adaptive_learning_rate

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)
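
# Hedged standalone sketch (not from the original source) of the classical
# momentum update assembled inside evaluate_cifar above: the previous step is
# decayed by the momentum coefficient, a gradient step is subtracted, and the
# parameter moves by the resulting velocity.
def momentum_step(param, grad, previous_step, l_rate, momentum):
    step = momentum * previous_step - l_rate * grad
    return param + step, step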
Beispiel #27
0
class RRNN(object):
    """Recurrent ReLU Neural Network
    """

    def __init__(self, numpy_rng, theano_rng=None, 
            n_ins=N_FEATURES * N_FRAMES,
            relu_layers_sizes=[1024, 1024, 1024],
            recurrent_connections=[2],  # layer(s), can only be i^t -> i^{t+1}
            n_outs=62 * 3,
            rho=0.9, eps=1.E-6):
        """ TODO 
        """

        self.relu_layers = []
        self.params = []
        self.n_layers = len(relu_layers_sizes)
        self._rho = rho  # ``momentum'' for adadelta
        self._eps = eps  # epsilon for adadelta
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta
        self.n_outs = n_outs

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

        self.x = T.fmatrix('x')
        self.y = T.ivector('y')

        for i in xrange(self.n_layers):
            if i == 0:
                input_size = n_ins
            else:
                input_size = relu_layers_sizes[i-1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.relu_layers[-1].output

            if i in recurrent_connections:
                inputr_size = relu_layers_sizes[i]
                previous_output = T.fmatrix('previous_output')
                relu_layer = RecurrentReLU(rng=numpy_rng,
                        input=layer_input, in_stack=previous_output,
                        n_in=input_size, n_in_stack=inputr_size,
                        n_out=inputr_size)
                #relu_layer.in_stack = relu_layer.output # TODO TODO TODO

                self.params.extend(relu_layer.params)
                self._accugrads.extend([
                    shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'),
                           name='accugrad_W', borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'),
                           name='accugrad_b', borrow=True),
                    shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'),
                           name='accugrad_Ws', borrow=True)])
                self._accudeltas.extend([
                    shared(value=numpy.zeros((n_ins, relu_layers_sizes[0]), dtype='float32'),
                           name='accudelta_W', borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[0], ), dtype='float32'),
                           name='accudelta_b', borrow=True),
                    shared(value=numpy.zeros((n_outs, relu_layers_sizes[0]), dtype='float32'),
                           name='accudelta_Ws', borrow=True)])
            else:
                relu_layer = ReLU(rng=numpy_rng,
                        input=layer_input,
                        n_in=input_size,
                        n_out=relu_layers_sizes[i])

                self.params.extend(relu_layer.params)
                self._accugrads.extend([
                    shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'),
                           name='accugrad_W', borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'),
                           name='accugrad_b', borrow=True)])
                self._accudeltas.extend([
                    shared(value=numpy.zeros((input_size, relu_layers_sizes[i]), dtype='float32'),
                           name='accudelta_W', borrow=True),
                    shared(value=numpy.zeros((relu_layers_sizes[i], ), dtype='float32'),
                           name='accudelta_b', borrow=True)])

            self.relu_layers.append(relu_layer)


        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.relu_layers[-1].output,
            n_in=relu_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self._accugrads.extend([
            shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'),
                   name='accugrad_W', borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accugrad_b', borrow=True)])
        self._accudeltas.extend([
            shared(value=numpy.zeros((relu_layers_sizes[-1], n_outs), dtype='float32'),
                   name='accudelta_W', borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accudelta_b', borrow=True)])

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate 

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                self._accudeltas, self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = - T.sqrt((accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 - self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
            # c.f. the Adagrad update rule (Duchi et al. 2010)
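            #   G_t  = G_{t-1} + g_t^2    (per-parameter accumulated squared gradient)
            #   dx_t = -(lr / sqrt(G_t + eps)) * g_t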
            agrad = accugrad + gparam * gparam
            dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[theano.Param(batch_x), 
            theano.Param(batch_y),
            theano.Param(learning_rate)],
            outputs=cost,
            updates=updates,
            givens={self.x: batch_x, self.y: batch_y})

        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(inputs=[theano.Param(batch_x), theano.Param(batch_y)],
                outputs=self.errors,
                givens={self.x: batch_x, self.y: batch_y})

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref
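
# A minimal usage sketch for the trainers above (hypothetical names: `net` is
# an instance of this class, `train_set` and `valid_set` yield numpy
# (batch_x, batch_y) pairs):
#
#     train_fn = net.get_adadelta_trainer()
#     valid_score = net.score_classif(valid_set)
#     for epoch in range(n_epochs):
#         for batch_x, batch_y in train_set:
#             train_fn(batch_x, batch_y)
#         print(numpy.mean(valid_score()))
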
def evaluate_model(learning_rate=0.005,
                   n_epochs=50,
                   nkerns=[16, 40, 50, 60],
                   batch_size=32):
    """ 
    Network for classification 

    :type learning_rate: float
    :param learning_rate: this is the initial learning rate used
                            (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: the batch size for training
    """

    print("Evaluating model")

    rng = numpy.random.RandomState(23455)

    # loading the data
    datasets = load_data(3)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('Building the model...')

    layer0_input = x.reshape((batch_size, 1, 64, 88))

    layer0 = MyConvPoolLayer(rng,
                             input=layer0_input,
                             image_shape=(batch_size, 1, 64, 88),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[0], 1, 5, 5),
                             poolsize=(2, 2))

    layer1 = MyConvPoolLayer(rng,
                             input=layer0.output,
                             image_shape=(batch_size, nkerns[0], 32, 44),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[1], nkerns[0], 5, 5),
                             poolsize=(2, 2))

    layer2 = MyConvPoolLayer(rng,
                             input=layer1.output,
                             image_shape=(batch_size, nkerns[1], 16, 22),
                             p1=2,
                             p2=2,
                             filter_shape=(nkerns[2], nkerns[1], 5, 5),
                             poolsize=(2, 2))
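
    # Shape bookkeeping, assuming MyConvPoolLayer zero-pads the input by
    # (p1, p2) on each border: each stage maps (h, w) to
    # ((h + 2*2 - 5 + 1) // 2, (w + 2*2 - 5 + 1) // 2), so the feature maps go
    # 64x88 -> 32x44 -> 16x22 -> 8x11, which matches the image_shape arguments
    # above and the n_in = nkerns[2] * 8 * 11 of the hidden layer below.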

    layer3_input = layer2.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer3 = HiddenLayer(rng,
                         input=layer3_input,
                         n_in=nkerns[2] * 8 * 11,
                         n_out=800,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer4 = LogisticRegression(input=layer3.output, n_in=800, n_out=6)

    # the cost we minimize during training is the NLL of the model
    cost = layer4.negative_log_likelihood(y)

    predicted_output = layer4.y_pred

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer4.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer4.params + layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # symbolic learning rate and momentum for the momentum SGD updates; the
    # numeric (adaptive) values are fed in at each call to train_model
    l_rate = T.scalar('l_rate', dtype=theano.config.floatX)
    momentum = T.scalar('momentum', dtype=theano.config.floatX)

    # train_model is a function that updates the model parameters by
    # momentum SGD. Since this model has many parameters, it would be tedious
    # to manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
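    # Classical momentum: v_{t+1} = momentum * v_t - lr * g_t and
    # param_{t+1} = param_t + v_{t+1}, with one velocity (previous_step)
    # shared variable kept per parameter.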
    updates = []
    for param, grad in zip(params, grads):
        previous_step = theano.shared(param.get_value() * 0.,
                                      broadcastable=param.broadcastable)
        step = momentum * previous_step - l_rate * grad
        updates.append((previous_step, step))
        updates.append((param, param + step))

    train_model = theano.function(
        [index, l_rate, momentum],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('Training...')
    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # initializing the adaptive learning rate
    adaptive_learning_rate = learning_rate
    # initializing the momentum
    momentum = 0.1
    a = 0.0001
    b = 0.3
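    # Adaptive schedule used below: the learning rate decays by 5% every 5
    # epochs, is nudged up by `a` whenever validation improves, and is cut by
    # roughly a factor of (1 - b) when validation does not improve.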

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        if epoch % 5 == 0:
            # decrease the learning rate every 5 epochs
            adaptive_learning_rate = 0.95 * adaptive_learning_rate
            # optionally increase the momentum every 5 epochs
            # momentum = 1.005 * momentum

        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index, adaptive_learning_rate,
                                  momentum)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # increase the learning rate by small amount (adaptive)
                    adaptive_learning_rate += a

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # Save the model (use a name other than x, which is the
                    # symbolic input variable)
                    print("Saving model")
                    save_filename = "../saved_models/model3"

                    saved_params = numpy.array([
                        layer4.W.get_value(),
                        layer4.b.get_value(),
                        layer3.W.get_value(),
                        layer3.b.get_value(),
                        layer2.W.get_value(),
                        layer2.b.get_value(),
                        layer1.W.get_value(),
                        layer1.b.get_value(),
                        layer0.W.get_value(),
                        layer0.b.get_value()
                    ], dtype=object)  # object array: the shapes differ per layer

                    numpy.save(save_filename, saved_params)

                    # Alternative: pickle the parameter values with cPickle
                    # f = file(save_filename, 'wb')
                    # cPickle.dump([param.get_value() for param in params], f, protocol=cPickle.HIGHEST_PROTOCOL)

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

                else:
                    # decrease the learning rate by small amount (adaptive)
                    adaptive_learning_rate = adaptive_learning_rate - (
                        b * adaptive_learning_rate) + (0.01 * a)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(
        ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' %
         ((end_time - start_time) / 60.)),
        file=sys.stderr)
class DBN(object):
    """Deep Belief Network

    A deep belief network is obtained by stacking several RBMs on top of each
    other. The hidden layer of the RBM at layer `i` becomes the input of the
    RBM at layer `i+1`. The first layer RBM gets as input the input of the
    network, and the hidden layer of the last RBM represents the output. When
    used for classification, the DBN is treated as a MLP, by adding a logistic
    regression layer on top.
    """
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=N_FEATURES * N_FRAMES,
                 hidden_layers_sizes=[1024, 1024],
                 n_outs=62 * 3,
                 rho=0.90,
                 eps=1.E-6):
        """This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: sizes of the intermediate layers, must
                                    contain at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        #self._rho = shared(numpy.cast['float32'](rho), name='rho')  # for adadelta
        #self._eps = shared(numpy.cast['float32'](eps), name='eps')  # for adadelta
        self._rho = rho
        self._eps = eps
        self._accugrads = []  # for adadelta
        self._accudeltas = []  # for adadelta

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.fmatrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector
        # of [int] labels

        # The DBN is an MLP, for which all weights of intermediate
        # layers are shared with a different RBM.  We will first
        # construct the DBN as a deep multilayer perceptron, and when
        # constructing each sigmoidal layer we also construct an RBM
        # that shares weights with that layer. During pretraining we
        # will train these RBMs (which will lead to changing the
        # weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the
        # MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden
            # units of the layer below or the input size if we are on
            # the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the
            # hidden layer below or the input of the DBN if you are on
            # the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question...  but we are
            # going to only declare that the parameters of the
            # sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not
            # of the DBN.
            self.params.extend(sigmoid_layer.params)
            self._accugrads.extend([
                shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]),
                                         dtype='float32'),
                       name='accugrad_W',
                       borrow=True),
                shared(value=numpy.zeros((hidden_layers_sizes[i], ),
                                         dtype='float32'),
                       name='accugrad_b',
                       borrow=True)
            ])  # TODO
            self._accudeltas.extend([
                shared(value=numpy.zeros((input_size, hidden_layers_sizes[i]),
                                         dtype='float32'),
                       name='accudelta_W',
                       borrow=True),
                shared(value=numpy.zeros((hidden_layers_sizes[i], ),
                                         dtype='float32'),
                       name='accudelta_b',
                       borrow=True)
            ])  # TODO

            # Construct an RBM that shared weights with this layer
            if i == 0:
                rbm_layer = GRBM(numpy_rng=numpy_rng,
                                 theano_rng=theano_rng,
                                 input=layer_input,
                                 n_visible=input_size,
                                 n_hidden=hidden_layers_sizes[i],
                                 W=sigmoid_layer.W,
                                 hbias=sigmoid_layer.b)
            else:
                rbm_layer = RBM(numpy_rng=numpy_rng,
                                theano_rng=theano_rng,
                                input=layer_input,
                                n_visible=input_size,
                                n_hidden=hidden_layers_sizes[i],
                                W=sigmoid_layer.W,
                                hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)
        self._accugrads.extend([
            shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs),
                                     dtype='float32'),
                   name='accugrad_W',
                   borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accugrad_b',
                   borrow=True)
        ])  # TODO
        self._accudeltas.extend([
            shared(value=numpy.zeros((hidden_layers_sizes[-1], n_outs),
                                     dtype='float32'),
                   name='accudelta_W',
                   borrow=True),
            shared(value=numpy.zeros((n_outs, ), dtype='float32'),
                   name='accudelta_b',
                   borrow=True)
        ])  # TODO

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        self.finetune_cost_sum = self.logLayer.negative_log_likelihood_sum(
            self.y)

        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)

    def pretraining_functions(self, k):
        batch_x = T.fmatrix('batch_x')
        learning_rate = T.scalar('lr')  # learning rate to use

        pretrain_fns = []
        for rbm in self.rbm_layers:

            # get the cost and the updates list
            # using CD-k here (persistent=None) for training each RBM.
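            # (CD-k: k steps of Gibbs sampling restarted from the data at each
            # minibatch; passing a shared state as `persistent` would give
            # persistent CD instead.)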
            # TODO: change cost function to reconstruction error
            #markov_chain = shared(numpy.empty((batch_size, rbm.n_hidden), dtype='float32'), borrow=True)
            markov_chain = None
            cost, updates = rbm.get_cost_updates(learning_rate,
                                                 persistent=markov_chain,
                                                 k=k)

            # compile the theano function
            fn = theano.function(
                inputs=[batch_x,
                        theano.Param(learning_rate, default=0.1)],
                outputs=cost,
                updates=updates,
                givens={self.x: batch_x})
            # append `fn` to the list of functions
            pretrain_fns.append(fn)

        return pretrain_fns

    def get_SGD_trainer(self):
        """ Returns a plain SGD minibatch trainer with learning rate as param.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for param, gparam in zip(self.params, gparams):
            updates[param] = param - gparam * learning_rate

        train_fn = theano.function(inputs=[
            theano.Param(batch_x),
            theano.Param(batch_y),
            theano.Param(learning_rate)
        ],
                                   outputs=cost,
                                   updates=updates,
                                   givens={
                                       self.x: batch_x,
                                       self.y: batch_y
                                   })

        return train_fn

    def get_adadelta_trainer(self):
        """ Returns an Adadelta (Zeiler 2012) trainer using self._rho and self._eps params.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, param, gparam in zip(self._accugrads,
                                                      self._accudeltas,
                                                      self.params, gparams):
            # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
            agrad = self._rho * accugrad + (1 - self._rho) * gparam * gparam
            dx = -T.sqrt(
                (accudelta + self._eps) / (agrad + self._eps)) * gparam
            updates[accudelta] = self._rho * accudelta + (1 -
                                                          self._rho) * dx * dx
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(
            inputs=[theano.Param(batch_x),
                    theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={
                self.x: batch_x,
                self.y: batch_y
            })

        return train_fn

    def get_adagrad_trainer(self):
        """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate.
        """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, param, gparam in zip(self._accugrads, self.params,
                                           gparams):
            # c.f. the Adagrad update rule (Duchi et al. 2010)
            agrad = accugrad + gparam * gparam
            dx = -(learning_rate / T.sqrt(agrad + self._eps)) * gparam
            updates[param] = param + dx
            updates[accugrad] = agrad

        train_fn = theano.function(inputs=[
            theano.Param(batch_x),
            theano.Param(batch_y),
            theano.Param(learning_rate)
        ],
                                   outputs=cost,
                                   updates=updates,
                                   givens={
                                       self.x: batch_x,
                                       self.y: batch_y
                                   })

        return train_fn

    def get_SAG_trainer(self):
        """ Returns a Stochastic Averaged Gradient (Bach & Moulines 2011) trainer.

        This is based on Bach's 2013 slides:
        PR_avg(theta_n) = Polyak-Ruppert average = (1+n)^{-1} * \sum_{k=0}^n theta_k
        theta_n = theta_{n-1} - gamma * [ f'_n(PR_avg(theta_{n-1}))
                  + f''_n(PR_avg(theta_{n-1})) * (theta_{n-1} - PR_avg(theta_{n-1})) ]

        This returns two trainers: one for the first epoch, one for subsequent epochs.
        We use self._accudeltas to store the Polyak-Ruppert averages,
        and self._accugrads for the number of iterations (updates).
        """
        print "UNFINISHED, see TODO in get_SAG_trainer()"
        sys.exit(-1)

        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        learning_rate = T.fscalar('lr')  # learning rate to use
        cost = self.finetune_cost_sum

        # First trainer:
        gparams = T.grad(cost, self.params)
        updates = OrderedDict()
        for accudelta, accugrad, param, gparam in zip(self._accudeltas,
                                                      self._accugrads,
                                                      self.params, gparams):
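            # accugrad counts the number of updates n; accudelta keeps the
            # running Polyak-Ruppert average, updated incrementally as
            # (theta_new + n * old_average) / (n + 1)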
            theta = param - gparam * learning_rate
            updates[accudelta] = (theta + accudelta * accugrad) / (accugrad +
                                                                   1.)
            updates[param] = theta
            updates[accugrad] = accugrad + 1.

        train_fn_init = theano.function(inputs=[
            theano.Param(batch_x),
            theano.Param(batch_y),
            theano.Param(learning_rate)
        ],
                                        outputs=cost,
                                        updates=updates,
                                        givens={
                                            self.x: batch_x,
                                            self.y: batch_y
                                        })

        # Second trainer:
        gparams = T.grad(cost,
                         self._accudeltas)  # TODO recreate the network with
        # (TODO) self._accudeltas instead of self.params so that we can compute the cost
        hparams = T.grad(cost, gparams)

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accudelta, accugrad, param, gparam, hparam in zip(
                self._accudeltas, self._accugrads, self.params, gparams,
                hparams):
            theta = param - learning_rate * (gparam + hparam *
                                             (param - accudelta))
            updates[accudelta] = (theta + accudelta * accugrad) / (accugrad +
                                                                   1.)
            updates[param] = theta
            updates[accugrad] = accugrad + 1.

        train_fn = theano.function(inputs=[
            theano.Param(batch_x),
            theano.Param(batch_y),
            theano.Param(learning_rate)
        ],
                                   outputs=cost,
                                   updates=updates,
                                   givens={
                                       self.x: batch_x,
                                       self.y: batch_y
                                   })

        return train_fn_init, train_fn

    def get_SGD_ld_trainer(self):
        """ Returns an SGD-ld trainer (Schaul et al. 2012).
        """
        print "UNFINISHED, see TODO in get_SGD_ld_trainer()"
        sys.exit(-1)

        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        cost = self.finetune_cost_sum
        # compute the gradients with respect to the model parameters
        gparams = T.grad(cost, self.params)
        # INIT TODO

        # compute list of fine-tuning updates
        updates = OrderedDict()
        for accugrad, accudelta, accuhess, param, gparam in zip(
                self._accugrads, self._accudeltas, self._accuhess, self.params,
                gparams):
            pass  # TODO
            # TODO
            # TODO

        train_fn = theano.function(
            inputs=[theano.Param(batch_x),
                    theano.Param(batch_y)],
            outputs=cost,
            updates=updates,
            givens={
                self.x: batch_x,
                self.y: batch_y
            })

        return train_fn

    def score_classif(self, given_set):
        """ Returns functions to get current classification scores. """
        batch_x = T.fmatrix('batch_x')
        batch_y = T.ivector('batch_y')
        score = theano.function(
            inputs=[theano.Param(batch_x),
                    theano.Param(batch_y)],
            outputs=self.errors,
            givens={
                self.x: batch_x,
                self.y: batch_y
            })

        # Create a function that scans the entire set given as input
        def scoref():
            return [score(batch_x, batch_y) for batch_x, batch_y in given_set]

        return scoref
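
# A minimal usage sketch for this DBN (hypothetical names: `dbn` is an
# instance, `train_set` and `dev_set` yield numpy (batch_x, batch_y) pairs):
#
#     pretrain_fns = dbn.pretraining_functions(k=1)
#     for fn in pretrain_fns:                      # greedy layer-wise CD-1
#         for batch_x, _ in train_set:
#             fn(batch_x, 0.01)
#     train_fn = dbn.get_adadelta_trainer()        # supervised fine-tuning
#     dev_score = dbn.score_classif(dev_set)
#     for epoch in xrange(n_epochs):
#         for batch_x, batch_y in train_set:
#             train_fn(batch_x, batch_y)
#         print numpy.mean(dev_score())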