def evaluate_lenet5(batch_size=30, n_iter=1000):
    """Train a LeNet5-style convolutional network on MNIST by minibatch SGD.

    The model stacks two convolution+pooling layers, one fully-connected
    sigmoidal layer and a logistic-regression classifier.  After every epoch
    the mean validation error is printed; whenever it improves on the best
    value seen so far, the mean test error is computed and printed as well.

    :param batch_size: number of examples per minibatch.  It is baked into
        the symbolic graph (the input is reshaped to
        ``(batch_size, 1, 28, 28)``), so only full minibatches are used and
        a trailing partial minibatch of each dataset is ignored.
    :param n_iter: number of passes (epochs) over the training set
    """
    rng = numpy.random.RandomState(23455)

    mnist = pylearn.datasets.MNIST.train_valid_test()

    ishape = (28, 28)  # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = tensor.fmatrix()  # the data is presented as rasterized images
    y = tensor.lvector()  # the labels are presented as 1D vector of [long int] labels

    # construct the first convolutional pooling layer
    layer0 = LeNetConvPool.new(rng, input=x.reshape((batch_size, 1, 28, 28)),
                               n_examples=batch_size,
                               n_imgs=1, img_shape=ishape,
                               n_filters=6, filter_shape=(5, 5),
                               poolsize=(2, 2))

    # construct the second convolutional pooling layer
    layer1 = LeNetConvPool.new(rng, input=layer0.output,
                               n_examples=batch_size,
                               n_imgs=6, img_shape=(12, 12),
                               n_filters=16, filter_shape=(5, 5),
                               poolsize=(2, 2))

    # construct a fully-connected sigmoidal layer
    # NOTE(review): n_in = 16*16 = 256 presumably corresponds to 16 feature
    # maps of 4x4 after the second conv/pool stage -- confirm against
    # LeNetConvPool.  The original author also questioned n_out ("128 ?").
    layer2 = SigmoidalLayer.new(rng, input=layer1.output.flatten(2),
                                n_in=16 * 16, n_out=128)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression.new(input=layer2.output, n_in=128, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.nll(y).mean()

    # create a function to compute the mistakes that are made by the model
    test_model = pfunc([x, y], layer3.errors(y))

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    learning_rate = numpy.asarray(0.01, dtype='float32')

    # train_model is a function that updates the model parameters by SGD
    train_model = pfunc([x, y], cost,
                        updates=[(p, p - learning_rate * gp)
                                 for p, gp in zip(params,
                                                  tensor.grad(cost, params))])

    # IS IT MORE SIMPLE TO USE A MINIMIZER OR THE DIRECT CODE?

    def n_batches(dataset):
        # Explicit floor division: the count must be an integer for xrange
        # regardless of the division semantics in effect, and a trailing
        # partial minibatch is dropped on purpose (see docstring).
        return len(dataset.x) // batch_size

    def minibatch(dataset, j):
        # Return the j-th (inputs, labels) minibatch of `dataset`.
        start = j * batch_size
        stop = start + batch_size
        return dataset.x[start:stop], dataset.y[start:stop]

    def mean_error(dataset):
        # Average of test_model's output over all full minibatches.
        return numpy.mean([test_model(*minibatch(dataset, j))
                           for j in xrange(n_batches(dataset))])

    best_valid_score = float('inf')
    for i in xrange(n_iter):
        for j in xrange(n_batches(mnist.train)):
            # train_model returns the minibatch cost; it is not needed here
            # (print it when debugging the training progress).
            train_model(*minibatch(mnist.train, j))

        valid_score = mean_error(mnist.valid)
        print('epoch %i, validation error %f' % (i, valid_score))

        if valid_score < best_valid_score:
            best_valid_score = valid_score
            test_score = mean_error(mnist.test)
            print('epoch %i, test error of best model %f' % (i, test_score))
def __init__(self, n_visible=784, n_hidden=500, lr=1e-1, input=None):
    """
    Initialize the DAE class by specifying the number of visible units (the
    dimension d of the input), the number of hidden units (the dimension d'
    of the latent or hidden space), an initial value for the learning rate
    and by giving a symbolic description of the input. Such a symbolic
    description is of no importance for the simple DAE and therefore can be
    ignored. This feature is useful when stacking DAEs, since the input of
    intermediate layers can be symbolically described in terms of the hidden
    units of the previous layer. See the tutorial on SDAE for more details.

    The constructor sets the following attributes: ``n_visible``,
    ``n_hidden``, the shared parameters ``W``, ``b``, ``W_prime``,
    ``b_prime``, the shared learning rate ``lr``, the symbolic hidden
    representation ``y``, the parameter list ``params`` and two compiled
    functions: ``update`` (returns the minibatch cost and applies one
    gradient step) and ``get_cost`` (returns the cost without updating).
    Both functions are built on the randomly corrupted input.

    :param n_visible: number of visible units
    :param n_hidden: number of hidden units
    :param lr: an initial value for the learning rate
    :param input: a symbolic description of the input or None
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden

    # create a Theano random generator that gives symbolic random values
    theano_rng = RandomStreams(seed=1234)
    # create a numpy random generator
    numpy_rng = numpy.random.RandomState(seed=52432)

    # initial values for weights and biases
    # note : W' was written as W_prime and b' as b_prime
    # (W is drawn before W_prime so the seeded draws keep their order)
    initial_W = numpy_rng.uniform(size=(n_visible, n_hidden))
    # transform W such that all values are between -.01 and .01
    initial_W = (initial_W * 2.0 - 1.0) * .01
    initial_b = numpy.zeros(n_hidden)
    initial_W_prime = numpy_rng.uniform(size=(n_hidden, n_visible))
    # transform W_prime such that all values are between -.01 and .01
    initial_W_prime = (initial_W_prime * 2.0 - 1.0) * .01
    initial_b_prime = numpy.zeros(n_visible)

    # theano shared variables for weights and biases
    self.W = shared(value=initial_W, name="W")
    self.b = shared(value=initial_b, name="b")
    self.W_prime = shared(value=initial_W_prime, name="W'")
    self.b_prime = shared(value=initial_b_prime, name="b'")

    # theano shared variable for the learning rate
    self.lr = shared(value=lr, name="learning_rate")

    # if no input is given generate a variable representing the input
    # (identity test: `==` may be overloaded or ambiguous on array-like or
    # symbolic objects, whereas `is None` is always a plain boolean)
    if input is None:
        # we use a matrix because we expect a minibatch of several examples,
        # each example being a row
        x = tensor.dmatrix(name='input')
    else:
        x = input

    # Equation (1)
    # note : first argument of theano.rng.binomial is the shape(size) of
    #        random numbers that it should produce
    #        second argument is the number of trials
    #        third argument is the probability of success of any trial
    #
    #        this will produce an array of 0s and 1s where 1 has a
    #        probability of 0.9 and 0 a probability of 0.1
    tilde_x = theano_rng.binomial(x.shape, 1, 0.9) * x

    # Equation (2)
    # note : y is stored as an attribute of the class so that it can be
    #        used later when stacking DAEs.
    self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W) + self.b)

    # Equation (3)
    z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime)

    # Equation (4)
    L = -tensor.sum(x * tensor.log(z) + (1 - x) * tensor.log(1 - z), axis=1)
    # note : L is now a vector, where each element is the cross-entropy cost
    #        of the reconstruction of the corresponding example of the
    #        minibatch. We need to sum all these to get the cost of the
    #        minibatch
    cost = tensor.sum(L)

    # parameters with respect to which we need to compute the gradient
    self.params = [self.W, self.b, self.W_prime, self.b_prime]
    # use theano automatic differentiation to get the gradients
    gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)

    # update the parameters in the direction of the gradient using the
    # learning rate
    updated_W = self.W - gW * self.lr
    updated_b = self.b - gb * self.lr
    updated_W_prime = self.W_prime - gW_prime * self.lr
    updated_b_prime = self.b_prime - gb_prime * self.lr

    # defining the function that evaluates the symbolic description of
    # one update step
    self.update = pfunc(params=[x], outputs=cost,
                        updates={self.W: updated_W,
                                 self.b: updated_b,
                                 self.W_prime: updated_W_prime,
                                 self.b_prime: updated_b_prime})
    # same cost, evaluated without modifying any parameter
    self.get_cost = pfunc(params=[x], outputs=cost)
def __init__(self, n_visible=784, n_hidden=500, lr=1e-1, input=None):
    """
    Initialize the DAE class by specifying the number of visible units (the
    dimension d of the input), the number of hidden units (the dimension d'
    of the latent or hidden space), an initial value for the learning rate
    and by giving a symbolic description of the input. Such a symbolic
    description is of no importance for the simple DAE and therefore can be
    ignored. This feature is useful when stacking DAEs, since the input of
    intermediate layers can be symbolically described in terms of the hidden
    units of the previous layer. See the tutorial on SDAE for more details.

    The constructor sets the following attributes: ``n_visible``,
    ``n_hidden``, the shared parameters ``W``, ``b``, ``W_prime``,
    ``b_prime``, the shared learning rate ``lr``, the symbolic hidden
    representation ``y``, the parameter list ``params`` and two compiled
    functions: ``update`` (returns the minibatch cost and applies one
    gradient step) and ``get_cost`` (returns the cost without updating).
    Both functions are built on the randomly corrupted input.

    :param n_visible: number of visible units
    :param n_hidden: number of hidden units
    :param lr: an initial value for the learning rate
    :param input: a symbolic description of the input or None
    """
    self.n_visible = n_visible
    self.n_hidden = n_hidden

    # create a Theano random generator that gives symbolic random values
    theano_rng = RandomStreams(seed=1234)
    # create a numpy random generator
    numpy_rng = numpy.random.RandomState(seed=52432)

    # initial values for weights and biases
    # note : W' was written as W_prime and b' as b_prime
    # (W is drawn before W_prime so the seeded draws keep their order)
    initial_W = numpy_rng.uniform(size=(n_visible, n_hidden))
    # transform W such that all values are between -.01 and .01
    initial_W = (initial_W * 2.0 - 1.0) * .01
    initial_b = numpy.zeros(n_hidden)
    initial_W_prime = numpy_rng.uniform(size=(n_hidden, n_visible))
    # transform W_prime such that all values are between -.01 and .01
    initial_W_prime = (initial_W_prime * 2.0 - 1.0) * .01
    initial_b_prime = numpy.zeros(n_visible)

    # theano shared variables for weights and biases
    self.W = shared(value=initial_W, name="W")
    self.b = shared(value=initial_b, name="b")
    self.W_prime = shared(value=initial_W_prime, name="W'")
    self.b_prime = shared(value=initial_b_prime, name="b'")

    # theano shared variable for the learning rate
    self.lr = shared(value=lr, name="learning_rate")

    # if no input is given generate a variable representing the input
    # (identity test: `==` may be overloaded or ambiguous on array-like or
    # symbolic objects, whereas `is None` is always a plain boolean)
    if input is None:
        # we use a matrix because we expect a minibatch of several examples,
        # each example being a row
        x = tensor.dmatrix(name='input')
    else:
        x = input

    # Equation (1)
    # note : first argument of theano.rng.binomial is the shape(size) of
    #        random numbers that it should produce
    #        second argument is the number of trials
    #        third argument is the probability of success of any trial
    #
    #        this will produce an array of 0s and 1s where 1 has a
    #        probability of 0.9 and 0 a probability of 0.1
    tilde_x = theano_rng.binomial(x.shape, 1, 0.9) * x

    # Equation (2)
    # note : y is stored as an attribute of the class so that it can be
    #        used later when stacking DAEs.
    self.y = nnet.sigmoid(tensor.dot(tilde_x, self.W) + self.b)

    # Equation (3)
    z = nnet.sigmoid(tensor.dot(self.y, self.W_prime) + self.b_prime)

    # Equation (4)
    L = -tensor.sum(x * tensor.log(z) + (1 - x) * tensor.log(1 - z), axis=1)
    # note : L is now a vector, where each element is the cross-entropy cost
    #        of the reconstruction of the corresponding example of the
    #        minibatch. We need to sum all these to get the cost of the
    #        minibatch
    cost = tensor.sum(L)

    # parameters with respect to which we need to compute the gradient
    self.params = [self.W, self.b, self.W_prime, self.b_prime]
    # use theano automatic differentiation to get the gradients
    gW, gb, gW_prime, gb_prime = tensor.grad(cost, self.params)

    # update the parameters in the direction of the gradient using the
    # learning rate
    updated_W = self.W - gW * self.lr
    updated_b = self.b - gb * self.lr
    updated_W_prime = self.W_prime - gW_prime * self.lr
    updated_b_prime = self.b_prime - gb_prime * self.lr

    # defining the function that evaluates the symbolic description of
    # one update step
    self.update = pfunc(params=[x], outputs=cost,
                        updates={self.W: updated_W,
                                 self.b: updated_b,
                                 self.W_prime: updated_W_prime,
                                 self.b_prime: updated_b_prime})
    # same cost, evaluated without modifying any parameter
    self.get_cost = pfunc(params=[x], outputs=cost)
def evaluate_lenet5(batch_size=30, n_iter=1000):
    """Train a LeNet5-style convolutional network on MNIST by minibatch SGD.

    The model stacks two convolution+pooling layers, one fully-connected
    sigmoidal layer and a logistic-regression classifier.  After every epoch
    the mean validation error is printed; whenever it improves on the best
    value seen so far, the mean test error is computed and printed as well.

    :param batch_size: number of examples per minibatch.  It is baked into
        the symbolic graph (the input is reshaped to
        ``(batch_size, 1, 28, 28)``), so only full minibatches are used and
        a trailing partial minibatch of each dataset is ignored.
    :param n_iter: number of passes (epochs) over the training set
    """
    rng = numpy.random.RandomState(23455)

    mnist = pylearn.datasets.MNIST.train_valid_test()

    ishape = (28, 28)  # this is the size of MNIST images

    # allocate symbolic variables for the data
    x = tensor.fmatrix()  # the data is presented as rasterized images
    y = tensor.lvector()  # the labels are presented as 1D vector of [long int] labels

    # construct the first convolutional pooling layer
    layer0 = LeNetConvPool.new(rng, input=x.reshape((batch_size, 1, 28, 28)),
                               n_examples=batch_size,
                               n_imgs=1, img_shape=ishape,
                               n_filters=6, filter_shape=(5, 5),
                               poolsize=(2, 2))

    # construct the second convolutional pooling layer
    layer1 = LeNetConvPool.new(rng, input=layer0.output,
                               n_examples=batch_size,
                               n_imgs=6, img_shape=(12, 12),
                               n_filters=16, filter_shape=(5, 5),
                               poolsize=(2, 2))

    # construct a fully-connected sigmoidal layer
    # NOTE(review): n_in = 16*16 = 256 presumably corresponds to 16 feature
    # maps of 4x4 after the second conv/pool stage -- confirm against
    # LeNetConvPool.  The original author also questioned n_out ("128 ?").
    layer2 = SigmoidalLayer.new(rng, input=layer1.output.flatten(2),
                                n_in=16 * 16, n_out=128)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression.new(input=layer2.output, n_in=128, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.nll(y).mean()

    # create a function to compute the mistakes that are made by the model
    test_model = pfunc([x, y], layer3.errors(y))

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    learning_rate = numpy.asarray(0.01, dtype='float32')

    # train_model is a function that updates the model parameters by SGD
    train_model = pfunc([x, y], cost,
                        updates=[(p, p - learning_rate * gp)
                                 for p, gp in zip(params,
                                                  tensor.grad(cost, params))])

    # IS IT MORE SIMPLE TO USE A MINIMIZER OR THE DIRECT CODE?

    def n_batches(dataset):
        # Explicit floor division: the count must be an integer for xrange
        # regardless of the division semantics in effect, and a trailing
        # partial minibatch is dropped on purpose (see docstring).
        return len(dataset.x) // batch_size

    def minibatch(dataset, j):
        # Return the j-th (inputs, labels) minibatch of `dataset`.
        start = j * batch_size
        stop = start + batch_size
        return dataset.x[start:stop], dataset.y[start:stop]

    def mean_error(dataset):
        # Average of test_model's output over all full minibatches.
        return numpy.mean([test_model(*minibatch(dataset, j))
                           for j in xrange(n_batches(dataset))])

    best_valid_score = float('inf')
    for i in xrange(n_iter):
        for j in xrange(n_batches(mnist.train)):
            # train_model returns the minibatch cost; it is not needed here
            # (print it when debugging the training progress).
            train_model(*minibatch(mnist.train, j))

        valid_score = mean_error(mnist.valid)
        print('epoch %i, validation error %f' % (i, valid_score))

        if valid_score < best_valid_score:
            best_valid_score = valid_score
            test_score = mean_error(mnist.test)
            print('epoch %i, test error of best model %f' % (i, test_score))