def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 14], batch_size=70,
                    useAllSamples=0, kmax=30, ktop=4, filter_size=[7, 5],
                    hidden_units=50, L2_weight=0.000005, dropout_p=0.2, useEmb=1):
    """
    Trains a convolutional sentence model with folding and k-max pooling on the
    binary Stanford Sentiment Treebank data and reports validation and test error.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    rng = numpy.random.RandomState(99999)
    datasets, embedding_size, embeddings = read_data_WP(root + '2classes/train.txt',
                                                        root + '2classes/dev.txt',
                                                        root + '2classes/test.txt',
                                                        embeddingPath, 60, useEmb)
    #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath, 60)
    #datasets = load_data(dataset)

    indices_train, trainY, trainLengths, trainLeftPad, trainRightPad = datasets[0]
    indices_dev, devY, devLengths, devLeftPad, devRightPad = datasets[1]
    indices_test, testY, testLengths, testLeftPad, testRightPad = datasets[2]

    n_train_batches = indices_train.shape[0] / batch_size
    n_valid_batches = indices_dev.shape[0] / batch_size
    n_test_batches = indices_test.shape[0] / batch_size

    train_batch_start = []
    dev_batch_start = []
    test_batch_start = []
    if useAllSamples:
        # append one extra batch that ends exactly at the last sample, so the
        # tail samples not covered by the full batches are still seen each epoch
        train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [indices_train.shape[0] - batch_size]
        dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size) + [indices_dev.shape[0] - batch_size]
        test_batch_start = list(numpy.arange(n_test_batches) * batch_size) + [indices_test.shape[0] - batch_size]
        n_train_batches = n_train_batches + 1
        n_valid_batches = n_valid_batches + 1
        n_test_batches = n_test_batches + 1
    else:
        train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
        dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size)
        test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_theano = theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX), borrow=True)
    indices_dev_theano = theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX), borrow=True)
    indices_test_theano = theano.shared(numpy.asarray(indices_test, dtype=theano.config.floatX), borrow=True)
    indices_train_theano = T.cast(indices_train_theano, 'int32')
    indices_dev_theano = T.cast(indices_dev_theano, 'int32')
    indices_test_theano = T.cast(indices_test_theano, 'int32')

    # allocate symbolic variables for the data
    index = T.lscalar()              # index to a [mini]batch
    x_index = T.imatrix('x_index')   # x_index is the matrix of word indices, must be integer
    y = T.ivector('y')
    #z = T.ivector('z')
    left = T.ivector('left')
    right = T.ivector('right')

    # look up the embeddings of the word indices and arrange each sentence as
    # an (embedding_size, sentence_length) feature map
    x = embeddings[x_index.flatten()].reshape((batch_size, 60, embedding_size)).transpose(0, 2, 1).flatten()
    ishape = (embedding_size, 60)    # shape of one input "image": embedding dimension x max sentence length
    filter_size1 = (1, filter_size[0])
    filter_size2 = (1, filter_size[1])
    #poolsize1=(1, ishape[1]-filter_size1[1]+1)
    poolsize1 = (1, ishape[1] + filter_size1[1] - 1)   # width after a "full" convolution

    left_after_conv = left
    right_after_conv = right

    #kmax=30   # this cannot be too small, e.g. 20
    #ktop=6
    #poolsize2=(1, kmax-filter_size2[1]+1)   #(1,6)
    poolsize2 = (1, kmax + filter_size2[1] - 1)   #(1,6)

    #dynamic_lengths=T.maximum(ktop, z/2+1)   # dynamic k-max pooling

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded sentences to a 4D tensor of shape
    # (batch_size, 1, embedding_size, sentence_length), compatible with ConvFoldPoolLayer
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))

    layer1 = ConvFoldPoolLayer(rng,
                               input=layer0_input,
                               image_shape=(batch_size, 1, ishape[0], ishape[1]),
                               filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]),
                               poolsize=poolsize1,
                               k=ktop,
                               left=left_after_conv,
                               right=right_after_conv)

    # the HiddenLayer is fully connected, so it operates on 2D matrices of shape
    # (batch_size, num_features); flatten the pooled feature maps accordingly
    layer2_input = layer1.output.flatten(2)

    # fully-connected tanh layer on top of the nkerns[0] folded/pooled feature maps,
    # each of size (embedding_size/2) x ktop
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[0] * (embedding_size / 2) * ktop,
                         n_out=hidden_units,
                         activation=T.tanh)

    dropout = dropout_from_layer(rng, layer2.output, dropout_p)   # dropout

    layer3 = LogisticRegression(rng, input=dropout, n_in=hidden_units, n_out=2)
    #layer3 = LogisticRegression(rng, input=layer2.output, n_in=50, n_out=2)

    # the cost we minimize during training is the NLL of the model plus L2 regularization
    #L1_reg = abs(layer3.W).sum() + abs(layer2.W).sum() + abs(layer1.W).sum() + abs(layer0.W).sum() + abs(embeddings).sum()
    L2_reg = (layer3.W ** 2).sum() + (layer2.W ** 2).sum() + (layer1.W ** 2).sum() + (embeddings ** 2).sum()
    #L2_reg = (layer3.W ** 2).sum() + (layer2.W ** 2).sum() + (layer0.W ** 2).sum() + (embeddings ** 2).sum()
    # the cost must include L2, otherwise training produces nan; with L2, each word embedding is also updated
    cost = layer3.negative_log_likelihood(y) + L2_weight * L2_reg
    #cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
                                 givens={
                                     x_index: indices_test_theano[index: index + batch_size],
                                     y: testY[index: index + batch_size],
                                     left: testLeftPad[index: index + batch_size],
                                     right: testRightPad[index: index + batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
                                     givens={
                                         x_index: indices_dev_theano[index: index + batch_size],
                                         y: devY[index: index + batch_size],
                                         left: devLeftPad[index: index + batch_size],
                                         right: devRightPad[index: index + batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + [embeddings]
    #params = layer3.params + layer2.params + layer0.params + [embeddings]

    # one squared-gradient accumulator per parameter, for AdaGrad
    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters.
    # Since this model has many parameters, it would be tedious to manually
    # create an update rule for each one, so the updates list is built by
    # looping over all (params[i], grads[i]) pairs.
    '''
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))
    '''
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        if param_i == embeddings:
            # AdaGrad step for the embedding matrix; the first row (padding token)
            # is then reset to zeros
            updates.append((param_i,
                            T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0],
                                            theano.shared(numpy.zeros(embedding_size)))))
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   # AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function([index], [cost, layer3.errors(y)], updates=updates,
                                  givens={
                                      x_index: indices_train_theano[index: index + batch_size],
                                      y: trainY[index: index + batch_size],
                                      left: trainLeftPad[index: index + batch_size],
                                      right: trainRightPad[index: index + batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000                 # look at this many examples regardless
    patience_increase = 2            # wait this much longer when a new best is found
    improvement_threshold = 0.995    # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                     # go through this many minibatches before checking the
                                     # network on the validation set; here we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches):   # each batch
        minibatch_index = 0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            cost_ij, error_ij = train_model(batch_start)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' cost: ' + str(cost_ij) + ' error: ' + str(error_ij)
            if iter % validation_frequency == 0:
                # compute zero-one loss on validation set
                #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index, n_train_batches, this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
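
# Minimal entry-point sketch, assuming this module is run directly as a script;
# it simply calls evaluate_lenet5() with the default hyper-parameters above.
if __name__ == '__main__':
    evaluate_lenet5()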
def __init__(self, learning_rate=0.2, n_epochs=2000, nkerns=[6, 14], batch_size=20,
             useAllSamples=0, kmax=30, ktop=4, filter_size=[7, 5], L2_weight=0.00005,
             dropout_p=0.8, useEmb=0, task=2, corpus=1, dataMode=3, maxSentLength=60,
             sentEm_length=48, window=3, k=5, nce_seeds=2345, only_left_context=False,
             vali_cost_list_length=20):
    self.ini_learning_rate = learning_rate
    self.n_epochs = n_epochs
    self.nkerns = nkerns
    self.batch_size = batch_size
    self.useAllSamples = useAllSamples
    self.kmax = kmax
    self.ktop = ktop
    self.filter_size = filter_size
    self.L2_weight = L2_weight
    self.dropout_p = dropout_p
    self.useEmb = useEmb
    self.task = task
    self.corpus = corpus
    self.dataMode = dataMode
    self.maxSentLength = maxSentLength
    self.sentEm_length = sentEm_length
    self.window = window
    self.k = k
    self.only_left_context = only_left_context
    if self.only_left_context:
        self.context_size = self.window
    else:
        self.context_size = 2 * self.window
    self.nce_seed = nce_seeds
    self.embedding_size = 0

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    datasets, embedding_size, embeddings_R, embeddings_Q, unigram, train_lengths, dev_lengths, test_lengths = read_data_WP(
        root + str(self.task) + 'classes/' + str(self.corpus) + 'train.txt',
        root + str(self.task) + 'classes/' + str(self.corpus) + 'dev.txt',
        root + str(self.task) + 'classes/' + str(self.corpus) + 'test.txt',
        embeddingPath, self.maxSentLength, self.useEmb, self.dataMode)
    self.datasets = datasets
    self.embedding_size = embedding_size
    self.embeddings_R = embeddings_R
    self.embeddings_Q = embeddings_Q
    self.unigram = unigram
    self.p_n = theano.shared(value=self.unigram)
    self.train_lengths = train_lengths
    self.dev_lengths = dev_lengths
    self.test_lengths = test_lengths
    b_values = zero_value((len(unigram),), dtype=theano.config.floatX)
    self.bias = theano.shared(value=b_values, name='bias')
    self.vali_cost_list_length = vali_cost_list_length
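
# Usage sketch for the constructor above. The enclosing class is not shown in this
# snippet, so "Train_Model" below is only a placeholder name, and the keyword values
# merely override a few defaults for illustration:
#
#   model = Train_Model(learning_rate=0.2, batch_size=20, task=2, corpus=1, window=3, k=5)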