def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, nkerns=[6, 14], batch_size=70, useAllSamples=0, kmax=30, ktop=4, filter_size=[7,5],
                    hidden_units=50, L2_weight=0.000005, dropout_p=0.2, useEmb=1):
    """Train and evaluate a convolutional sentence classifier (2 classes).

    The graph is: embedding lookup -> ConvFoldPoolLayer -> tanh HiddenLayer
    -> dropout -> LogisticRegression.  It is trained with AdaGrad on the
    ``2classes`` train/dev/test files found under the hard-coded ``root``
    directory.  The dev set is scored every ``validation_frequency``
    minibatches, the test set is scored whenever the dev error improves, and
    training stops early once ``patience`` minibatches have been processed.
    Progress is printed to stdout, the run time to stderr; nothing is
    returned.

    :type learning_rate: float
    :param learning_rate: base step size of the AdaGrad update

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer (this single-convolution
                   model only reads ``nkerns[0]``)

    :type batch_size: int
    :param batch_size: number of sentences per minibatch

    :type useAllSamples: int
    :param useAllSamples: if true, one extra minibatch covering the last
                          ``batch_size`` samples of every split is appended
                          so the tail of the data is not dropped

    :type kmax: int
    :param kmax: not used by this model; kept so existing callers keep
                 working

    :type ktop: int
    :param ktop: passed as ``k`` to the convolution/pooling layer; also
                 enters the hidden layer's input size

    :type filter_size: list of ints
    :param filter_size: filter widths; only ``filter_size[0]`` is used

    :type hidden_units: int
    :param hidden_units: number of units of the hidden layer

    :type L2_weight: float
    :param L2_weight: weight of the L2 penalty on the layer weights and on
                      the embeddings

    :type dropout_p: float
    :param dropout_p: forwarded to ``dropout_from_layer``

    :type useEmb: int
    :param useEmb: forwarded unchanged to ``read_data_WP``
    """
    # Imported locally so the block needs no new file-level import.
    # time.clock() was removed in Python 3.8; default_timer exists on 2 and 3.
    from timeit import default_timer

    root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    # Same literal is handed to the reader and used as the input width below;
    # presumably the padded sentence length -- TODO confirm against read_data_WP.
    sent_len = 60
    rng = numpy.random.RandomState(99999)
    datasets, embedding_size, embeddings = read_data_WP(
        root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt',
        embeddingPath, sent_len, useEmb)

    # The third element of every split (the lengths) is not needed here.
    indices_train, trainY, _, trainLeftPad, trainRightPad = datasets[0]
    indices_dev, devY, _, devLeftPad, devRightPad = datasets[1]
    indices_test, testY, _, testLeftPad, testRightPad = datasets[2]

    def batch_starts(n_samples):
        """Return the row offset at which every minibatch of a split begins."""
        # `//` keeps the batch count an integer on Python 3 as well.
        starts = list(numpy.arange(n_samples // batch_size) * batch_size)
        if useAllSamples:
            # Extra batch aligned to the end of the split (it overlaps the
            # previous one unless n_samples is a multiple of batch_size).
            starts.append(n_samples - batch_size)
        return starts

    train_batch_start = batch_starts(indices_train.shape[0])
    dev_batch_start = batch_starts(indices_dev.shape[0])
    test_batch_start = batch_starts(indices_test.shape[0])
    n_train_batches = len(train_batch_start)

    def int32_shared(indices):
        """Store an index matrix as a floatX shared variable, viewed as int32."""
        stored = theano.shared(numpy.asarray(indices, dtype=theano.config.floatX), borrow=True)
        return T.cast(stored, 'int32')

    indices_train_theano = int32_shared(indices_train)
    indices_dev_theano = int32_shared(indices_dev)
    indices_test_theano = int32_shared(indices_test)

    # allocate symbolic variables for the data
    index = T.lscalar()  # row offset of the current minibatch
    x_index = T.imatrix('x_index')   # word indices, must be integer
    y = T.ivector('y')
    left = T.ivector('left')
    right = T.ivector('right')

    # Look the words up, then move the embedding axis in front of the word
    # axis so each sample is laid out as (embedding_size, sent_len).
    x = embeddings[x_index.flatten()].reshape((batch_size, sent_len, embedding_size)).transpose(0, 2, 1).flatten()
    ishape = (embedding_size, sent_len)  # per-sample input "image" shape
    filter_size1 = (1, filter_size[0])
    # Input width + filter width - 1; presumably the width after a wide
    # convolution -- TODO confirm against ConvFoldPoolLayer.
    poolsize1 = (1, ishape[1] + filter_size1[1] - 1)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # 4D tensor (batch, channel, rows, columns) expected by the conv layer
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))

    layer1 = ConvFoldPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]),
            poolsize=poolsize1, k=ktop, left=left, right=right)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_features)
    layer2_input = layer1.output.flatten(2)

    # `//` keeps n_in an integer on Python 3; the halving presumably mirrors
    # the fold step of ConvFoldPoolLayer -- TODO confirm.
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[0] * (embedding_size // 2) * ktop,
                         n_out=hidden_units, activation=T.tanh)

    dropout = dropout_from_layer(rng, layer2.output, dropout_p)
    layer3 = LogisticRegression(rng, input=dropout, n_in=hidden_units, n_out=2)

    # the cost we minimize during training is the NLL of the model plus an
    # L2 penalty on all weights and on the embeddings
    L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum()+ (layer1.W** 2).sum()+(embeddings**2).sum()
    cost = layer3.negative_log_likelihood(y)+L2_weight*L2_reg

    def batch_givens(indices, labels, left_pad, right_pad):
        """Bind the symbolic inputs to the minibatch starting at ``index``."""
        rows = slice(index, index + batch_size)
        return {x_index: indices[rows],
                y: labels[rows],
                left: left_pad[rows],
                right: right_pad[rows]}

    # functions computing the mistakes that are made by the model
    # NOTE(review): both read layer3, whose input is the dropout output, so
    # the errors are measured through the dropout layer -- confirm intended.
    test_model = theano.function([index], layer3.errors(y),
             givens=batch_givens(indices_test_theano, testY, testLeftPad, testRightPad))

    validate_model = theano.function([index], layer3.errors(y),
            givens=batch_givens(indices_dev_theano, devY, devLeftPad, devRightPad))

    # all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params +[embeddings]

    # one AdaGrad accumulator (running sum of squared gradients) per parameter
    accumulator = [
        theano.shared(numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX),
                      borrow=True)
        for para_i in params]

    grads = T.grad(cost, params)

    # Without the epsilon a parameter whose gradient is exactly zero is
    # updated by 0 / sqrt(0) = NaN (previously only masked by the L2 term).
    adagrad_eps = 1e-5
    zero_row = theano.shared(numpy.zeros(embedding_size, dtype=theano.config.floatX))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        new_value = param_i - learning_rate * grad_i / (T.sqrt(acc) + adagrad_eps)   #AdaGrad
        if param_i is embeddings:
            # row 0 of the embedding matrix is forced back to zeros after
            # every step (presumably the padding word -- TODO confirm)
            new_value = T.set_subtensor(new_value[0], zero_row)
        updates.append((param_i, new_value))
        updates.append((acc_i, acc))

    train_model = theano.function([index], [cost,layer3.errors(y)], updates=updates,
          givens=batch_givens(indices_train_theano, trainY, trainLeftPad, trainRightPad))

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 50000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # number of minibatches between two validation passes; once per epoch
    # unless an epoch is longer than patience // 2 minibatches
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index, batch_start in enumerate(train_batch_start, 1):
            # 1-based count of minibatches processed so far, over all epochs
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            cost_ij, error_ij = train_model(batch_start)
            if iteration % n_train_batches == 0:
                print('training @ iter = '+str(iteration)+' cost: '+str(cost_ij)+' error: '+str(error_ij))
            if iteration % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = default_timer()
    print('Optimization complete.')
    # best_iter is already 1-based here, so it is reported as is.
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter, test_score * 100.))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
Example #2
0
def evaluate_lenet5(learning_rate=1.0, n_epochs=2000,
                    dataset='mnist.pkl.gz',
                    nkerns=[6, 12], batch_size=80):
    """Train and evaluate a two-layer convolutional sentence classifier.

    The graph is: embedding lookup -> Conv_DynamicK_PoolLayer ->
    ConvFoldPoolLayer -> tanh HiddenLayer -> dropout -> LogisticRegression
    with 5 outputs.  It is trained with AdaGrad on the ``5classes``
    train/dev/test files found under the hard-coded ``root`` directory.  The
    dev set is scored every ``validation_frequency`` minibatches, the test
    set is scored whenever the dev error improves, and training stops early
    once ``patience`` minibatches have been processed.  Progress is printed
    to stdout, the run time to stderr; nothing is returned.

    :type learning_rate: float
    :param learning_rate: base step size of the AdaGrad update

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: not used (the data paths are hard-coded below); kept so
                    existing callers keep working

    :type nkerns: list of ints
    :param nkerns: number of kernels on each of the two convolution layers

    :type batch_size: int
    :param batch_size: number of sentences per minibatch; samples beyond the
                       last full minibatch of a split are ignored
    """
    # Imported locally so the block needs no new file-level import.
    # time.clock() was removed in Python 3.8.
    from timeit import default_timer

    root="/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath='/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    # Same literal is handed to the reader and used as the input width below;
    # presumably the padded sentence length -- TODO confirm against read_data.
    sent_len = 60
    rng = numpy.random.RandomState(23455)
    datasets, embedding_size, embeddings = read_data(
        root+'5classes/train.txt', root+'5classes/dev.txt', root+'5classes/test.txt',
        embeddingPath, sent_len)

    indices_train, trainY, trainLengths = datasets[0]
    indices_dev, devY, devLengths = datasets[1]
    indices_test, testY, testLengths = datasets[2]
    # `//`: with true division these are floats on Python 3 and the
    # range(...) calls in the training loop raise TypeError.
    n_train_batches = indices_train.shape[0] // batch_size
    n_valid_batches = indices_dev.shape[0] // batch_size
    n_test_batches = indices_test.shape[0] // batch_size

    def int32_shared(indices):
        """Store an index matrix as a floatX shared variable, viewed as int32."""
        stored = theano.shared(numpy.asarray(indices, dtype=theano.config.floatX), borrow=True)
        return T.cast(stored, 'int32')

    indices_train_theano = int32_shared(indices_train)
    indices_dev_theano = int32_shared(indices_dev)
    indices_test_theano = int32_shared(indices_test)

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix('x_index')   # word indices, must be integer
    y = T.ivector('y')
    z = T.ivector('z')   # bound to the per-sample lengths of the split

    # Look the words up, then move the embedding axis in front of the word
    # axis so each sample really is laid out as (embedding_size, sent_len).
    # Reshaping the raw (word, dimension) order straight to that shape, as
    # this variant did before, mixes values of different words and
    # dimensions in one row.
    x = embeddings[x_index.flatten()].reshape((batch_size, sent_len, embedding_size)).transpose(0, 2, 1).flatten()
    ishape = (embedding_size, sent_len)  # per-sample input "image" shape
    filter_size1=(1,10)
    filter_size2=(1,7)
    poolsize1=(1, ishape[1]-filter_size1[1]+1)

    kmax=30 # this can not be too small, like 20
    ktop=5
    poolsize2=(1, kmax-filter_size2[1]+1)
    # dynamic k-max pooling: k grows with the sample length but never drops
    # below ktop; `//` keeps it an integer tensor on Python 3
    lengths=T.maximum(ktop,z//2+1)
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # 4D tensor (batch, channel, rows, columns) expected by the conv layers
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))

    # first convolution with per-sample k, output width unified to kmax
    layer0 = Conv_DynamicK_PoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]), poolsize=poolsize1, k=lengths, unifiedWidth=kmax)

    # second convolution, pooled with k=ktop
    layer1 = ConvFoldPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ishape[0], kmax),
            filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]), poolsize=poolsize2, k=ktop)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_features)
    layer2_input = layer1.output.flatten(2)

    # `//` keeps n_in an integer on Python 3; the halving presumably mirrors
    # the fold step of ConvFoldPoolLayer -- TODO confirm.
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * (embedding_size // 2) * ktop,
                         n_out=50, activation=T.tanh)
    dropout=dropout_from_layer(rng, layer2.output, 0.5)
    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=dropout, n_in=50, n_out=5)

    # the cost we minimize during training is the NLL of the model plus the
    # weighted penalties (the L1 term is currently switched off by its weight)
    L1_reg= abs(layer3.W).sum() + abs(layer2.W).sum() +abs(layer1.W).sum()+abs(layer0.W).sum()+abs(embeddings).sum()
    L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum()+ (layer1.W** 2).sum()+(layer0.W** 2).sum()+(embeddings**2).sum()
    cost = layer3.negative_log_likelihood(y)+0*L1_reg+0.0000001*L2_reg

    def batch_givens(indices, labels, sample_lengths):
        """Bind the symbolic inputs to minibatch number ``index``."""
        rows = slice(index * batch_size, (index + 1) * batch_size)
        return {x_index: indices[rows],
                y: labels[rows],
                z: sample_lengths[rows]}

    # functions computing the mistakes that are made by the model
    # NOTE(review): both read layer3, whose input is the dropout output, so
    # the errors are measured through the dropout layer -- confirm intended.
    test_model = theano.function([index], layer3.errors(y),
             givens=batch_givens(indices_test_theano, testY, testLengths))

    validate_model = theano.function([index], layer3.errors(y),
            givens=batch_givens(indices_dev_theano, devY, devLengths))

    # all model parameters to be fit by gradient descent; the embedding
    # matrix is deliberately last (see grads[-1] below)
    params = layer3.params + layer2.params + layer1.params + layer0.params+[embeddings]

    # one AdaGrad accumulator (running sum of squared gradients) per parameter
    accumulator = [
        theano.shared(numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX),
                      borrow=True)
        for para_i in params]

    grads = T.grad(cost, params)
    # zero the gradient of embedding row 0 so that row is never updated
    # (presumably the padding word -- TODO confirm)
    grads[-1] = T.set_subtensor(
        grads[-1][0],
        theano.shared(numpy.zeros(embedding_size, dtype=theano.config.floatX)))

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        # the 1e-5 keeps the step finite where the accumulator is still zero
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+ 1e-5)))
        updates.append((acc_i, acc))

    train_model = theano.function([index], cost, updates=updates,
          givens=batch_givens(indices_train_theano, trainY, trainLengths))

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # number of minibatches between two validation passes; once per epoch
    # unless an epoch is longer than patience // 2 minibatches
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches): # each batch
            # 0-based count of minibatches processed before this one
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter = ', iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print(('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.)))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print((('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.)))

            if patience <= iteration:
                done_looping = True
                break

    end_time = default_timer()
    print('Optimization complete.')
    # best_iter is 0-based in this variant, hence the + 1 in the report
    print(('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.)))
    print(('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
Example #3
0
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=2000,
                    nkerns=[6, 14],
                    batch_size=70,
                    useAllSamples=0,
                    kmax=30,
                    ktop=4,
                    filter_size=[7, 5],
                    hidden_units=50,
                    L2_weight=0.000005,
                    dropout_p=0.2,
                    useEmb=1):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    embeddingPath2 = '/mounts/data/proj/wenpeng/MC/src/released_embedding.txt'
    rng = numpy.random.RandomState(99999)
    datasets, embedding_size, embeddings = read_data_WP(
        root + '2classes/train.txt', root + '2classes/dev.txt',
        root + '2classes/test.txt', embeddingPath, 60, useEmb)
    #datasets, embedding_size, embeddings=read_data(root+'2classes/train.txt', root+'2classes/dev.txt', root+'2classes/test.txt', embeddingPath,60)

    #datasets = load_data(dataset)
    indices_train, trainY, trainLengths, trainLeftPad, trainRightPad = datasets[
        0]
    indices_dev, devY, devLengths, devLeftPad, devRightPad = datasets[1]
    indices_test, testY, testLengths, testLeftPad, testRightPad = datasets[2]
    n_train_batches = indices_train.shape[0] / batch_size
    n_valid_batches = indices_dev.shape[0] / batch_size
    n_test_batches = indices_test.shape[0] / batch_size

    train_batch_start = []
    dev_batch_start = []
    test_batch_start = []
    if useAllSamples:
        train_batch_start = list(
            numpy.arange(n_train_batches) *
            batch_size) + [indices_train.shape[0] - batch_size]
        dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size) + [
            indices_dev.shape[0] - batch_size
        ]
        test_batch_start = list(numpy.arange(n_test_batches) * batch_size) + [
            indices_test.shape[0] - batch_size
        ]
        n_train_batches = n_train_batches + 1
        n_valid_batches = n_valid_batches + 1
        n_test_batches = n_test_batches + 1
    else:
        train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
        dev_batch_start = list(numpy.arange(n_valid_batches) * batch_size)
        test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_theano = theano.shared(numpy.asarray(
        indices_train, dtype=theano.config.floatX),
                                         borrow=True)
    indices_dev_theano = theano.shared(numpy.asarray(
        indices_dev, dtype=theano.config.floatX),
                                       borrow=True)
    indices_test_theano = theano.shared(numpy.asarray(
        indices_test, dtype=theano.config.floatX),
                                        borrow=True)
    indices_train_theano = T.cast(indices_train_theano, 'int32')
    indices_dev_theano = T.cast(indices_dev_theano, 'int32')
    indices_test_theano = T.cast(indices_test_theano, 'int32')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix(
        'x_index')  # now, x is the index matrix, must be integer
    y = T.ivector('y')
    #z = T.ivector('z')
    left = T.ivector('left')
    right = T.ivector('right')

    x = embeddings[x_index.flatten()].reshape(
        (batch_size, 60, embedding_size)).transpose(0, 2, 1).flatten()
    ishape = (embedding_size, 60)  # this is the size of MNIST images
    filter_size1 = (1, filter_size[0])
    filter_size2 = (1, filter_size[1])
    #poolsize1=(1, ishape[1]-filter_size1[1]+1) #?????????????????????????????
    poolsize1 = (1, ishape[1] + filter_size1[1] - 1)

    left_after_conv = left
    right_after_conv = right

    #kmax=30 # this can not be too small, like 20
    #ktop=6
    #poolsize2=(1, kmax-filter_size2[1]+1) #(1,6)
    poolsize2 = (1, kmax + filter_size2[1] - 1)  #(1,6)
    #dynamic_lengths=T.maximum(ktop,z/2+1)  # dynamic k-max pooling
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))

    layer1 = ConvFoldPoolLayer(rng,
                               input=layer0_input,
                               image_shape=(batch_size, 1, ishape[0],
                                            ishape[1]),
                               filter_shape=(nkerns[0], 1, filter_size1[0],
                                             filter_size1[1]),
                               poolsize=poolsize1,
                               k=ktop,
                               left=left_after_conv,
                               right=right_after_conv)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (20,32*4*4) = (20,512)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer, the output of layers has nkerns[1]=50 images, each is 4*4 size
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[0] * (embedding_size / 2) * ktop,
                         n_out=hidden_units,
                         activation=T.tanh)

    dropout = dropout_from_layer(rng, layer2.output, dropout_p)  #dropout
    layer3 = LogisticRegression(rng, input=dropout, n_in=hidden_units, n_out=2)

    #layer3 = LogisticRegression(rng, input=layer2.output, n_in=50, n_out=2)
    # the cost we minimize during training is the NLL of the model
    #L1_reg= abs(layer3.W).sum() + abs(layer2.W).sum() +abs(layer1.W).sum()+abs(layer0.W).sum()+abs(embeddings).sum()
    L2_reg = (layer3.W**2).sum() + (layer2.W**2).sum() + (
        layer1.W**2).sum() + (embeddings**2).sum()
    #L2_reg = (layer3.W** 2).sum() + (layer2.W** 2).sum()+(layer0.W** 2).sum()+(embeddings**2).sum()
    #cost must have L2, otherwise, will produce nan, while with L2, each word embedding will be updated
    cost = layer3.negative_log_likelihood(y) + L2_weight * L2_reg

    #cost = layer3.negative_log_likelihood(y)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x_index: indices_test_theano[index:index + batch_size],
            y: testY[index:index + batch_size],
            left: testLeftPad[index:index + batch_size],
            right: testRightPad[index:index + batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x_index: indices_dev_theano[index:index + batch_size],
            y: devY[index:index + batch_size],
            left: devLeftPad[index:index + batch_size],
            right: devRightPad[index:index + batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + [embeddings]
    #params = layer3.params + layer2.params + layer0.params+[embeddings]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    '''   
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))
    
    '''
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        if param_i == embeddings:
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i - learning_rate * grad_i / T.sqrt(acc))[0],
                     theano.shared(numpy.zeros(embedding_size)))))  #AdaGrad
        else:
            updates.append(
                (param_i,
                 param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, layer3.errors(y)],
        updates=updates,
        givens={
            x_index: indices_train_theano[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left: trainLeftPad[index:index + batch_size],
            right: trainRightPad[index:index + batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            cost_ij, error_ij = train_model(batch_start)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' cost: ' + str(
                    cost_ij) + ' error: ' + str(error_ij)
            if iter % validation_frequency == 0:

                # compute zero-one loss on validation set
                #validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                validation_losses = [
                    validate_model(i) for i in dev_batch_start
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print((
                        '\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                        'model %f %%') % (epoch, minibatch_index,
                                          n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Example #4
# 0
def evaluate_lenet5(learning_rate=1.0, n_epochs=2000,
                    dataset='mnist.pkl.gz',
                    nkerns=[6, 12], batch_size=80):
    """ Train and evaluate a convolutional sentence classifier with AdaGrad.

    The function reads the ``5classes`` train/dev/test files and a word
    embedding file from hard-coded paths via ``read_data``, builds the graph
    embedding lookup -> ``Conv_DynamicK_PoolLayer`` -> ``ConvFoldPoolLayer``
    -> ``HiddenLayer`` (tanh) -> dropout -> ``LogisticRegression``, and
    trains every layer plus the embedding matrix with AdaGrad.  Validation
    error is measured every ``validation_frequency`` minibatches; whenever it
    improves, the test error is measured as well.  Progress is printed to
    stdout and the total running time to stderr.  Nothing is returned.

    :type learning_rate: float
    :param learning_rate: global step size of the AdaGrad update

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: unused; kept only so existing callers keep working (the
                    data paths are hard-coded below)

    :type nkerns: list of ints
    :param nkerns: number of kernels of the first and second convolution
                   layer

    :type batch_size: int
    :param batch_size: number of sentences per minibatch; trailing samples
                       that do not fill a whole minibatch are never used
    """

    root = "/mounts/data/proj/wenpeng/Dataset/StanfordSentiment/stanfordSentimentTreebank/"
    embeddingPath = '/mounts/data/proj/wenpeng/Downloads/hlbl-embeddings-original.EMBEDDING_SIZE=50.txt'
    rng = numpy.random.RandomState(23455)

    # Number of word slots per sentence.  The same value is handed to
    # read_data -- presumably the padded sentence length; confirm against
    # read_data.
    sent_len = 60
    datasets, embedding_size, embeddings = read_data(
        root + '5classes/train.txt', root + '5classes/dev.txt',
        root + '5classes/test.txt', embeddingPath, sent_len)

    indices_train, trainY, trainLengths = datasets[0]
    indices_dev, devY, devLengths = datasets[1]
    indices_test, testY, testLengths = datasets[2]
    # Explicit floor division: only complete minibatches are processed.
    n_train_batches = indices_train.shape[0] // batch_size
    n_valid_batches = indices_dev.shape[0] // batch_size
    n_test_batches = indices_test.shape[0] // batch_size

    # The word indices are stored as floatX shared variables and cast back to
    # int32 symbolically (presumably so the data can be kept on the GPU --
    # TODO confirm).
    indices_train_theano = T.cast(
        theano.shared(numpy.asarray(indices_train, dtype=theano.config.floatX),
                      borrow=True), 'int32')
    indices_dev_theano = T.cast(
        theano.shared(numpy.asarray(indices_dev, dtype=theano.config.floatX),
                      borrow=True), 'int32')
    indices_test_theano = T.cast(
        theano.shared(numpy.asarray(indices_test, dtype=theano.config.floatX),
                      borrow=True), 'int32')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x_index = T.imatrix('x_index')  # matrix of word indices, must be integer
    y = T.ivector('y')  # gold labels
    z = T.ivector('z')  # sentence lengths

    # Look up the embeddings and lay every sentence out as an
    # (embedding_size, sent_len) map.  The lookup yields the values word by
    # word, so the word and embedding axes have to be swapped explicitly;
    # reshaping the flat word-major buffer straight to
    # (embedding_size, sent_len) would mix different words and embedding
    # dimensions inside one row, and the (1, w) filters below would no
    # longer slide over consecutive words.
    x = embeddings[x_index.flatten()].reshape(
        (batch_size, sent_len, embedding_size)).dimshuffle(0, 2, 1)
    ishape = (embedding_size, sent_len)  # (height, width) of one sentence map
    filter_size1 = (1, 10)
    filter_size2 = (1, 7)
    poolsize1 = (1, ishape[1] - filter_size1[1] + 1)

    kmax = 30  # this can not be too small, like 20
    ktop = 5
    poolsize2 = (1, kmax - filter_size2[1] + 1)
    # dynamic k-max pooling: k depends on the sentence length but is never
    # smaller than ktop ('//' keeps the integer division explicit)
    lengths = T.maximum(ktop, z // 2 + 1)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # 4D input expected by the convolution layers:
    # (batch_size, 1, embedding_size, sent_len)
    layer0_input = x.reshape((batch_size, 1, ishape[0], ishape[1]))

    # first convolution layer with dynamic k-max pooling; its output is
    # declared to the next layer as (batch_size, nkerns[0], ishape[0], kmax)
    layer0 = Conv_DynamicK_PoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size1[0], filter_size1[1]),
            poolsize=poolsize1, k=lengths, unifiedWidth=kmax)

    # second convolution layer with folding and pooling down to ktop values
    layer1 = ConvFoldPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], ishape[0], kmax),
            filter_shape=(nkerns[1], nkerns[0], filter_size2[0], filter_size2[1]),
            poolsize=poolsize2, k=ktop)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_features)
    layer2_input = layer1.output.flatten(2)

    # fully-connected tanh layer; n_in assumes the fold layer halves the
    # embedding axis and keeps ktop columns per feature map
    hidden_size = 50
    n_classes = 5
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[1] * (embedding_size // 2) * ktop,
                         n_out=hidden_size, activation=T.tanh)
    # NOTE(review): test_model and validate_model below are built on the same
    # dropout output; if dropout_from_layer always applies a random mask,
    # the reported dev/test errors are stochastic -- confirm.
    dropout = dropout_from_layer(rng, layer2.output, 0.5)
    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=dropout, n_in=hidden_size,
                                n_out=n_classes)

    # the cost we minimize during training is the NLL of the model plus
    # weighted L1/L2 penalties (the L1 term is currently switched off)
    L1_weight = 0
    L2_weight = 0.0000001
    L1_reg = abs(layer3.W).sum() + abs(layer2.W).sum() + abs(layer1.W).sum() + abs(layer0.W).sum() + abs(embeddings).sum()
    L2_reg = (layer3.W ** 2).sum() + (layer2.W ** 2).sum() + (layer1.W ** 2).sum() + (layer0.W ** 2).sum() + (embeddings ** 2).sum()
    cost = layer3.negative_log_likelihood(y) + L1_weight * L1_reg + L2_weight * L2_reg

    # functions computing the mistakes made by the model on one minibatch
    test_model = theano.function([index], layer3.errors(y),
             givens={
                x_index: indices_test_theano[index * batch_size: (index + 1) * batch_size],
                y: testY[index * batch_size: (index + 1) * batch_size],
                z: testLengths[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x_index: indices_dev_theano[index * batch_size: (index + 1) * batch_size],
                y: devY[index * batch_size: (index + 1) * batch_size],
                z: devLengths[index * batch_size: (index + 1) * batch_size]})

    # all model parameters to be fit by gradient descent; the embedding
    # matrix must stay last because its gradient is patched via grads[-1]
    params = layer3.params + layer2.params + layer1.params + layer0.params + [embeddings]

    # one AdaGrad accumulator (running sum of squared gradients) per parameter
    accumulator = [
        theano.shared(numpy.zeros_like(para_i.get_value(borrow=True),
                                       dtype=theano.config.floatX),
                      borrow=True)
        for para_i in params
    ]

    # gradients for all model parameters
    grads = T.grad(cost, params)
    # zero the gradient of embedding row 0 so that row is never updated
    # (presumably the padding token -- TODO confirm against read_data)
    grads[-1] = T.set_subtensor(grads[-1][0],
                                theano.shared(numpy.zeros(embedding_size)))

    adagrad_eps = 1e-5  # keeps the denominator away from zero
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + adagrad_eps)))
        updates.append((acc_i, acc))

    train_model = theano.function([index], cost, updates=updates,
          givens={
            x_index: indices_train_theano[index * batch_size: (index + 1) * batch_size],
            y: trainY[index * batch_size: (index + 1) * batch_size],
            z: trainLengths[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # process at least this many minibatches
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # number of minibatches between two validation runs; with the values
    # above this means once per epoch unless an epoch exceeds patience // 2
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            # zero-based count of minibatches processed so far, over all epochs
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                # the two spaces reproduce the output of the former
                # "print 'training @ iter = ', iter" statement
                print('training @ iter =  ' + str(iteration))
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))