Example #1
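Both examples below assume the usual Theano-tutorial imports plus some project-local helpers (dA_joint, load_vc, SdA, SdA_regress, HiddenLayer, tile_raster_images). The header below is a hedged reconstruction; the local-module paths marked "hypothetical" are guesses, not taken from the original source (SdA_mapping is referenced in the original's own comments).

# Plausible import header (reconstructed); local-module paths marked
# "hypothetical" are guesses, not taken from the original source.
import os
import sys
import time

import numpy
import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

try:
    import PIL.Image as Image
except ImportError:
    import Image

from utils import tile_raster_images    # deeplearning.net tutorial helper
from SdA_mapping import load_data_half  # module named in original comments
from dA import dA_joint, load_vc        # hypothetical local module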
def test_dA(learning_rate=0.01,
            training_epochs=15000,
            dataset='mnist.pkl.gz',
            batch_size=5,
            output_folder='dA_plots'):
    """
    This demo was written for MNIST but, as modified here, trains on the
    paired dataset returned by load_vc.

    :type learning_rate: float
    :param learning_rate: learning rate used for training the denoising
                          autoencoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset (unused in this variant)

    :type batch_size: int
    :param batch_size: size of each training minibatch

    :type output_folder: string
    :param output_folder: directory in which plots are saved

    """
    print 'loading data'
    datasets, x_mean, y_mean, x_std, y_std = load_vc()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print 'loaded data'

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x1 = T.matrix('x1')  # first view of the data
    x2 = T.matrix('x2')  # second view of the data
    cor_reg = T.scalar('cor_reg')
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
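    # NOTE: os.chdir changes the working directory for the rest of the run,
    # so relative paths below (e.g. the saved filter PNG) resolve inside
    # output_folder.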
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    print 'initializing functions'

    da = dA_joint(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input1=x1,
        input2=x2,
        cor_reg=cor_reg,
        n_visible1=24,
        n_visible2=24,
        n_hidden=50)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)
    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x1: train_set_x[index * batch_size:(index + 1) * batch_size],
            x2: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    fprop_x1 = theano.function([],
                               outputs=da.output1,
                               givens={x1: test_set_x},
                               name='fprop_x1')
    fprop_x2 = theano.function([],
                               outputs=da.output2,
                               givens={x2: test_set_y},
                               name='fprop_x2')
    fprop_x1t = theano.function([],
                                outputs=da.output1,
                                givens={x1: train_set_x},
                                name='fprop_x1t')
    fprop_x2t = theano.function([],
                                outputs=da.output2,
                                givens={x2: train_set_y},
                                name='fprop_x2t')
    rec_x1 = theano.function([],
                             outputs=da.rec1,
                             givens={x1: test_set_x},
                             name='rec_x1')
    rec_x2 = theano.function([],
                             outputs=da.rec2,
                             givens={x2: test_set_y},
                             name='rec_x2')
    fprop_x1_to_x2 = theano.function([],
                                     outputs=da.reg,
                                     givens={x1: test_set_x},
                                     name='fprop_x12x2')
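    # The pair below anneals the correlation regulariser: each call to
    # update_reg() adds 0.1 to da.cor_reg, so the correlation term is
    # weighted more heavily as training proceeds.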
    updates_reg = [(da.cor_reg,
                    da.cor_reg + theano.shared(numpy.float32(0.1)))]
    update_reg = theano.function([], updates=updates_reg)
    print 'done initializing functions'

    start_time = time.clock()

    ############
    # TRAINING #
    ############
    print 'training started'
    X1 = test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    X2 = test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    from dcca_numpy import cor_cost
    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through the training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        update_reg()

        X1H = rec_x1()
        X2H = rec_x2()
        X1H *= x_std
        X1H += x_mean
        X2H *= y_std
        X2H += y_mean
        H1 = fprop_x1()
        H2 = fprop_x2()
        print 'Training epoch %d, cost %f' % (epoch, numpy.mean(c))
        print 'Reconstruction ', numpy.mean(numpy.mean((X1H - X1)**2, 1)), \
              numpy.mean(numpy.mean((X2H - X2)**2, 1))

        if epoch % 5 == 2:  # pretrain middle layer
            print '... pre-training MIDDLE layer'
            H1t = fprop_x1t()
            H2t = fprop_x2t()
            h1 = T.matrix('x')  # hidden code of the first view
            h2 = T.matrix('y')  # hidden code of the second view
            from mlp import HiddenLayer
            numpy_rng = numpy.random.RandomState(89677)
            log_reg = HiddenLayer(numpy_rng, h1, 50, 50, activation=T.tanh)

            if 1:  # for middle layer
                learning_rate = 0.1

                #H1=theano.shared(H1)
                #H2=theano.shared(H2)
                # compute the gradients with respect to the model parameters
                logreg_cost = log_reg.mse(h2)

                gparams = T.grad(logreg_cost, log_reg.params)

                # compute list of fine-tuning updates
                updates = [(param, param - gparam * learning_rate)
                           for param, gparam in zip(log_reg.params, gparams)]

                train_fn_middle = theano.function(inputs=[],
                                                  outputs=logreg_cost,
                                                  updates=updates,
                                                  givens={
                                                      h1: theano.shared(H1t),
                                                      h2: theano.shared(H2t)
                                                  },
                                                  name='train_middle')
            mid_epoch = 0
            while mid_epoch < 100:
                print mid_epoch, train_fn_middle()
                mid_epoch += 1

            X2H = numpy.tanh(H1.dot(log_reg.W.eval()) + log_reg.b.eval())
            X2H = numpy.tanh(X2H.dot(da.W2_prime.eval()) + da.b2_prime.eval())

            X2H *= y_std
            X2H += y_mean
            print 'Regression ', numpy.mean(numpy.mean((X2H - X2)**2, 1))

        print 'Correlation ', cor_cost(H1, H2)
    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((training_time) / 60.))
    # da.W1 here is 24 x 50, so each of the 50 filters is displayed as a
    # 4 x 6 patch (the 28 x 14 MNIST half-image shape no longer applies)
    image = Image.fromarray(
        tile_raster_images(X=da.W1.get_value(borrow=True).T,
                           img_shape=(4, 6),
                           tile_shape=(5, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    from matplotlib import pyplot as pp
    pp.plot(H1[:10, :2], 'b')
    pp.plot(H2[:10, :2], 'r')
    pp.show()

    print 'Final correlation ', cor_cost(H1, H2)
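
For completeness, a minimal sketch of how the demo might be launched; the argument values simply echo the defaults of the signature above, and nothing here comes from the original source.

# Hypothetical entry point for the demo above (illustrative only).
if __name__ == '__main__':
    test_dA(learning_rate=0.01,
            training_epochs=15000,
            batch_size=5,
            output_folder='dA_plots')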
Example #2
def test_SdA_regress(finetune_lr=0.05, pretraining_epochs=10,
                     pretrain_lr=0.1, training_epochs=10000,
                     dataset='mnist.pkl.gz', batch_size=20):
    datasets = load_data_half(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    train_set_x = train_set_x.eval()
    train_set_y = train_set_y.eval()
    import theano
    # the labelled and unlabelled splits both use the full training set here
    train_set_x_lab = train_set_x[:, :]
    train_set_x_unlab = train_set_x[:, :]
    train_set_y_lab = train_set_y[:, :]
    train_set_y_unlab = train_set_y[:, :]
    train_set_x_lab = theano.shared(numpy.asarray(train_set_x_lab,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    train_set_y_lab = theano.shared(numpy.asarray(train_set_y_lab,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    train_set_x_unlab = theano.shared(numpy.asarray(train_set_x_unlab,
                                                    dtype=theano.config.floatX),
                                      borrow=True)
    train_set_y_unlab = theano.shared(numpy.asarray(train_set_y_unlab,
                                                    dtype=theano.config.floatX),
                                      borrow=True)

    # compute number of minibatches for training, validation and testing
    n_train_batches_l = train_set_y_lab.eval().shape[0]
    n_train_batches_l /= batch_size
    n_train_batches_u = train_set_y_unlab.eval().shape[0]
    n_train_batches_u /= batch_size

    # numpy random generator
    numpy_rng = numpy.random.RandomState(89677)
    print '... building the model'
    # construct the stacked denoising autoencoder class
    hidden_layer_size = 100
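    # Two single-hidden-layer SdAs, one per view: SdA_inp encodes the input
    # half-image (392 = 28*28/2 pixels) and SdA_out the output half, both
    # into hidden_layer_size units.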
    SdA_inp = SdA(numpy_rng,
                  n_ins=392,
                  hidden_layers_sizes=[hidden_layer_size]
    )
    SdA_out = SdA(numpy_rng,
                  n_ins=392,
                  hidden_layers_sizes=[hidden_layer_size]
    )
        
    # PRETRAINING THE MODEL #
    if 0:  # pretrain the input AE (disabled)
        print '... getting the pretraining functions for INPUT AE'
        pretraining_fns = SdA_inp.pretraining_functions(train_set_x=train_set_x_unlab,
                                                    batch_size=batch_size)
    
        print '... pre-training the model'
        start_time = time.clock()
        ## Pre-train layer-wise
        corruption_levels = [.1, .2, .3]
        for i in xrange(SdA_inp.n_layers):
            # go through pretraining epochs
            for epoch in xrange(pretraining_epochs):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches_u):
                    c.append(pretraining_fns[i](index=batch_index,
                             corruption=corruption_levels[i],
                             lr=pretrain_lr))
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)
    
        end_time = time.clock()
    
        print >> sys.stderr, ('The pretraining code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
    if 0:  # pretrain the output AE (disabled)
        print '... getting the pretraining functions for OUTPUT AE'
        pretraining_fns = SdA_out.pretraining_functions(train_set_x=train_set_y_unlab,
                                                    batch_size=batch_size)
    
        print '... pre-training the model'
        start_time = time.clock()
        ## Pre-train layer-wise
        corruption_levels = [.5, .2, .3]
        for i in xrange(SdA_out.n_layers):
            # go through pretraining epochs
            for epoch in xrange(pretraining_epochs):
                # go through the training set
                c = []
                for batch_index in xrange(n_train_batches_u):
                    c.append(pretraining_fns[i](index=batch_index,
                             corruption=corruption_levels[i],
                             lr=pretrain_lr))
                print 'Pre-training layer %i, epoch %d, cost ' % (i, epoch),
                print numpy.mean(c)
    
        end_time = time.clock()
    
        print >> sys.stderr, ('The pretraining code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
    
        
    if 0:  # save the AEs (disabled)
        import pickle
        f = open('aes_shallow_sig_nobias.pkl', 'wb')
        pickle.dump(SdA_inp, f)
        pickle.dump(SdA_out, f)
        f.flush()
        f.close()
    if 0:  # load the AEs (disabled)
        import pickle
        f = open('aes_shallow_sig_nobias.pkl', 'rb')
        SdA_inp = pickle.load(f)
        SdA_out = pickle.load(f)
        f.close()
   
    if 1:  # CCA stage
        from dcca_numpy import netCCA_nobias, netCCA, dCCA
        from mlp_numpy import (expit, logistic_prime, linear, linear_prime,
                               relu, relu_prime, tanh, tanh_prime)
        train_y1 = train_set_x_lab.eval()
        train_y2 = train_set_y_lab.eval()
        test_y1 = test_set_x.eval()
        test_y2 = test_set_y.eval()

        param1 = ((train_y1.shape[1], 0, 0),
                  (hidden_layer_size, expit, logistic_prime))
        param2 = ((train_y2.shape[1], 0, 0),
                  (hidden_layer_size, expit, logistic_prime))
        W1s = []
        for i in range(len(SdA_inp.dA_layers)):
            W1s.append(SdA_inp.dA_layers[i].W.T.eval())
        W2s = []
        for i in range(len(SdA_out.dA_layers)):
            W2s.append(SdA_out.dA_layers[i].W.T.eval())

        numpy.random.seed(0)
        N1 = netCCA_nobias(train_y1, param1, W1s)
        N2 = netCCA_nobias(train_y2, param2, W2s)
        N = dCCA(train_y1, train_y2, N1, N2)
        N1.reconstruct(test_set_x.eval()[0, :])
        cnt = 0
        from dcca_numpy import cca_cost, cca, order_cost, cor_cost
        while True:
            X = N1.predict(test_set_x.eval())
            Y = N2.predict(test_set_y.eval())
            _H1 = numpy.dot(X, N.A1)
            _H2 = numpy.dot(Y, N.A2)
            print '****', cnt, cor_cost(_H1, _H2)
            X1_rec = numpy.tanh(X.dot(N1.weights[0]))
            X2_rec = numpy.tanh(Y.dot(N2.weights[0]))
            param = ((hidden_layer_size, 0, 0),
                     (hidden_layer_size, relu, relu_prime))
            from mlp_numpy import NeuralNetwork as NN

            lr = NN(X, Y, param)
            lr.train(X[:, :], Y[:, :], 10, 0.005)
            Yh = lr.predict(X[:, :])
            X2_reg = N2.fs[-1](numpy.dot(Yh, N2.weights[0]))

            print '****', 'mse1:', numpy.mean((X1_rec - test_set_x.eval())**2.0)
            print '****', 'mse2:', numpy.mean((X2_rec - test_set_y.eval())**2.0)
            print '****', 'mse_map:', numpy.mean((X2_reg - test_set_y.eval())**2.0)

            if cnt % 2:
                N.train(5, True, 10000.0)
            else:
                N.train(5, False, 10000.0)

            cnt += 1
            import pickle
            f = open('netcca.pkl', 'wb')
            pickle.dump(N, f)
            f.flush()
            f.close()
            if cnt == 200:
                break
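        # Copy the CCA-trained numpy weights (and biases) back into the
        # Theano dA layers, so the stacked regressor below starts from the
        # dCCA solution.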
        for i in range(len(SdA_inp.dA_layers)):
            SdA_inp.dA_layers[i].W = theano.shared(N1.weights[i].T)
            SdA_inp.dA_layers[i].b = theano.shared(N1.biases[i][:, 0])

        for i in range(len(SdA_out.dA_layers)):
            SdA_out.dA_layers[i].W = theano.shared(N2.weights[i].T)
            SdA_out.dA_layers[i].b = theano.shared(N2.biases[i][:, 0])

        
    if 1:  # pretrain the middle layer
        print '... pre-training MIDDLE layer'

        h1 = T.matrix('x')  # hidden code of the input-side SdA
        h2 = T.matrix('y')  # hidden code of the output-side SdA
        log_reg = HiddenLayer(numpy_rng, h1, hidden_layer_size,
                              hidden_layer_size)

        if 1:  # for middle layer
            learning_rate = 0.01
            fprop_inp = theano.function(
                [],
                SdA_inp.sigmoid_layers[-1].output,
                givens={
                    SdA_inp.sigmoid_layers[0].input: train_set_x_lab
                },
                name='fprop_inp'
            )
            fprop_out = theano.function(
                [],
                SdA_out.sigmoid_layers[-1].output,
                givens={
                    SdA_out.sigmoid_layers[0].input: train_set_y_lab
                },
                name='fprop_out'
            )
            H1 = fprop_inp()
            H2 = fprop_out()
            H1 = theano.shared(H1)
            H2 = theano.shared(H2)
            # compute the gradients with respect to the model parameters
            logreg_cost = log_reg.mse(h2)

            gparams = T.grad(logreg_cost, log_reg.params)
    
            # compute list of fine-tuning updates
            updates = [
                (param, param - gparam * learning_rate)
                for param, gparam in zip(log_reg.params, gparams)
            ]

            train_fn_middle = theano.function(
                inputs=[],
                outputs=logreg_cost,
                updates=updates,
                givens={
                    h1: H1,
                    h2: H2
                },
                name='train_middle'
            )
        epoch = 0
        while epoch < 10:
            print epoch, train_fn_middle()
            epoch += 1
            
    sda = SdA_regress(
        SdA_inp,
        SdA_out,
        log_reg,
        numpy_rng=numpy_rng,
        n_inp=28*28//2,
        hidden_layers_sizes_inp=[hidden_layer_size],
        hidden_layers_sizes_out=[hidden_layer_size],
        n_out=28*28//2
    )
    
    # FINETUNING THE MODEL #

    # get the training, validation and testing function for the model
    print '... getting the finetuning functions'
    train_fn, validate_model, test_model = sda.build_finetune_functions(
        datasets=datasets,
        batch_size=batch_size,
        learning_rate=finetune_lr
    )
    
        
    print '... fine-tuning the model'
    # early-stopping parameters
    patience = 10 * n_train_batches_l  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches_l, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
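    # Illustration with hypothetical numbers: batch_size=20 and 1000
    # labelled examples give n_train_batches_l = 50, so patience starts at
    # 500 iterations and validation runs every min(50, 250) = 50
    # minibatches, i.e. once per epoch.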

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    fprop = theano.function(
        [],
        sda.sigmoid_layers[-1].output,
        givens={
            sda.x: test_set_x
        },
        name='fprop'
    )
    while epoch < training_epochs and not done_looping:
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches_l):
            minibatch_avg_cost = train_fn(minibatch_index)
            iter = (epoch - 1) * n_train_batches_l + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches_l,
                       this_validation_loss))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches_l,
                           test_score))

            if patience <= iter:
                done_looping = True
                break
            if 0:  # visualise weights (disabled)
                fprop = theano.function(
                    [],
                    sda.sigmoid_layers[-1].output,
                    givens={
                        sda.x: test_set_x
                    },
                    name='fprop'
                )
                yh = fprop()
    end_time = time.clock()
    print(
        (
            'Optimization complete with best validation score of %f %%, '
            'on iteration %i, '
            'with test performance %f %%'
        )
        % (best_validation_loss, best_iter + 1, test_score)
    )
    print >> sys.stderr, ('The training code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
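
Both examples hand-roll the same plain SGD step, param := param - learning_rate * gradient, instead of using an optimiser. Below is a self-contained numpy sketch of that pattern on a toy linear least-squares problem; the model and data are illustrative only and do not depend on the code above.

# Minimal sketch of the manual SGD update used throughout the examples.
import numpy

numpy.random.seed(0)
X = numpy.random.randn(100, 3)             # toy inputs
W_true = numpy.random.randn(3, 2)
Y = X.dot(W_true)                          # toy targets
W = numpy.zeros((3, 2))                    # parameter to learn
learning_rate = 0.1
for step in range(200):
    Yh = X.dot(W)                          # forward pass
    grad = 2.0 * X.T.dot(Yh - Y) / Y.size  # gradient of the mean squared error
    W -= learning_rate * grad              # the update rule used above
print 'final mse', numpy.mean((X.dot(W) - Y) ** 2)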