Example #1
0
def test_mlp(initial_learning_rate,
             learning_rate_decay,
             squared_filter_length_limit,
             n_epochs,
             batch_size,
             mom_params,
             activations,
             dropout,
             dropout_rates,
             results_file_name,
             layer_sizes,
             dataset,
             use_bias,
             random_seed=1234):
    """
    The dataset is the one from the mlp demo on deeplearning.net.  This training
    function is lifted from there almost exactly.
    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    assert len(layer_sizes) - 1 == len(dropout_rates)

    # extract the params for momentum
    mom_start = mom_params["start"]
    mom_end = mom_params["end"]
    mom_epoch_interval = mom_params["interval"]

    from utils import load_vc
    #datasets = load_mnist(dataset)
    print '... loading the data'
    dataset = 'c2s.npy'
    datasets, x_mean, y_mean, x_std, y_std = load_vc(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.matrix('y')  # the labels are presented as 1D vector of
    # [int] labels
    learning_rate = theano.shared(
        np.asarray(initial_learning_rate, dtype=theano.config.floatX))

    rng = np.random.RandomState(random_seed)

    # construct the MLP class

    if 1:  # load
        f = open('c2s_pre.npy.dnn.pkl', 'r')
        pretrained = cPickle.load(f)
        f.close()
    else:
        pretrained = None
    classifier = MLP(rng=rng,
                     input=x,
                     layer_sizes=layer_sizes,
                     dropout_rates=dropout_rates,
                     activations=activations,
                     use_bias=use_bias,
                     pretrained=pretrained)

    # Build the expresson for the cost function.
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })
    test_fprop = theano.function(inputs=[],
                                 outputs=classifier.layers[-1].output,
                                 givens={x: test_set_x})
    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(dropout_cost if dropout else cost, param)
        gparams.append(gparam)

    # ... and allocate mmeory for momentum'd versions of the gradient
    gparams_mom = []
    for param in classifier.params:
        gparam_mom = theano.shared(
            np.zeros(param.get_value(borrow=True).shape,
                     dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)

    # Compute momentum for the current epoch
    mom = ifelse(
        epoch < mom_epoch_interval,
        mom_start * (1.0 - epoch / mom_epoch_interval) + mom_end *
        (epoch / mom_epoch_interval), mom_end)

    # Update the step direction using momentum
    updates = OrderedDict()
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        # Misha Denil's original version
        #updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam

        # change the update rule to match Hinton's dropout paper
        updates[gparam_mom] = mom * gparam_mom - (1. -
                                                  mom) * learning_rate * gparam

    # ... and take a step along that direction
    for param, gparam_mom in zip(classifier.params, gparams_mom):
        # Misha Denil's original version
        #stepped_param = param - learning_rate * updates[gparam_mom]

        # since we have included learning_rate in gparam_mom, we don't need it
        # here
        stepped_param = param + updates[gparam_mom]

        # This is a silly hack to constrain the norms of the rows of the weight
        # matrices.  This just checks if there are two dimensions to the
        # parameter and constrains it if so... maybe this is a bit silly but it
        # should work for now.
        if param.get_value(borrow=True).ndim == 2:
            #squared_norms = T.sum(stepped_param**2, axis=1).reshape((stepped_param.shape[0],1))
            #scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
            #updates[param] = stepped_param * scale

            # constrain the norms of the COLUMNs of the weight, according to
            # https://github.com/BVLC/caffe/issues/109
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0,
                                   T.sqrt(squared_filter_length_limit))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    output = dropout_cost if dropout else cost
    train_model = theano.function(
        inputs=[epoch, index],
        outputs=output,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(
        inputs=[],
        outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    best_params = None
    best_validation_errors = np.inf
    best_iter = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()

    results_file = open(results_file_name, 'wb')
    X2 = test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    X1 = test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    last_reg = 10000.0
    while epoch_counter < n_epochs:
        # Train this epoch
        epoch_counter = epoch_counter + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(epoch_counter, minibatch_index)

        # Compute loss on validation set
        validation_losses = [
            validate_model(i) for i in xrange(n_valid_batches)
        ]
        this_validation_errors = np.mean(validation_losses)

        # Report and save progress.
        print "epoch {}, test error {}, learning_rate={}{}".format(
            epoch_counter, this_validation_errors,
            learning_rate.get_value(borrow=True),
            " **" if this_validation_errors < best_validation_errors else "")

        best_validation_errors = min(best_validation_errors,
                                     this_validation_errors)
        results_file.write("{0}\n".format(this_validation_errors))
        results_file.flush()

        new_learning_rate = decay_learning_rate()
        YH = test_fprop()
        YH *= y_std
        YH += y_mean
        print 'Regression ', np.mean(np.mean((YH - X2)**2, 1))
        print 'Baseline! ', np.mean(np.mean((X1 - X2)**2, 1))
        if np.mean(np.mean((YH - X2)**2, 1)) < last_reg:
            print 'This is better. Saving the model to ' + dataset + '.dnn.pkl'
            f = open(dataset + '.dnn.pkl', 'w+')
            cPickle.dump(classifier, f)
            f.flush()
            f.close()
            last_reg = np.mean(np.mean((YH - X2)**2, 1))

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_errors * 100., best_iter, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #2
0
def test_dA(learning_rate=0.01,
            training_epochs=15000,
            dataset='mnist.pkl.gz',
            batch_size=5,
            output_folder='dA_plots'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNosing
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the picked dataset

    """
    ##datasets = load_data(dataset)
    #from SdA_mapping import load_data_half
    #datasets = load_data_half(dataset)
    print 'loading data'
    datasets, x_mean, y_mean, x_std, y_std = load_vc()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print 'loaded data'

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x1 = T.matrix('x1')  # the data is presented as rasterized images
    x2 = T.matrix('x2')  # the data is presented as rasterized images
    cor_reg = T.scalar('cor_reg')
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    #da = dA_joint(
    #numpy_rng=rng,
    #theano_rng=theano_rng,
    #input1=x1,
    #input2=x2,

    #n_visible1=28 * 28/2,
    #n_visible2=28 * 28/2,

    #n_hidden=500
    #)
    print 'initialize functions'

    da = dA_joint(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input1=x1,
        input2=x2,
        cor_reg=cor_reg,

        #n_visible1=28 * 28/2,
        #n_visible2=28 * 28/2,
        n_visible1=24,
        n_visible2=24,
        n_hidden=50)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)
    cor_reg_val = numpy.float32(5.0)
    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x1: train_set_x[index * batch_size:(index + 1) * batch_size],
            x2: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    fprop_x1 = theano.function([],
                               outputs=da.output1,
                               givens={x1: test_set_x},
                               name='fprop_x1')
    fprop_x2 = theano.function([],
                               outputs=da.output2,
                               givens={x2: test_set_y},
                               name='fprop_x2')
    fprop_x1t = theano.function([],
                                outputs=da.output1,
                                givens={x1: train_set_x},
                                name='fprop_x1')
    fprop_x2t = theano.function([],
                                outputs=da.output2,
                                givens={x2: train_set_y},
                                name='fprop_x2')
    rec_x1 = theano.function([],
                             outputs=da.rec1,
                             givens={x1: test_set_x},
                             name='rec_x1')
    rec_x2 = theano.function([],
                             outputs=da.rec2,
                             givens={x2: test_set_y},
                             name='rec_x2')
    fprop_x1_to_x2 = theano.function([],
                                     outputs=da.reg,
                                     givens={x1: test_set_x},
                                     name='fprop_x12x2')
    updates_reg = [(da.cor_reg, da.cor_reg + theano.shared(numpy.float32(0.1)))
                   ]
    update_reg = theano.function([], updates=updates_reg)
    print 'initialize functions ended'

    start_time = time.clock()

    ############
    # TRAINING #
    ############
    print 'training started'
    X1 = test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    X2 = test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    from dcca_numpy import cor_cost
    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through trainng set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))

        #cor_reg_val += 1
        #da.cor_reg = theano.shared(cor_reg_val)
        update_reg()

        X1H = rec_x1()
        X2H = rec_x2()
        X1H *= x_std
        X1H += x_mean
        X2H *= y_std
        X2H += y_mean
        H1 = fprop_x1()
        H2 = fprop_x2()
        print 'Training epoch'
        print 'Reconstruction ', numpy.mean(numpy.mean((X1H-X1)**2,1)),\
              numpy.mean(numpy.mean((X2H-X2)**2,1))

        if epoch % 5 == 2:  # pretrain middle layer
            print '... pre-training MIDDLE layer'
            H1t = fprop_x1t()
            H2t = fprop_x2t()
            h1 = T.matrix('x')  # the data is presented as rasterized images
            h2 = T.matrix('y')  # the labels are presented as 1D vector of
            from mlp import HiddenLayer
            numpy_rng = numpy.random.RandomState(89677)
            log_reg = HiddenLayer(numpy_rng, h1, 50, 50, activation=T.tanh)

            if 1:  # for middle layer
                learning_rate = 0.1

                #H1=theano.shared(H1)
                #H2=theano.shared(H2)
                # compute the gradients with respect to the model parameters
                logreg_cost = log_reg.mse(h2)

                gparams = T.grad(logreg_cost, log_reg.params)

                # compute list of fine-tuning updates
                updates = [(param, param - gparam * learning_rate)
                           for param, gparam in zip(log_reg.params, gparams)]

                train_fn_middle = theano.function(inputs=[],
                                                  outputs=logreg_cost,
                                                  updates=updates,
                                                  givens={
                                                      h1: theano.shared(H1t),
                                                      h2: theano.shared(H2t)
                                                  },
                                                  name='train_middle')
            epoch = 0
            while epoch < 100:
                print epoch, train_fn_middle()
                epoch += 1

            ##X2H=fprop_x1_to_x2()
            X2H = numpy.tanh(H1.dot(log_reg.W.eval()) + log_reg.b.eval())
            X2H = numpy.tanh(X2H.dot(da.W2_prime.eval()) + da.b2_prime.eval())

            X2H *= y_std
            X2H += y_mean
            print 'Regression ', numpy.mean(numpy.mean((X2H - X2)**2, 1))

        print 'Correlation ', cor_cost(H1, H2)
    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] + ' ran for %.2fm' %
                          ((training_time) / 60.))
    image = Image.fromarray(
        tile_raster_images(X=da.W1.get_value(borrow=True).T,
                           img_shape=(28, 14),
                           tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    from matplotlib import pyplot as pp
    pp.plot(H1[:10, :2], 'b')
    pp.plot(H2[:10, :2], 'r')
    pp.show()

    print cor
Example #3
0
def test_dA(learning_rate=0.01, training_epochs=15000,
            dataset='mnist.pkl.gz',
            batch_size=5, output_folder='dA_plots'):

    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the DeNosing
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the picked dataset

    """
    ##datasets = load_data(dataset)
    #from SdA_mapping import load_data_half
    #datasets = load_data_half(dataset)
    print 'loading data'
    datasets, x_mean, y_mean, x_std, y_std = load_vc()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]  
    test_set_x, test_set_y = datasets[2]
    print 'loaded data'

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    x1 = T.matrix('x1')  # the data is presented as rasterized images
    x2 = T.matrix('x2')  # the data is presented as rasterized images
    cor_reg = T.scalar('cor_reg')
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)
    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2 ** 30))

    #da = dA_joint(
        #numpy_rng=rng,
        #theano_rng=theano_rng,
        #input1=x1,
        #input2=x2,

        #n_visible1=28 * 28/2,
        #n_visible2=28 * 28/2,

        #n_hidden=500
    #)
    print 'initialize functions'

    da = dA_joint(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input1=x1,
        input2=x2,
        cor_reg=cor_reg,

        #n_visible1=28 * 28/2,
        #n_visible2=28 * 28/2,
        n_visible1=24,
        n_visible2=24,
        n_hidden=50
    )

    cost, updates = da.get_cost_updates(
        corruption_level=0.3,
        learning_rate=learning_rate
    )
    cor_reg_val = numpy.float32(5.0)
    train_da = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x1: train_set_x[index * batch_size: (index + 1) * batch_size],
            x2: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    fprop_x1 = theano.function(
               [],
               outputs=da.output1,
               givens={
                   x1: test_set_x
               },
               name='fprop_x1'
    )
    fprop_x2 = theano.function(
               [],
               outputs=da.output2,
               givens={
                   x2: test_set_y
               },
               name='fprop_x2'
    )
    fprop_x1t = theano.function(
               [],
               outputs=da.output1,
               givens={
                   x1: train_set_x
               },
               name='fprop_x1'
    )
    fprop_x2t = theano.function(
               [],
               outputs=da.output2,
               givens={
                   x2: train_set_y
               },
               name='fprop_x2'
    )
    rec_x1 = theano.function(
               [],
               outputs=da.rec1,
               givens={
                   x1: test_set_x
               },
               name='rec_x1'
    )
    rec_x2 = theano.function(
               [],
               outputs=da.rec2,
               givens={
                   x2: test_set_y
               },
               name='rec_x2'
    )
    fprop_x1_to_x2 = theano.function(
               [],
               outputs=da.reg,
               givens={
                   x1: test_set_x
               },
               name='fprop_x12x2'
    )
    updates_reg = [
            (da.cor_reg, da.cor_reg+theano.shared(numpy.float32(0.1)))
    ]
    update_reg = theano.function(
        [],
        updates=updates_reg
    )
    print 'initialize functions ended'

    
    start_time = time.clock()

    ############
    # TRAINING #
    ############
    print 'training started'
    X1=test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    X2=test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    from dcca_numpy import cor_cost
    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through trainng set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        
        #cor_reg_val += 1
        #da.cor_reg = theano.shared(cor_reg_val) 
        update_reg()
        
        X1H=rec_x1()
        X2H=rec_x2()
        X1H *= x_std
        X1H += x_mean
        X2H *= y_std
        X2H += y_mean
        H1=fprop_x1()
        H2=fprop_x2()
        print 'Training epoch'
        print 'Reconstruction ', numpy.mean(numpy.mean((X1H-X1)**2,1)),\
              numpy.mean(numpy.mean((X2H-X2)**2,1))
        
        if epoch%5 == 2 : # pretrain middle layer
            print '... pre-training MIDDLE layer'
            H1t=fprop_x1t()
            H2t=fprop_x2t()
            h1 = T.matrix('x')  # the data is presented as rasterized images
            h2 = T.matrix('y')  # the labels are presented as 1D vector of
            from mlp import HiddenLayer
            numpy_rng = numpy.random.RandomState(89677)
            log_reg = HiddenLayer(numpy_rng, h1, 50, 50, activation=T.tanh)

            if 1: # for middle layer
                learning_rate = 0.1
            
                #H1=theano.shared(H1)
                #H2=theano.shared(H2)
                # compute the gradients with respect to the model parameters
                logreg_cost = log_reg.mse(h2)
    
                gparams = T.grad(logreg_cost, log_reg.params)
        
                # compute list of fine-tuning updates
                updates = [
                    (param, param - gparam * learning_rate)
                    for param, gparam in zip(log_reg.params, gparams)
                ]
    
                train_fn_middle = theano.function(
                    inputs=[],
                    outputs=logreg_cost,
                    updates=updates,
                    givens={
                        h1: theano.shared(H1t),
                        h2: theano.shared(H2t)
                    },
                    name='train_middle'
                )
            epoch = 0
            while epoch < 100:
                print epoch, train_fn_middle()
                epoch += 1
            
            ##X2H=fprop_x1_to_x2()
            X2H=numpy.tanh(H1.dot(log_reg.W.eval())+log_reg.b.eval())
            X2H=numpy.tanh(X2H.dot(da.W2_prime.eval())+da.b2_prime.eval())

            X2H *= y_std
            X2H += y_mean
            print 'Regression ', numpy.mean(numpy.mean((X2H-X2)**2,1))
        
        print 'Correlation ', cor_cost(H1, H2)
    end_time = time.clock()

    training_time = (end_time - start_time)

    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((training_time) / 60.))
    image = Image.fromarray(
        tile_raster_images(X=da.W1.get_value(borrow=True).T,
                           img_shape=(28, 14), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')
    
    from matplotlib import pyplot as pp
    pp.plot(H1[:10,:2],'b');pp.plot(H2[:10,:2],'r');pp.show()
    
    print cor
Example #4
0
def test_mlp(
        initial_learning_rate,
        learning_rate_decay,
        squared_filter_length_limit,
        n_epochs,
        batch_size,
        mom_params,
        activations,
        dropout,
        dropout_rates,
        results_file_name,
        layer_sizes,
        dataset,
        use_bias,
        random_seed=1234):
    """
    The dataset is the one from the mlp demo on deeplearning.net.  This training
    function is lifted from there almost exactly.
    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                 http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    assert len(layer_sizes) - 1 == len(dropout_rates)
    
    # extract the params for momentum
    mom_start = mom_params["start"]
    mom_end = mom_params["end"]
    mom_epoch_interval = mom_params["interval"]
    
    from utils import load_vc
    #datasets = load_mnist(dataset)
    print '... loading the data'
    dataset = 'c2s.npy'
    datasets, x_mean, y_mean, x_std, y_std = load_vc(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################

    print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.matrix('y')  # the labels are presented as 1D vector of
                        # [int] labels
    learning_rate = theano.shared(np.asarray(initial_learning_rate,
        dtype=theano.config.floatX))

    rng = np.random.RandomState(random_seed)

    # construct the MLP class
        
    if 1: # load
        f = open('c2s_pre.npy.dnn.pkl','r')
        pretrained = cPickle.load(f)
        f.close()
    else:
        pretrained=None
    classifier = MLP(rng=rng, input=x,
                     layer_sizes=layer_sizes,
                     dropout_rates=dropout_rates,
                     activations=activations,
                     use_bias=use_bias,
                     pretrained=pretrained)

    # Build the expresson for the cost function.
    cost = classifier.negative_log_likelihood(y)
    dropout_cost = classifier.dropout_negative_log_likelihood(y)

    # Compile theano function for testing.
    test_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]})
    #theano.printing.pydotprint(test_model, outfile="test_file.png",
    #        var_with_name_simple=True)

    # Compile theano function for validation.
    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})
    test_fprop = theano.function(inputs=[],
            outputs=classifier.layers[-1].output,
            givens={
                x: test_set_x
                })
    #theano.printing.pydotprint(validate_model, outfile="validate_file.png",
    #        var_with_name_simple=True)

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in classifier.params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(dropout_cost if dropout else cost, param)
        gparams.append(gparam)

    # ... and allocate mmeory for momentum'd versions of the gradient
    gparams_mom = []
    for param in classifier.params:
        gparam_mom = theano.shared(np.zeros(param.get_value(borrow=True).shape,
            dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)

    # Compute momentum for the current epoch
    mom = ifelse(epoch < mom_epoch_interval,
            mom_start*(1.0 - epoch/mom_epoch_interval) + mom_end*(epoch/mom_epoch_interval),
            mom_end)

    # Update the step direction using momentum
    updates = OrderedDict()
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        # Misha Denil's original version
        #updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam
      
        # change the update rule to match Hinton's dropout paper
        updates[gparam_mom] = mom * gparam_mom - (1. - mom) * learning_rate * gparam

    # ... and take a step along that direction
    for param, gparam_mom in zip(classifier.params, gparams_mom):
        # Misha Denil's original version
        #stepped_param = param - learning_rate * updates[gparam_mom]
        
        # since we have included learning_rate in gparam_mom, we don't need it
        # here
        stepped_param = param + updates[gparam_mom]

        # This is a silly hack to constrain the norms of the rows of the weight
        # matrices.  This just checks if there are two dimensions to the
        # parameter and constrains it if so... maybe this is a bit silly but it
        # should work for now.
        if param.get_value(borrow=True).ndim == 2:
            #squared_norms = T.sum(stepped_param**2, axis=1).reshape((stepped_param.shape[0],1))
            #scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
            #updates[param] = stepped_param * scale
            
            # constrain the norms of the COLUMNs of the weight, according to
            # https://github.com/BVLC/caffe/issues/109
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(squared_filter_length_limit))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param


    # Compile theano function for training.  This returns the training cost and
    # updates the model parameters.
    output = dropout_cost if dropout else cost
    train_model = theano.function(inputs=[epoch, index], outputs=output,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})
    #theano.printing.pydotprint(train_model, outfile="train_file.png",
    #        var_with_name_simple=True)

    # Theano function to decay the learning rate, this is separate from the
    # training function because we only want to do this once each epoch instead
    # of after each minibatch.
    decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
            updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    best_params = None
    best_validation_errors = np.inf
    best_iter = 0
    test_score = 0.
    epoch_counter = 0
    start_time = time.clock()

    results_file = open(results_file_name, 'wb')
    X2=test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    X1=test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    last_reg = 10000.0
    while epoch_counter < n_epochs:
        # Train this epoch
        epoch_counter = epoch_counter + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(epoch_counter, minibatch_index)

        # Compute loss on validation set
        validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
        this_validation_errors = np.mean(validation_losses)

        # Report and save progress.
        print "epoch {}, test error {}, learning_rate={}{}".format(
                epoch_counter, this_validation_errors,
                learning_rate.get_value(borrow=True),
                " **" if this_validation_errors < best_validation_errors else "")

        best_validation_errors = min(best_validation_errors,
                this_validation_errors)
        results_file.write("{0}\n".format(this_validation_errors))
        results_file.flush()

        new_learning_rate = decay_learning_rate()
        YH=test_fprop()
        YH *= y_std
        YH += y_mean
        print 'Regression ', np.mean(np.mean((YH-X2)**2,1))
        print 'Baseline! ', np.mean(np.mean((X1-X2)**2,1))
        if np.mean(np.mean((YH-X2)**2,1)) < last_reg:
            print 'This is better. Saving the model to ' + dataset+'.dnn.pkl'
            f = open(dataset+'.dnn.pkl','w+')
            cPickle.dump(classifier, f)
            f.flush()
            f.close()
            last_reg = np.mean(np.mean((YH-X2)**2,1))
            
    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_errors * 100., best_iter, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))