def evaluate_lenet5(n_epochs=130, binary=True, preTrain=True, learning_rate=3e-3, \
                    nkerns=[3, 4], batch_size=1):
    """ Train and evaluate a convolutional network with dynamic k-max pooling
    on a sentence-classification dataset loaded through ``Preprocess``.

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type binary: bool
    :param binary: forwarded to Preprocess().load_data; presumably selects the
                   two-class variant of the dataset -- TODO confirm

    :type preTrain: bool
    :param preTrain: if True, initialise the layers with weights returned by
                     SCAE_preTrain (stacked convolutional auto-encoders);
                     otherwise pass W/b of None so each layer self-initialises

    :type learning_rate: float
    :param learning_rate: base learning rate; scaled per parameter by the
                          AdaGrad accumulators built below

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: minibatch size; 1 here, consistent with each example
                       carrying its own symbolic sentence length ``z``
    """
    # Fixed seed so weight initialisation is reproducible across runs.
    rng = numpy.random.RandomState(23455)

    datasets = Preprocess().load_data(binary)

    # Each split is a triple: data matrix, int labels, per-example sentence length.
    train_set_x, train_set_y, train_set_z = datasets[0]
    valid_set_x, valid_set_y, valid_set_z = datasets[1]
    test_set_x, test_set_y, test_set_z = datasets[2]
    
    if preTrain:
        # Pre-trained weights come back interleaved with slots we discard
        # (the '_' positions): hidden layer, conv layer 2, conv layer 1.
        pre_W = SCAE_preTrain(nkerns=nkerns, dataset=datasets)
        W_h, b_h,_, W2, b2,_, W1, b1,_ = pre_W
    else:
        W_h, b_h, W2, b2, W1, b1 = [None] * 6


    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]

    print 'Train\tVal\tTest\tbatch_size\n', n_train_batches, '\t', n_valid_batches, '\t', n_test_batches, '\t', batch_size
    # Python 2 integer division: any remainder examples are silently dropped.
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')   # the data is presented as rasterized sentences. Each row is an instance of sentence.
    y = T.ivector('y')  # the labels are presented as 1D vector of [int] labels
    z = T.iscalar('z')  # the sentence lengths are presented as 1D vector of [int] lengths.
    
    k_Top = 5           # k of the top (final) k-max pooling stage
    em=50               # embedding dimensionality: rows of a sentence matrix
    s_shape = (50, 56)  # this is the size of (padded) sentence matrix.

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized sentence of shape (batch_size,25*37)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer.
    # Only the first z*em columns are real data; the trailing padding is cut
    # off and the width dimension (-1) becomes the sentence length z.
    layer0_input = x[:, :z*em].reshape((batch_size, 1, em, -1))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (25,37+3-1)=(25,39)
    # maxpooling reduces this further to (25,19)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],25,19)
    
    # Width is None because it varies with the symbolic sentence length z.
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
            image_shape=(batch_size, 1, em, None),
            filter_shape=(nkerns[0], 1, 1, 5), factor=.5, W = W1, b=b1, k_Top=k_Top, s=z)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (25,19+3-1)=(25,21)
    # maxpooling reduces this further to (25,12)
    # 4D output tensor is thus of shape (nkerns[1], nkerns[0], 1, 2)

    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
            image_shape=(batch_size, nkerns[0], em, None),
            filter_shape=(nkerns[1], nkerns[0], 1, 3), factor=0., W=W2, b=b2, k_Top=k_Top, s=z)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size,3*25*12) = (100,900)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer; its input size is fixed by
    # the k-max pooling (k_Top columns survive regardless of sentence length)
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * em * k_Top,
                         n_out=100, W=W_h, b=b_h, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=100, n_out=2)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size],
                z: test_set_z[index]})

    validate_model = theano.function([index], layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                y: valid_set_y[index * batch_size: (index + 1) * batch_size],
                z: valid_set_z[index]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    
    # AdaGrad: each accumulator G starts at rho (not zero) so the very first
    # division by sqrt(G + grad^2) cannot divide by zero.
    rho = 1e-7
    G = [(theano.shared(value=numpy.zeros_like(param.get_value())+rho, name="AdaGrad_" + param.name, borrow=True)) for param in params]
    G_update = [T.add(g_adag, T.sqr(grad_i)) for g_adag, grad_i in zip(G, grads)]
    
    updates = []
    for param_i, g_update, grad_i, g in zip(params, G_update, grads, G):
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(g_update) ))
        updates.append((g, g_update))

    train_model = theano.function([index], cost, updates=updates, allow_input_downcast=True,
          givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            z: train_set_z[index]})


    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 100000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = 500 # min(n_train_batches/4, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    res = []  # per-epoch validation loss, for the final plot

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:
                
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

            #    print('epoch %i, minibatch %i/%i, validation error %f %%' % \
            #          (epoch, minibatch_index + 1, n_train_batches, \
            #           this_validation_loss * 100.))
            #    res.append(this_validation_loss)

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            #if patience <= iter:
            #    done_looping = True
            #    break
        # NOTE(review): this_validation_loss is only bound after the first
        # validation pass; if n_train_batches < validation_frequency this
        # print raises NameError on the first epoch -- confirm intended.
        print('epoch %i, validation error %f %%' % \
              (epoch, this_validation_loss * 100.))
        res.append(this_validation_loss)
    
    # Final test pass over the whole test set after training completes.
    test_losses = [test_model(i) for i in xrange(n_test_batches)]
    final_test_score = numpy.mean(test_losses)
    
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print 'Final test score:', final_test_score
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    plot('Movie Reviews- Binary. Validation Loss.', numpy.asarray(res) * 100., test_score * 100.)
def SCAE_preTrain(batch_size=1, nkerns= [3, 4], dataset=None, n_epochs=70, k_Top=5, learning_rate=1e-1, binary=True):
    """
        Stacked Convolutional AutoEncoders.

        Returns the list of pre-trained parameters consumed by
        evaluate_lenet5 (hidden layer, conv layer 2, conv layer 1).

        NOTE(review): the function currently short-circuits -- it unpickles a
        previously saved parameter file ('SCAE_MR_1e-1K34') and returns it
        immediately. Everything after the early ``return rval`` below is dead
        code, kept as the original pre-training procedure. The hard-coded
        filename also ignores the ``nkerns`` argument -- confirm intended.
    """

    with open('SCAE_MR_1e-1K34', 'r') as f:
        rval = cPickle.load(f)
    return rval

    # ---- unreachable from here on: original pre-training procedure ----
    if dataset is None:
        dataset = Preprocess().load_data(binary)
    train_set_x = dataset[0][0]
    train_set_z = dataset[0][2]
    n_train_batch = train_set_x.get_value(borrow=True).shape[0]
    n_train_batch /= batch_size

    print '... Building Stacked Conv. AutoEncoders'

    rng = numpy.random.RandomState(96813)
    index = T.lscalar('index')
    
    x = T.dmatrix('Input Sentence')
    z = T.iscalar('Sentence length')
    # Keep only the first z*50 columns (50 = embedding size) and reshape to a
    # 4D tensor whose width is the sentence length.
    layer0_input = x[:, :z*50].reshape((batch_size, 1, 50, -1))
    
    layer0 = SCAE(rng, input=layer0_input, image_shape=None, filter_shape=(nkerns[0], 1, 1, 8), \
                                        factor=.5, s=z, k_Top=k_Top, do_fold=False)

    layer1_input = layer0.get_hidden_values(layer0_input)
    layer1 = SCAE(rng, input=layer1_input, filter_shape=(nkerns[1], nkerns[0], 1, 5), \
                        image_shape=None, factor = .0, s=z, k_Top=k_Top, do_fold=False)

    layer1_output = layer1.get_hidden_values(layer1_input)
    
    # Fully-connected auto-encoder on top of the flattened conv features.
    hidden_input = layer1_output.flatten(2)
    layer2 = AE(rng, input=hidden_input, n_visible=layer1.kshp[0]*50*k_Top, n_hidden=100)
    
    Y = layer2.get_hidden_values(hidden_input)
    
    ################
    #   DECODING   #
    ################

    # Decode in reverse order through the stack back to the input space.
    decode_hidden_layer = layer2.get_reconstructed_input(Y)
    decode_input = decode_hidden_layer.reshape(layer1.shape)
    
    decode_layer1 = layer1.get_reconstructed_input(decode_input)
    Z = layer0.get_reconstructed_input(decode_layer1)

    params = layer2.params + layer1.params + layer0.params
    
    def get_cost_updates(X, Z, params, learning_rate):
        ''' Update the Stacked Convolutional Auto-Encoders.

        Builds the mean squared reconstruction error between input X and
        reconstruction Z, plus AdaGrad update pairs for ``params``.
        '''
        
        # Per-example squared reconstruction error, then mean over the batch.
        L = T.sum((X-Z) ** 2, axis=(1,2,3))
        cost = T.mean(L)
        
        gparams = T.grad(cost, params)
        
        # NOTE(review): unlike evaluate_lenet5, the accumulator starts at
        # exactly zero (rho is defined but unused), so a zero first gradient
        # would divide by zero in the update below -- confirm.
        rho = 1e-7
        G = [(theano.shared(value=numpy.zeros_like(param.get_value()), name="AdaGrad_" + param.name, borrow=True)) for param in params]
        G_update = [T.add(g_adag, T.sqr(grad_i)) for g_adag, grad_i in zip(G, gparams)]
        
        updates = []
        for param_i, g_update, grad_i, g in zip(params, G_update, gparams, G):
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(g_update) ))
            updates.append((g, g_update))
        
        return (cost, updates)
    
    cost, updates = get_cost_updates(layer0_input, Z, params, learning_rate)
    
    train_model = theano.function([index], cost, updates=updates, \
                        givens={x: train_set_x[index * batch_size: (index + 1) * batch_size],
                                z: train_set_z[index]})

    print '... Pretraining model'

    plot_SCAE = []  # last-minibatch cost per epoch, for the plot below
    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        for minibatch in xrange(n_train_batch):
            cost_ij = train_model(minibatch)
        print '\tepoch %i,\tcost  %f' % (epoch, cost_ij)
        plot_SCAE.append(cost_ij)
    plot('SCAE_Movie Results.', numpy.asarray(plot_SCAE), 74e-2)

    # Serialise the learned parameters
    with open('SCAE_MR_1e-1K%i%i'%(nkerns[0], nkerns[1]), 'wb') as f:
        cPickle.dump(params, f)
    return params
# Exemple #3 (0) -- scraper artifact separating pasted snippets; the fragment
# below begins mid-expression (its enclosing definition was not captured).
                       DQN.convert_to_tensor(next_state, device),
                       torch.tensor([reward]).to(device)))

        if memory.can_provide_sample(batch_size):
            experiences = memory.sample(batch_size)
            states, actions, rewards, next_states = extract_tensors(
                experiences)
            current_q_values = QValues_Trading.QValuesTrading.get_current(
                policy_net, states, actions)
            next_q_values = QValues_Trading.QValuesTrading.get_next(
                target_net, next_states)
            target_q_values = (next_q_values * gamma) + rewards

            loss = F.mse_loss(current_q_values, target_q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if done:
            episode_rewards.append(reward)
            average = get_average(episode_rewards, 100)
            rewards_average.append(average)
            if average > 9 and batch_size != 256:
                print("Changing batch")
                batch_size = 256
            plot(episode_rewards, rewards_average)
            break

    if episode_number % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())
# Exemple #4 (0) -- scraper artifact separating pasted snippets.
def simulate(trn, tst):
    """Walk-forward trading simulation over the test rows in ``tst``.

    For each test bar: fill the row's indicator columns (log return,
    momentum, OBV, Bollinger SMA bands, MACD and its signal-line gap),
    retrain two ELM classifiers on the most recent 1000 training rows
    (columns [:12] as features, column 12 resp. 13 as buy/sell targets),
    and use their predictions to drive two independent position
    simulations: a buy side (``b_*``) and a sell side (``s_*``). Prints
    confusion counts and P&L statistics, then plots both equity curves.

    :param trn: 2D array of historical rows; grows by one scaled row per
                processed test bar
    :param tst: 2D array of test rows; columns 0..4 look like OHLCV-style
                price data (3 = close, 4 = volume, presumably) -- TODO
                confirm against the caller

    Relies on module-level state: indicator objects ``id``, ``mnm``,
    ``obv``, ``bbs``, ``m_9``/``m12``/``m26``, scaling vectors
    ``mn``/``mx``, and ``sys.argv[1]`` for labelling output.
    """
    start = time.time()
    # Confusion-matrix counters for the buy (b_) and sell (s_) classifiers.
    b_tp = b_fp = b_tn = b_fn = 0
    s_tp = s_fp = s_tn = s_fn = 0
    b_min = s_min = 1000000
    b_max = s_max = 0
    b_money = s_money = 0
    b_money_vec = [0]
    s_money_vec = [0]
    b_gain = s_gain = 0
    b_loss = s_loss = 0
    b_draw = s_draw = 0
    b_gain_vec = []
    s_gain_vec = []
    b_loss_vec = []
    s_loss_vec = []
    b_max_drawdown = s_max_drawdown = 0
    b_pos = s_pos = False  # whether a position is currently open on each side
    time_vec = []
    aux_ii = len(tst) - 1

    for t, val in enumerate(tst):
        start_i = time.time()

        # NOTE(review): magic index skip -- presumably a known bad row in
        # this particular dataset; confirm.
        if t == 201:
            continue

        # Column 5: log return. The first test bar chains to the last
        # training bar; later bars chain to trn[t - 1].
        if t == 0:
            tst[0, 5] = id.log_return(tst[0, 0], tst[0, 3], trn[-1, 0], trn[-1,
                                                                            3])
        else:
            tst[t, 5] = id.log_return(tst[t, 0], tst[t, 3], trn[t - 1, 0],
                                      trn[t - 1, 3])
        tst[t, 6] = mnm.get(val[5])          # momentum of the log return
        tst[t, 7] = obv.get_obv(val[3], val[4])  # on-balance volume
        aux = bbs.sma(val[3])                # Bollinger bands (None until warm)
        if aux is not None:
            tst[t, 8], tst[t, 9] = aux
        aux_9 = m_9.ema(val[3])
        aux12 = m12.ema(val[3])
        aux26 = m26.ema(val[3])
        tst[t, 10] = aux12 - aux26           # MACD line (EMA12 - EMA26)
        tst[t, 11] = tst[t, 10] - aux_9      # MACD minus signal EMA

        # Train on the 1000 most recent rows; scale the current row with the
        # module-level mn/mx vectors to match the training features.
        aux = trn[-1000:]
        aux_i = [(i[1] - mn[i[0]]) * mx[i[0]] for i in enumerate(tst[t, :12])]
        # aux_j = trn[-1000:, :]

        # Two fresh Extreme Learning Machines per bar: one predicting the
        # buy target (col 12), one the sell target (col 13).
        b_elm = ELMClassifier(random_state=0,
                              n_hidden=200,
                              activation_func='sigmoid',
                              alpha=0.0)
        b_elm.fit(aux[:, :12], aux[:, 12])
        b_res = b_elm.predict([aux_i[:12]])
        s_elm = ELMClassifier(random_state=0,
                              n_hidden=200,
                              activation_func='sigmoid',
                              alpha=0.0)
        s_elm.fit(aux[:, :12], aux[:, 13])
        s_res = s_elm.predict([aux_i[:12]])

        if b_res == 1.0:
            if val[12] == 1.0:
                b_tp += 1
            else:
                b_fp += 1
            if not b_pos:
                # Enter position (buy at close)
                b_money -= val[3]
                b_pos = True
        else:
            if val[12] == 0.0:
                b_tn += 1
            else:
                b_fn += 1
            if b_pos:
                # Exit position (sell at close)
                b_money += val[3]
                b_pos = False
                if b_money < b_money_vec[-1]:
                    b_loss += 1
                    b_loss_vec.append(b_money_vec[-1] - b_money)
                elif b_money > b_money_vec[-1]:
                    b_gain += 1
                    b_gain_vec.append(b_money - b_money_vec[-1])
                else:
                    b_draw += 1
        # Column 14 presumably flags a forced end-of-period exit.
        # NOTE(review): this branch runs even when b_pos is already False,
        # crediting val[3] without an open position -- confirm intended.
        if val[14] == 1.0:
            # Forced exit
            b_money += val[3]
            b_pos = False
            if b_money < b_money_vec[-1]:
                b_loss += 1
                b_loss_vec.append(b_money_vec[-1] - b_money)
            elif b_money > b_money_vec[-1]:
                b_gain += 1
                b_gain_vec.append(b_money - b_money_vec[-1])
            else:
                b_draw += 1

        # Equity curve: while a position is open the curve holds its last
        # realised value; extremes are tracked only on realised money.
        if b_pos:
            b_money_vec.append(b_money_vec[-1])
        else:
            b_money_vec.append(b_money)
            if b_money > b_max:
                b_max = b_money
            if b_money < b_min:
                b_min = b_money

        if s_res == 1.0:
            if val[13] == 1.0:
                s_tp += 1
            else:
                s_fp += 1
            if not s_pos:
                # Enter position (short side: credit the close)
                s_money += val[3]
                s_pos = True
        else:
            if val[13] == 0.0:
                s_tn += 1
            else:
                s_fn += 1
            if s_pos:
                # Exit position (buy back at close)
                s_money -= val[3]
                s_pos = False
                if s_money < s_money_vec[-1]:
                    s_loss += 1
                    s_loss_vec.append(s_money_vec[-1] - s_money)
                elif s_money > s_money_vec[-1]:
                    s_gain += 1
                    s_gain_vec.append(s_money - s_money_vec[-1])
                else:
                    s_draw += 1
        # NOTE(review): same unguarded forced-exit as the buy side above.
        if val[14] == 1.0:
            # Forced exit
            s_money -= val[3]
            s_pos = False
            if s_money < s_money_vec[-1]:
                s_loss += 1
                s_loss_vec.append(s_money_vec[-1] - s_money)
            elif s_money > s_money_vec[-1]:
                s_gain += 1
                s_gain_vec.append(s_money - s_money_vec[-1])
            else:
                s_draw += 1

        if s_pos:
            s_money_vec.append(s_money_vec[-1])
        else:
            s_money_vec.append(s_money)
            if s_money > s_max:
                s_max = s_money
            if s_money < s_min:
                s_min = s_money

        # print(aux_i + list(tst[t, 12:]))
        # The processed (scaled) row joins the training window for the next bar.
        trn = np.append(trn, [aux_i + list(tst[t, 12:])], axis=0)
        time_vec.append(time.time() - start_i)
        sys.stdout.write('\r' + '%6d / %d' % (t, aux_ii) + '\033[K')
    sys.stdout.write('\r' + '>> %6.2f: Simulation Done!\n\n' %
                     (time.time() - start) + '\033[K')

    print('#### ' + sys.argv[1] + ' ####')
    print('Tempo médio: %f' % np.mean(time_vec))
    print('Final      : %5.5f | %5.5f' % (b_money, s_money))
    # print('Final      : %5.5f | %5.5f' % (b_money_vec[-1], s_money_vec[-1]))
    print('Minimo     : %5.5f | %5.5f' % (b_min, s_min))
    print('Maximo     : %5.5f | %5.5f' % (b_max, s_max))
    print('Ganho qtd  : %10d | %10d' % (b_gain, s_gain))
    print('Perda qtd  : %10d | %10d' % (b_loss, s_loss))
    print('Empate qtd : %10d | %10d' % (b_draw, s_draw))
    print('Ganho medio: %5.5f | %5.5f' %
          (np.mean(b_gain_vec), np.mean(s_gain_vec)))
    print('Perda media: %5.5f | %5.5f' %
          (np.mean(b_loss_vec), np.mean(s_loss_vec)))
    print('TP         : %10d | %10d' % (b_tp, s_tp))
    print('FP         : %10d | %10d' % (b_fp, s_fp))
    print('TN         : %10d | %10d' % (b_tn, s_tn))
    print('FN         : %10d | %10d' % (b_fn, s_fn))

    plot(b_money_vec, s_money_vec, sys.argv[1], tst[:, 3])
    #-- gaussian noise
    loc_gaussian = 0.
    scale_gaussian = 10.

    #-- laplacian noise
    loc_laplace = 0.
    scale_laplace = 10.

    #-- salt and pepper noise
    sp_probability = 0.05

    image = np.asarray(Image.open(input_image_path).convert('L'))

    noised_gaussian = noise_gaussian(image, loc_gaussian, scale_gaussian)
    noised_laplace = noise_laplace(image, loc_laplace, scale_laplace)
    noised_salt_and_pepper = noise_salt_and_pepper(image, sp_probability)

    max_flow = MaxFlow(noised_gaussian, lamda, sigma, number_of_iterations)
    den_gaussian = max_flow.alpha_expansion()

    max_flow = MaxFlow(noised_laplace, lamda, sigma, number_of_iterations)
    den_laplace = max_flow.alpha_expansion()

    max_flow = MaxFlow(noised_salt_and_pepper, lamda, sigma,
                       number_of_iterations)
    den_sp = max_flow.alpha_expansion()

    # Plot results
    plot(image, noised_gaussian, noised_laplace, noised_salt_and_pepper,
         den_gaussian, den_laplace, den_sp)
def evaluate_lenet5(n_epochs=200,
                    nkerns=[6, 3], batch_size=10):
    """ Train and evaluate a single-conv-layer network with k-max pooling on
    sentence data loaded through ``Preprocess_Input`` (6-way classification).

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer; only nkerns[0] is used,
                   the second conv layer is commented out below

    :type batch_size: int
    :param batch_size: minibatch size

    The learning rate is not a Python argument here: it is a symbolic scalar
    fed to train_model each call, decayed geometrically for the first 40
    epochs (see ``rate`` below).
    """
    #theano.config.compute_test_value = 'warn'
    # Fixed seed so weight initialisation is reproducible.
    rng = numpy.random.RandomState(23455)

    datasets = Preprocess_Input().load_data()

    # No validation split is used: datasets[1] is skipped and the test set
    # doubles as the "validation" score below.
    train_set_x, train_set_y, _ = datasets[0]
    test_set_x, test_set_y, _ = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    print 'Train\tTest\tbatch_size\n', n_train_batches, '\t', n_test_batches, '\t', batch_size
    # Python 2 integer division: any remainder examples are silently dropped.
    n_train_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')   # the data is presented as rasterized sentences. Each row is an instance of sentence.
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels
    learning_rate = T.dscalar('rate')  # symbolic so it can decay per epoch

    k_Top = 6
    s_shape = (25, 37)  # this is the size of sentence matrix.

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized sentence of shape (batch_size,25*37)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 25, 37))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (25,37+3-1)=(25,39)
    # maxpooling reduces this further to (25,19)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],25,19)
    
    # _W = main(dataset=datasets)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input, W=None,
            image_shape=(batch_size, 1, 25, 37),
            filter_shape=(nkerns[0], 1, 1, 5), k=k_Top)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (25,19+3-1)=(25,21)
    # maxpooling reduces this further to (25,12)
    # 4D output tensor is thus of shape (nkerns[1], nkerns[0], 1, 2)
    
    #k = max(k_Top, numpy.ceil((2.-2.)/2. * 37.))
    
    #layer1 = LeNetConvPoolLayer(rng, input=layer0.output, #W=_W[1],
    #        image_shape=(batch_size, nkerns[0], 25, 19),
    #        filter_shape=(nkerns[1], nkerns[0], 1, 3), k=12)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size,3*25*12) = (100,900)
    layer2_input = layer0.output.flatten(2)

    # construct a fully-connected sigmoidal layer; input size fixed by k-max
    # pooling: k_Top columns survive per feature map row
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[0] * 25 * k_Top,
                         n_out=200, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer (6 classes)
    layer3 = LogisticRegression(input=layer2.output, n_in=200, n_out=6)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([index], layer3.errors(y),
             givens={
                x: test_set_x[index * batch_size: (index + 1) * batch_size],
                y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    #validate_model = theano.function([index], layer3.errors(y),
    #        givens={
    #            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
    #            y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function([index, learning_rate], cost, updates=updates, #g mode='DebugMode',
          givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    # NOTE(review): patience / validation_frequency are computed but never
    # consulted in the loop below -- early stopping is effectively disabled.
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    

    '''f = theano.function([index], T.shape(layer1.pooled_out), givens={x: train_set_x[index * batch_size: (index + 1) * batch_size]})
    print 'pool_out.shape', f(0)
    '''

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    ls = []       # per-epoch test score, for the final plot
    rate = 1e-1   # concrete learning-rate value fed to the symbolic scalar

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            cost_ij = train_model(minibatch_index, rate)


        # test it on the test set (used here in place of a validation set)
        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        test_score = numpy.mean(test_losses)
        ls.append(test_score)
        print(('     epoch %i, minibatch %i/%i, test error of best '
               'model %f %%') %
              (epoch, minibatch_index + 1, n_train_batches,
               test_score * 100.))
        
        # save best validation score and iteration number
        if test_score < best_validation_loss:
            best_validation_loss = test_score
            best_iter = iter

        # Geometric learning-rate decay for the first 40 epochs only.
        if epoch <= 40:
            rate *= .9


    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                          
    plot(numpy.asarray(ls)*100, rate)