def test_bc(initial_learning_rate=0.1, final_learning_rate=0.01, n_epochs=1000, 
            nkerns=[64, 64, 128, 128, 256, 256], batch_size=200, verbose=False, 
            stochastic=False, binary=True, which_data='svhn', outputlayer='Logistic'):
    """
    Wrapper function for training and testing a deep Convolutional Network

    :type initial_learning_rate: float
    :param initial_learning_rate: learning rate used for the first epoch (factor for the stochastic
    gradient). The learning rate decays at each epoch, starting from this value.

    :type final_learning_rate: float
    :param final_learning_rate: learning rate used for the last epoch. The learning rate decays at each
    epoch, sweeping from the starting learning rate to this ending learning rate.

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples in a minibatch.

    :type verbose: boolean
    :param verbose: whether to print an epoch summary.
    
    :type binary: boolean
    :param binary: whether to use BinaryConnect (True) or the base version of the network (False)
    
    :type stochastic: boolean
    :param stochastic: whether to use the stochastic (True) or deterministic (False) BinaryConnect binarization
    
    :type outputlayer: string
    :param outputlayer: whether to use a Logistic ('Logistic') or SVM ('svm') layer as the final output layer of
    the network
    
    :type which_data: string
    :param which_data: which dataset to train the model on - 'mnist', 'svhn', or 'cifar10'
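
    Example (illustrative, small run with otherwise default settings; assumes the SVHN
    data can be loaded by load_svhn):

        W, Wb = test_bc(n_epochs=10, batch_size=200, binary=True, stochastic=False,
                        which_data='svhn', outputlayer='Logistic', verbose=True)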
    
    
    """

    rng = numpy.random.RandomState(23455)

    if which_data not in ('mnist','cifar10','svhn'):
        return 'Need to choose a correct dataset: either "mnist", "svhn", or "cifar10"'

    # load data set defined in parameters
    if which_data=='mnist':
        datasets = load_mnist(outputlayer)
        nins = 28*28*1
    elif which_data=='svhn':
        datasets = load_svhn(outputlayer)
        nins = 32*32*3
    elif which_data=='cifar10':
        datasets = load_cifar10(outputlayer)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    x = T.matrix('x')   # the data is presented as rasterized images
    # labels: an int matrix of targets for the SVM layer, an int vector of class labels for Logistic
    if outputlayer=='svm':
        y = T.imatrix('y')
    elif outputlayer=='Logistic':
        y = T.ivector('y')
    
    # Define function for learning rate decay
    learning_rate_decay = (float(final_learning_rate)/float(initial_learning_rate))**(1./n_epochs)
    learning_rate = theano.shared(numpy.asarray(initial_learning_rate, dtype=theano.config.floatX))
    decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
            updates={learning_rate: learning_rate * learning_rate_decay})    
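    # Note: this implements a geometric schedule,
    #   lr(epoch) = initial_learning_rate * (final_learning_rate / initial_learning_rate) ** (epoch / n_epochs),
    # e.g. with the defaults (0.1 -> 0.01 over 1000 epochs) each call to decay_learning_rate
    # multiplies the rate by (0.01 / 0.1) ** (1. / 1000) ~= 0.9977.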
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 3 * 32 * 32)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 3, 32, 32))
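    # Note: this reshape assumes 32x32 RGB inputs (SVHN / CIFAR-10); the MNIST branch above
    # loads 28x28x1 images, which would not match this input shape.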

    # Construct the first convolutional pooling layer
    # filtering reduces the image size to (32-3+1 , 32-3+1) = (30, 30)
    # maxpooling reduces this further to (30/1, 30/1) = (30, 30)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 30, 30)
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 3, 32, 32),
        filter_shape=(nkerns[0], 3, 3, 3),
        poolsize=(1,1),
        stochastic=stochastic,
        binary=binary
    )

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (30-3+1, 30-3+1) = (28, 28)
    # maxpooling reduces this further to (28/2, 28/2) = (14, 14)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 14, 14)
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 30,30),
        filter_shape=(nkerns[1], nkerns[0], 3, 3),
        poolsize=(2,2),
        stochastic=stochastic,
        binary=binary
    )
    
    # Construct the third convolutional pooling layer
    # filtering reduces the image size to (14-3+1, 14-3+1) = (12, 12)
    # maxpooling reduces this further to (12/1, 12/1) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[2], 12, 12)
    layer2 = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[1], 14 ,14),
        filter_shape=(nkerns[2], nkerns[1], 3, 3),
        poolsize=(1,1),
        stochastic=stochastic,
        binary=binary
    )
    
    # Construct the fourth convolutional pooling layer
    # filtering reduces the image size to (12-3+1, 12-3+1) = (10, 10)
    # maxpooling reduces this further to (10/2, 10/2) = (5, 5)
    # 4D output tensor is thus of shape (batch_size, nkerns[3], 5, 5)
    layer3 = LeNetConvPoolLayer(
        rng,
        input=layer2.output,
        image_shape=(batch_size, nkerns[2], 12 ,12),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(2,2),
        stochastic=stochastic,
        binary=binary
    )
    
    # Construct the fifth convolutional pooling layer
    # filtering reduces the image size to (5-3+1, 5-3+1) = (3, 3)
    # maxpooling reduces this further to (3/1, 3/1) = (3, 3)
    # 4D output tensor is thus of shape (batch_size, nkerns[4], 3, 3)
    layer4 = LeNetConvPoolLayer(
        rng,
        input=layer3.output,
        image_shape=(batch_size, nkerns[3], 5 ,5),
        filter_shape=(nkerns[4], nkerns[3], 3, 3),
        poolsize=(1,1),
        stochastic=stochastic,
        binary=binary
    )

    # Construct the sixth convolutional pooling layer
    # filtering reduces the image size to (3-3+1, 3-3+1) = (1, 1)
    # maxpooling reduces this further to (1/1, 1/1) = (1, 1)
    # 4D output tensor is thus of shape (batch_size, nkerns[5], 1, 1)
    layer5 = LeNetConvPoolLayer(
        rng,
        input=layer4.output,
        image_shape=(batch_size, nkerns[4], 3, 3),
        filter_shape=(nkerns[5], nkerns[4], 3, 3),
        poolsize=(1,1),
        stochastic=stochastic,
        binary=binary
    )
    # The two HiddenLayers that follow are fully-connected, so they operate on 2D matrices
    # of shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # Flattening generates a matrix of shape (batch_size, nkerns[5] * 1 * 1),
    # or (200, 256 * 1 * 1) = (200, 256) with the default values.
    layer6_input = layer5.output.flatten(2)

    layer6 = HiddenLayer(
        rng,
        input=layer6_input,
        n_in=nkerns[5] * 1 * 1,
        n_out=1024,
        activation=T.nnet.relu,
        stochastic=stochastic,
        binary=binary
    )
    
    layer7 = HiddenLayer(
        rng,
        input=layer6.output,
        n_in=1024,
        n_out=1024,
        activation=T.nnet.relu,
        stochastic=stochastic,
        binary=binary
    )
    
    
    # Define output layer based on parameter
    if outputlayer=='Logistic':
        print("Using logistic regression")
        outputRegressionFunction = LogisticRegression
    elif outputlayer=='svm':
        print("Using Support Vector Machines")
        outputRegressionFunction = SVMLayer

    layer8 = outputRegressionFunction(
        input=layer7.output,
        n_in=1024,
        n_out=10,
        stochastic=stochastic,
        binary=binary
    )

    # the cost we minimize during training, as defined by the chosen output layer
    cost = layer8.cost(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer8.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer8.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # Get training error rate
    train_model_perf = theano.function(
        [index],
        layer8.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # parameters of all layers, ordered from the output layer back to the input layer
    params = (layer8.params + layer7.params + layer6.params + layer5.params +
              layer4.params + layer3.params + layer2.params + layer1.params + layer0.params)
    total_len_params = len(params)
    

    # Parameter updates as laid out in the BinaryConnect paper: gradients are taken
    # w.r.t. the binarized weights Wb, but applied to the real-valued weights W,
    # which are then clipped to [-W0, W0].
    if binary:
        W0 = theano.shared(layer0.W0, name='W0', borrow=True)
        updates = []
        for i in range(total_len_params):
            p = params[i]
            if p.name in ('beta', 'gamma', 'b', 'Wb'):
                # batch-norm parameters (beta, gamma), biases (b), and the binarized
                # weights Wb use the plain SGD update
                u = p - learning_rate * T.grad(cost, p)
                updates.append((p, u))
            elif p.name == 'W':
                # the binarized copy Wb is expected to be the next entry after W in params
                n_Wb = params[i+1]
                u = p - learning_rate * T.grad(cost, n_Wb)
                u = T.clip(u, -W0, W0)
                updates.append((p, u))
            else:
                continue
    
    else:
        # standard SGD updates on the real-valued parameters
        gparams = T.grad(cost, params)
        updates = [(p, p - learning_rate * gp) for p, gp in zip(params, gparams)]
    
    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    
    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    train_nn(decay_learning_rate, train_model,train_model_perf, validate_model, test_model,
        n_train_batches, n_valid_batches, n_test_batches, n_epochs, learning_rate, which_data,
        stochastic, binary, outputlayer, verbose)
    
    # return first hidden layer weights for image plotting
    return layer0.W.get_value(), layer0.Wb.get_value()


def test_mlp(initial_learning_rate=0.3, final_learning_rate=0.01,
             L1_reg=0.00, L2_reg=0.000, n_epochs=100,
             batch_size=200, n_hidden=1024, n_hiddenLayers=3,
             verbose=False, stochastic=False, binary=True, 
             which_data='svhn', seedval=12345, outputlayer='Logistic'):
    """
    Wrapper function for training and testing MLP

    :type initial_learning_rate: float
    :param initial_learning_rate: learning rate used for the first epoch (factor for the stochastic
    gradient). The learning rate decays at each epoch, starting from this value.

    :type final_learning_rate: float
    :param final_learning_rate: learning rate used for the last epoch. The learning rate decays at each
    epoch, sweeping from the starting learning rate to this ending learning rate.
    
    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization).

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization).

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer.

    :type batch_size: int
    :param batch_size: number of examples in a minibatch.

    :type n_hidden: int or list of ints
    :param n_hidden: number of hidden units. If a list, it specifies the
    number of units in each hidden layer, and its length should equal
    n_hiddenLayers.

    :type n_hiddenLayers: int
    :param n_hiddenLayers: number of hidden layers.

    :type verbose: boolean
    :param verbose: whether to print an epoch summary.
    
    :type binary: boolean
    :param binary: whether to use BinaryConnect (True) or the base version of the network (False)
    
    :type stochastic: boolean
    :param stochastic: whether to use the stochastic (True) or deterministic (False) BinaryConnect binarization
    
    :type seedval: int
    :param seedval: seed to use for random number generation
    
    :type outputlayer: string
    :param outputlayer: whether to use a Logistic ('Logistic') or SVM ('svm') layer as the final output layer of
    the network
    
    :type which_data: string
    :param which_data: which dataset to train the model on - 'mnist', 'svhn', or 'cifar10'
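
    Example (illustrative, small run with otherwise default settings; assumes the MNIST
    data can be loaded by load_mnist):

        W, Wb = test_mlp(n_epochs=10, n_hidden=1024, n_hiddenLayers=3, binary=True,
                         which_data='mnist', outputlayer='Logistic', verbose=True)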

    """
    # load the requested dataset
    if which_data not in ('mnist','cifar10','svhn'):
        return 'Need to choose a correct dataset: either "mnist", "svhn", or "cifar10"'

    if which_data=='mnist':
        datasets = load_mnist(outputlayer)
        nins = 28*28*1

    elif which_data=='svhn':
        datasets = load_svhn(outputlayer)
        nins = 32*32*3

    elif which_data=='cifar10':
        datasets = load_cifar10(outputlayer)
        nins = 32*32*3
    
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    
    # allocate different label variables for the SVM vs Logistic output layers
    if outputlayer=='svm':
        y = T.imatrix('y')
    elif outputlayer=='Logistic':
        y = T.ivector('y')
    
    # define learning rate decay rule
    learning_rate_decay = (float(final_learning_rate)/float(initial_learning_rate))**(1./n_epochs)
    learning_rate = theano.shared(numpy.asarray(initial_learning_rate, dtype=theano.config.floatX))
    decay_learning_rate = theano.function(inputs=[], outputs=learning_rate,
            updates={learning_rate: learning_rate * learning_rate_decay})
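    # e.g. with the defaults (0.3 -> 0.01 over 100 epochs) the per-epoch decay factor is
    # (0.01 / 0.3) ** (1. / 100) ~= 0.9666, so the rate shrinks by roughly 3.3% per epoch.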
   
    rng = numpy.random.RandomState(seedval)
    
    ## define MLP classifier with ReLu activation function
    classifier = myMLP(rng=rng, 
                       input=x, 
                       n_in=nins, 
                       n_hidden=n_hidden, 
                       n_out=10, 
                       n_hiddenLayers=n_hiddenLayers, 
                       stochastic=stochastic,
                       binary=binary,
                       activation=T.nnet.relu,
                       outputlayer=outputlayer)
    
    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    cost = (
        classifier.cost(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )

    train_model_perf = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        }
    )
    # compute the gradient of cost with respect to theta (stored in params);
    # the resulting gradients will be stored in a list gparams
    

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs

    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of same size, where each
    # element is a pair formed from the two lists :
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    
    ## calculate gradients using binarized weights if binary
    if binary:
        W0 = theano.shared(classifier.W0, name='W0', borrow=True)
        updates = []
        # iterate over each per-layer parameter slot (i) and over each layer (j),
        # indexing into the flat classifier.params list
        for i in range(classifier.len_params):
            for j in range(n_hiddenLayers+1):
                p = classifier.params[j*(classifier.len_params)+i]
                if p.name in ('beta', 'gamma', 'b', 'Wb'):
                    # batch-norm parameters, biases, and the binarized weights Wb
                    # use the plain SGD update
                    u = p - learning_rate * T.grad(cost, p)
                    updates.append((p, u))
                elif p.name == 'W':
                    # the binarized weights Wb of the same layer are expected two slots
                    # after W in the params list; the gradient w.r.t. Wb is applied to
                    # the real-valued W, which is then clipped to [-W0, W0]
                    n_Wb = classifier.params[j*(classifier.len_params)+i+2]
                    u = p - learning_rate * T.grad(cost, n_Wb)
                    u = T.clip(u, -W0, W0)
                    updates.append((p, u))
                else:
                    continue
    else:
        # standard SGD updates on the real-valued parameters
        gparams = T.grad(cost, classifier.params)
        updates = [(p, p - learning_rate * gp) for p, gp in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # at the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    train_nn(decay_learning_rate, train_model, train_model_perf, validate_model, test_model,
        n_train_batches, n_valid_batches, n_test_batches, n_epochs, learning_rate, which_data,
        stochastic, binary, outputlayer, verbose)
    
    # return first hidden layer weights for image plotting
    return classifier.hiddenLayers[0].W.get_value(), classifier.hiddenLayers[0].Wb.get_value()