Example 1
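All four examples below are Theano/Lasagne training scripts. A minimal sketch of the imports they rely on is shown here; `lab`, `optimizer` and `batch_norm` are project-local modules (binarized layers and helpers, the Adam variant, and batch normalization), so the exact module names are an assumption.

# Assumed common imports (a sketch; the project-local module names may differ)
from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as T
import lasagne

import lab         # project-local: binarized layers, binary_tanh_unit, compute_grads, clipping_scaling, train, ...
import optimizer   # project-local: adam
import batch_norm  # project-local: BatchNormLayer
# Examples 2-4 additionally load their datasets (SVHN, MNIST, ZCA_Dataset, serial) via pylearn2.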
def main(method,LR_start,Binarize_weight_only, SEQ_LENGTH):

	lasagne.random.set_rng(np.random.RandomState(1))

	name = "linux"
	print("dataset = "+str(name))

	print("Binarize_weight_only="+str(Binarize_weight_only))

	print("Method = "+str(method))

	# Sequence length (passed in as an argument); typical values are 50, 100 or 200
	print("SEQ_LENGTH = "+str(SEQ_LENGTH))

	# Number of units in the hidden (LSTM) layer
	N_HIDDEN = 512
	print("N_HIDDEN = "+str(N_HIDDEN))

	# Gradients above this value will be clipped.
	# grad_clipping clips the gradient at every time step inside the recurrence,
	# whereas clipping the final gradient (e.g. with T.clip) would clip the accumulated gradient as a whole.
	GRAD_CLIP = 5.
	print("GRAD_CLIP ="+str(GRAD_CLIP))

	# Number of epochs to train the net
	num_epochs = 200
	print("num_epochs = "+str(num_epochs))

	# Batch Size
	batch_size = 100
	print("batch_size = "+str(batch_size))
	 
	print("LR_start = "+str(LR_start))
	LR_decay = 0.98
	print("LR_decay="+str(LR_decay))

	if Binarize_weight_only =="w":
		activation = lasagne.nonlinearities.tanh
	else:
		activation = lab.binary_tanh_unit
	print("activation = "+ str(activation))

	name = name+"_"+Binarize_weight_only

	## Load the data; adjust the path below if the data file lives elsewhere.
	with open('data/linux_input.txt', 'r') as f:
		in_text = f.read()

	generation_phrase = "Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar\n *\n * This file contains the interrupt probing code and driver APIs.\n */\n\n#include"
	#This snippet loads the text file and creates dictionaries to 
	#encode characters into a vector-space representation and vice-versa. 
	chars = list(set(in_text))
	data_size, vocab_size = len(in_text), len(chars)
	char_to_ix = { ch:i for i,ch in enumerate(chars) }
	ix_to_char = { i:ch for i,ch in enumerate(chars) }

	num_splits = [0.9, 0.05, 0.05]
	num_splits_all = np.floor(data_size/batch_size/SEQ_LENGTH)
	num_train = np.floor(num_splits_all*num_splits[0])
	num_val   = np.floor(num_splits_all*num_splits[1])
	num_test  = num_splits_all - num_train - num_val

	train_X = in_text[0:(num_train*batch_size*SEQ_LENGTH+1).astype('int32')]
	val_X = in_text[(num_train*batch_size*SEQ_LENGTH).astype('int32'):((num_train+num_val)*batch_size*SEQ_LENGTH+1).astype('int32')]
	test_X = in_text[((num_train+num_val)*batch_size*SEQ_LENGTH).astype('int32'):(num_splits_all*batch_size*SEQ_LENGTH+1).astype('int32')]


	## build model
	print('Building the model...') 
		
	# input = T.tensor3('inputs')
	target = T.imatrix('target')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	# (batch size, SEQ_LENGTH, num_features)
	l_in = lasagne.layers.InputLayer(shape=(None, None, vocab_size))
	l_forward_2 = lab.LSTMLayer(
				l_in, 
				num_units=N_HIDDEN,
				grad_clipping=GRAD_CLIP,
				peepholes=False,
				nonlinearity=activation, ### changing this activation makes the hidden state binary
				method=method)   ### output shape: (batch_size, SEQ_LENGTH, N_HIDDEN)

	l_shp = lasagne.layers.ReshapeLayer(l_forward_2, (-1, N_HIDDEN))  ## (batch_size*SEQ_LENGTH, N_HIDDEN)
	l_out = lasagne.layers.DenseLayer(l_shp, num_units=vocab_size, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)
	batchsize, seqlen, _ = l_in.input_var.shape
	l_shp1 = lasagne.layers.ReshapeLayer(l_out, (batchsize, seqlen, vocab_size))
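	# take only the last time step of the (batch, time, vocab) softmax output (axis 1, index -1); used by `probs` below for generation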
	l_out1 = lasagne.layers.SliceLayer(l_shp1, -1, 1)

	train_output = lasagne.layers.get_output(l_out, deterministic=False)
	loss = T.nnet.categorical_crossentropy(train_output,target.flatten()).mean()


	if method != "FPN":
		# W updates
		W = lasagne.layers.get_all_params(l_out, binary=True)
		W_grads = lab.compute_grads(loss,l_out) 
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR, epsilon=1e-8)   ### a different optimizer could be substituted here
		updates = lab.clipping_scaling(updates,l_out)

		# other parameters updates
		params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
		updates = OrderedDict(list(updates.items()) + list(optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR, epsilon=1e-8).items()))

		## update the 2nd-moment (Adam-style) accumulator
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(l_out, acc=True)	
		idx = 0
		beta2 = 0.999
		for acc_tag_temp in acc_tag:
			# updates3[acc_tag_temp]=updates.keys()[idx]
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(list(updates.items()) + list(updates3.items()))

	else:
		params_other = lasagne.layers.get_all_params(l_out, trainable=True)
		
		W_grads = [theano.grad(loss, wrt=l_forward_2.W_in_to_ingate), theano.grad(loss, wrt=l_forward_2.W_hid_to_ingate),
				theano.grad(loss, wrt=l_forward_2.W_in_to_forgetgate), theano.grad(loss, wrt=l_forward_2.W_hid_to_forgetgate),
				theano.grad(loss, wrt=l_forward_2.W_in_to_cell), theano.grad(loss, wrt=l_forward_2.W_hid_to_cell),
				theano.grad(loss, wrt=l_forward_2.W_in_to_outgate), theano.grad(loss, wrt=l_forward_2.W_hid_to_outgate)]
		
		updates = optimizer.adam(loss_or_grads=loss, params=params_other, learning_rate=LR)

	test_output = lasagne.layers.get_output(l_out, deterministic=True)
	test_loss = T.nnet.categorical_crossentropy(test_output,target.flatten()).mean()
			


	train_fn = theano.function([l_in.input_var, target, LR], [loss, W_grads[5]], updates=updates, allow_input_downcast=True)
	val_fn = theano.function([l_in.input_var, target], test_loss, allow_input_downcast=True)
	probs = theano.function([l_in.input_var],lasagne.layers.get_output(l_out1), allow_input_downcast=True)

	
	print('Training...')
	
	lab.train(
			name, method,
			train_fn,val_fn,
			batch_size,
			SEQ_LENGTH,
			N_HIDDEN,
			LR_start,LR_decay,
			num_epochs,
			train_X,
			val_X,
			test_X)
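
A hypothetical entry point for this script; the argument values below are illustrative assumptions, not taken from the original ("FPN" is the full-precision baseline seen in the code, other method names are defined by the project's `lab` module).

if __name__ == '__main__':
	# illustrative call; "w" binarizes the weights only and keeps the tanh activation
	main(method="FPN", LR_start=2e-3, Binarize_weight_only="w", SEQ_LENGTH=100)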
Example 2
def main(method, LR_start, Binarize_weight_only):

    name = "svhn"
    print("dataset = " + str(name))

    print("Binarize_weight_only=" + str(Binarize_weight_only))

    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    batch_size = 50
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))
    # Note: LR decay might also be good for the BN moving averages...

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    ## number of filters in the first convolutional layer
    K = 64
    print("K=" + str(K))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    l_in = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)

    # KC3-KC3-P2 (K = 64 here)
    l_cnn1 = lab.Conv2DLayer(l_in,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn1 = batch_norm.BatchNormLayer(l_cnn1, epsilon=epsilon, alpha=alpha)

    l_nl1 = lasagne.layers.NonlinearityLayer(l_bn1, nonlinearity=activation)

    l_cnn2 = lab.Conv2DLayer(l_nl1,
                             num_filters=K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))

    l_bn2 = batch_norm.BatchNormLayer(l_mp1, epsilon=epsilon, alpha=alpha)

    l_nl2 = lasagne.layers.NonlinearityLayer(l_bn2, nonlinearity=activation)
    # 2KC3-2KC3-P2
    l_cnn3 = lab.Conv2DLayer(l_nl2,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn3 = batch_norm.BatchNormLayer(l_cnn3, epsilon=epsilon, alpha=alpha)

    l_nl3 = lasagne.layers.NonlinearityLayer(l_bn3, nonlinearity=activation)

    l_cnn4 = lab.Conv2DLayer(l_nl3,
                             num_filters=2 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))

    l_bn4 = batch_norm.BatchNormLayer(l_mp2, epsilon=epsilon, alpha=alpha)

    l_nl4 = lasagne.layers.NonlinearityLayer(l_bn4, nonlinearity=activation)

    # 4KC3-4KC3-P2
    l_cnn5 = lab.Conv2DLayer(l_nl4,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_bn5 = batch_norm.BatchNormLayer(l_cnn5, epsilon=epsilon, alpha=alpha)

    l_nl5 = lasagne.layers.NonlinearityLayer(l_bn5, nonlinearity=activation)

    l_cnn6 = lab.Conv2DLayer(l_nl5,
                             num_filters=4 * K,
                             filter_size=(3, 3),
                             pad=1,
                             nonlinearity=lasagne.nonlinearities.identity,
                             method=method)

    l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))

    l_bn6 = batch_norm.BatchNormLayer(l_mp3, epsilon=epsilon, alpha=alpha)

    l_nl6 = lasagne.layers.NonlinearityLayer(l_bn6, nonlinearity=activation)

    # print(l_nl6.output_shape)

    # 1024FP-1024FP-10FP
    l_dn1 = lab.DenseLayer(l_nl6,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn7 = batch_norm.BatchNormLayer(l_dn1, epsilon=epsilon, alpha=alpha)

    l_nl7 = lasagne.layers.NonlinearityLayer(l_bn7, nonlinearity=activation)

    l_dn2 = lab.DenseLayer(l_nl7,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=1024,
                           method=method)

    l_bn8 = batch_norm.BatchNormLayer(l_dn2, epsilon=epsilon, alpha=alpha)

    l_nl8 = lasagne.layers.NonlinearityLayer(l_bn8, nonlinearity=activation)

    l_dn3 = lab.DenseLayer(l_nl8,
                           nonlinearity=lasagne.nonlinearities.identity,
                           num_units=10,
                           method=method)

    l_out = batch_norm.BatchNormLayer(l_dn3, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":
        # W updates
        W = lasagne.layers.get_all_params(l_out, binary=True)
        W_grads = lab.compute_grads(loss, l_out)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = lab.clipping_scaling(updates, l_out)

        # other parameters updates
        params = lasagne.layers.get_all_params(l_out,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(list(updates.items()) + list(optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items()))

        ## update the 2nd-moment accumulator (this could also be taken from the Adam optimizer)
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(l_out, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1

        updates = OrderedDict(list(updates.items()) + list(updates3.items()))
    else:
        params = lasagne.layers.get_all_params(l_out, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    ## load data
    print('Loading SVHN dataset')

    train_set = SVHN(
        which_set='splitted_train',
        # which_set= 'valid',
        path="${SVHN_LOCAL_PATH}",
        axes=['b', 'c', 0, 1])

    valid_set = SVHN(which_set='valid',
                     path="${SVHN_LOCAL_PATH}",
                     axes=['b', 'c', 0, 1])

    test_set = SVHN(which_set='test',
                    path="${SVHN_LOCAL_PATH}",
                    axes=['b', 'c', 0, 1])

    # bc01 format
    # print train_set.X.shape
    train_set.X = np.reshape(train_set.X, (-1, 3, 32, 32))
    valid_set.X = np.reshape(valid_set.X, (-1, 3, 32, 32))
    test_set.X = np.reshape(test_set.X, (-1, 3, 32, 32))

    train_set.y = np.array(train_set.y).flatten()
    valid_set.y = np.array(valid_set.y).flatten()
    test_set.y = np.array(test_set.y).flatten()

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Training...')

    # ipdb.set_trace()
    lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay,
              num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y,
              test_set.X, test_set.y)
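
As with Example 1, a hypothetical invocation; the values are illustrative assumptions, and ${SVHN_LOCAL_PATH} must point at the preprocessed SVHN data expected by pylearn2.

if __name__ == '__main__':
    # illustrative call; LR_start is an assumed starting learning rate
    main(method="FPN", LR_start=1e-3, Binarize_weight_only="w")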
Example 3
def main(method, LR_start, Binarize_weight_only):

    # BN parameters
    name = "mnist"
    print("dataset = " + str(name))

    print("Binarize_weight_only=" + str(Binarize_weight_only))

    print("Method = " + str(method))

    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    batch_size = 100
    print("batch_size = " + str(batch_size))

    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    # network structure
    num_units = 2048
    print("num_units = " + str(num_units))
    n_hidden_layers = 3
    print("n_hidden_layers = " + str(n_hidden_layers))

    print("LR_start = " + str(LR_start))
    LR_decay = 0.1
    print("LR_decay=" + str(LR_decay))

    if Binarize_weight_only == "w":
        activation = lasagne.nonlinearities.rectify
    else:
        activation = lab.binary_tanh_unit
    print("activation = " + str(activation))

    print('Loading MNIST dataset...')

    train_set = MNIST(which_set='train', start=0, stop=50000, center=True)
    valid_set = MNIST(which_set='train', start=50000, stop=60000, center=True)
    test_set = MNIST(which_set='test', center=True)

    # bc01 format
    train_set.X = train_set.X.reshape(-1, 1, 28, 28)
    valid_set.X = valid_set.X.reshape(-1, 1, 28, 28)
    test_set.X = test_set.X.reshape(-1, 1, 28, 28)

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    # Onehot the targets
    train_set.y = np.float32(np.eye(10)[train_set.y])
    valid_set.y = np.float32(np.eye(10)[valid_set.y])
    test_set.y = np.float32(np.eye(10)[test_set.y])

    # for hinge loss
    train_set.y = 2 * train_set.y - 1.
    valid_set.y = 2 * valid_set.y - 1.
    test_set.y = 2 * test_set.y - 1.

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input)

    for k in range(n_hidden_layers):
        mlp = lab.DenseLayer(mlp,
                             nonlinearity=lasagne.nonlinearities.identity,
                             num_units=num_units,
                             method=method)
        mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

    mlp = lab.DenseLayer(mlp,
                         nonlinearity=lasagne.nonlinearities.identity,
                         num_units=10,
                         method=method)

    mlp = batch_norm.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(mlp, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if method != "FPN":

        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = lab.compute_grads(loss, mlp)
        updates = optimizer.adam(loss_or_grads=W_grads,
                                 params=W,
                                 learning_rate=LR)
        updates = lab.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp,
                                               trainable=True,
                                               binary=False)
        updates = OrderedDict(list(updates.items()) + list(optimizer.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items()))

        ## update the 2nd-moment accumulator (this could also be taken from the Adam optimizer)
        updates3 = OrderedDict()
        acc_tag = lasagne.layers.get_all_params(mlp, acc=True)
        idx = 0
        beta2 = 0.999
        for acc_tag_temp in acc_tag:
            updates3[acc_tag_temp] = acc_tag_temp * beta2 + W_grads[idx] * W_grads[idx] * (1 - beta2)
            idx = idx + 1

        updates = OrderedDict(list(updates.items()) + list(updates3.items()))

    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = optimizer.adam(loss_or_grads=loss,
                                 params=params,
                                 learning_rate=LR)

    test_output = lasagne.layers.get_output(mlp, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1),
                            T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')

    lab.train(name, method, train_fn, val_fn, batch_size, LR_start, LR_decay,
              num_epochs, train_set.X, train_set.y, valid_set.X, valid_set.y,
              test_set.X, test_set.y)
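
Again, a hypothetical invocation with illustrative values; the MNIST data is loaded through pylearn2's MNIST class.

if __name__ == '__main__':
    # illustrative call; passing any value other than "w" would also binarize the activations
    main(method="FPN", LR_start=1e-3, Binarize_weight_only="w")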
Example 4
def main(method,LR_start,Binarize_weight_only):
	
	name = "cifar"
	print("dataset = "+str(name))

	print("Binarize_weight_only="+str(Binarize_weight_only))

	print("Method = "+str(method))

	# alpha is the exponential moving average factor
	alpha = .1
	print("alpha = "+str(alpha))
	epsilon = 1e-4
	print("epsilon = "+str(epsilon))
	
	# Training parameters
	batch_size = 50
	print("batch_size = "+str(batch_size))
	
	num_epochs = 200
	print("num_epochs = "+str(num_epochs))

	print("LR_start = "+str(LR_start))
	LR_decay = 0.5
	print("LR_decay="+str(LR_decay))

	if Binarize_weight_only =="w":
		activation = lasagne.nonlinearities.rectify
	else:
		activation = lab.binary_tanh_unit
	print("activation = "+ str(activation))
	

	train_set_size = 45000
	print("train_set_size = "+str(train_set_size))
	
	print('Loading CIFAR-10 dataset...')
	
	preprocessor = serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/preprocessor.pkl")
	train_set = ZCA_Dataset(
		preprocessed_dataset=serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 
		preprocessor = preprocessor,
		start=0, stop = train_set_size)
	valid_set = ZCA_Dataset(
		preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/train.pkl"), 
		preprocessor = preprocessor,
		start=45000, stop = 50000)  
	test_set = ZCA_Dataset(
		preprocessed_dataset= serial.load("${PYLEARN2_DATA_PATH}/cifar10/pylearn2_gcn_whitened/test.pkl"), 
		preprocessor = preprocessor)
		
	# bc01 format
	train_set.X = train_set.X.reshape(-1,3,32,32)
	valid_set.X = valid_set.X.reshape(-1,3,32,32)
	test_set.X = test_set.X.reshape(-1,3,32,32)
	
	# flatten targets
	train_set.y = np.hstack(train_set.y)
	valid_set.y = np.hstack(valid_set.y)
	test_set.y = np.hstack(test_set.y)

   
	# Onehot the targets
	train_set.y = np.float32(np.eye(10)[train_set.y])    
	valid_set.y = np.float32(np.eye(10)[valid_set.y])
	test_set.y = np.float32(np.eye(10)[test_set.y])
	
	# for hinge loss
	train_set.y = 2* train_set.y - 1.
	valid_set.y = 2* valid_set.y - 1.
	test_set.y = 2* test_set.y - 1.

	print('Building the CNN...') 
	
	# Prepare Theano variables for inputs and targets
	input = T.tensor4('inputs')
	target = T.matrix('targets')
	LR = T.scalar('LR', dtype=theano.config.floatX)

	l_in = lasagne.layers.InputLayer(
			shape=(None, 3, 32, 32),
			input_var=input)
	
	# 128C3-128C3-P2             
	l_cnn1 = lab.Conv2DLayer(
			l_in, 
			num_filters=128, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)

	l_bn1 = batch_norm.BatchNormLayer(
			l_cnn1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl1 = lasagne.layers.NonlinearityLayer(
			l_bn1,
			nonlinearity = activation)

	l_cnn2 = lab.Conv2DLayer(
			l_nl1, 
			num_filters=128, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp1 = lasagne.layers.MaxPool2DLayer(l_cnn2, pool_size=(2, 2))
	
	l_bn2 = batch_norm.BatchNormLayer(
			l_mp1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl2 = lasagne.layers.NonlinearityLayer(
			l_bn2,
			nonlinearity = activation)			
	# 256C3-256C3-P2             
	l_cnn3 = lab.Conv2DLayer(
			l_nl2, 
			num_filters=256, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_bn3 = batch_norm.BatchNormLayer(
			l_cnn3,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl3 = lasagne.layers.NonlinearityLayer(
			l_bn3,
			nonlinearity = activation)
			
	l_cnn4 = lab.Conv2DLayer(
			l_nl3, 
			num_filters=256, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp2 = lasagne.layers.MaxPool2DLayer(l_cnn4, pool_size=(2, 2))
	
	l_bn4 = batch_norm.BatchNormLayer(
			l_mp2,
			epsilon=epsilon, 
			alpha=alpha)
	
	l_nl4 = lasagne.layers.NonlinearityLayer(
			l_bn4,
			nonlinearity = activation)

	# 512C3-512C3-P2              
	l_cnn5 = lab.Conv2DLayer(
			l_nl4, 
			num_filters=512, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_bn5 = batch_norm.BatchNormLayer(
			l_cnn5,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl5 = lasagne.layers.NonlinearityLayer(
			l_bn5,
			nonlinearity = activation)
				  
	l_cnn6 = lab.Conv2DLayer(
			l_nl5, 
			num_filters=512, 
			filter_size=(3, 3),
			pad=1,
			nonlinearity=lasagne.nonlinearities.identity,
			method = method)
	
	l_mp3 = lasagne.layers.MaxPool2DLayer(l_cnn6, pool_size=(2, 2))
	
	l_bn6 = batch_norm.BatchNormLayer(
			l_mp3,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl6 = lasagne.layers.NonlinearityLayer(
			l_bn6,
			nonlinearity = activation)

	# print(l_nl6.output_shape)
	
	# 1024FP-1024FP-10FP            
	l_dn1 = lab.DenseLayer(
				l_nl6, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=1024,
				method = method)      
				  
	l_bn7 = batch_norm.BatchNormLayer(
			l_dn1,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl7 = lasagne.layers.NonlinearityLayer(
			l_bn7,
			nonlinearity = activation)

	l_dn2 = lab.DenseLayer(
				l_nl7, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=1024,
				method = method)      
				  
	l_bn8 = batch_norm.BatchNormLayer(
			l_dn2,
			epsilon=epsilon, 
			alpha=alpha)

	l_nl8 = lasagne.layers.NonlinearityLayer(
			l_bn8,
			nonlinearity = activation)

	l_dn3 = lab.DenseLayer(
				l_nl8, 
				nonlinearity=lasagne.nonlinearities.identity,
				num_units=10,
				method = method)      
				  
	l_out = batch_norm.BatchNormLayer(
			l_dn3,
			epsilon=epsilon, 
			alpha=alpha)

	train_output = lasagne.layers.get_output(l_out, deterministic=False)
	
	# squared hinge loss
	loss = T.mean(T.sqr(T.maximum(0.,1.-target*train_output)))
	
	if method != "FPN":
		# W updates
		W = lasagne.layers.get_all_params(l_out, binary=True)
		W_grads = lab.compute_grads(loss,l_out)
		updates = optimizer.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
		updates = lab.clipping_scaling(updates,l_out)
		
		# other parameters updates
		params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
		updates = OrderedDict(list(updates.items()) + list(optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR).items()))

		## update the 2nd-moment accumulator (this could also be taken from the Adam optimizer)
		updates3 = OrderedDict()
		acc_tag = lasagne.layers.get_all_params(l_out, acc=True)	
		idx = 0
		beta2 = 0.999   
		for acc_tag_temp in acc_tag:
			updates3[acc_tag_temp]= acc_tag_temp*beta2 + W_grads[idx]*W_grads[idx]*(1-beta2)
			idx = idx+1

		updates = OrderedDict(list(updates.items()) + list(updates3.items()))
	else:
		params = lasagne.layers.get_all_params(l_out, trainable=True)
		updates = optimizer.adam(loss_or_grads=loss, params=params, learning_rate=LR)

	test_output = lasagne.layers.get_output(l_out, deterministic=True)
	test_loss = T.mean(T.sqr(T.maximum(0.,1.-target*test_output)))
	test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),dtype=theano.config.floatX)
	
	# Compile a function performing a training step on a mini-batch (by giving the updates dictionary) 
	# and returning the corresponding training loss:
	train_fn = theano.function([input, target, LR], loss, updates=updates)
	val_fn = theano.function([input, target], [test_loss, test_err])

	print('Training...')
	
	lab.train(
			name, method,
			train_fn,val_fn,
			batch_size,
			LR_start,LR_decay,
			num_epochs,
			train_set.X,train_set.y,
			valid_set.X,valid_set.y,
			test_set.X,test_set.y)
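
Finally, a hypothetical invocation for the CIFAR-10 script; the values are illustrative assumptions, and ${PYLEARN2_DATA_PATH} must contain the GCN/ZCA-whitened CIFAR-10 data produced by pylearn2.

if __name__ == '__main__':
	# illustrative call; any value other than "w" would binarize the activations as well
	main(method="FPN", LR_start=1e-3, Binarize_weight_only="w")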