def getGeneratorUpdates(self, loss, aLearningRate, aBeta1):
    LR = aLearningRate
    mlp = self.gen
    if self.IS_BINARY:
        # W updates: extract the 'binary' parameters only
        Wb_list = lasagne.layers.get_all_params(self.gen, binary=True)
        for eW in Wb_list:
            # print('eW:', type(eW), eW)
            pass
        # Make a list of the gradients w.r.t. the binary parameters
        W_grad_list = binary_net.compute_grads(loss, mlp)
        # print('W_grad_list', type(W_grad_list), W_grad_list)
        # Update map (OrderedDict) produced by the Adam learning rule
        updates_b0 = lasagne.updates.adam(loss_or_grads=W_grad_list, params=Wb_list,
                                          learning_rate=LR, beta1=aBeta1)
        # clipping & scaling for binarization
        updates_b1 = binary_net.clipping_scaling(updates_b0, mlp)
        # other (non-binary) parameters updates
        Wr_list = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)
        for Wr in Wr_list:
            # print('Wr:', type(Wr), Wr)
            pass
        # Merging the parameters: binary params + other params
        # (items() concatenation assumes Python 2; on Python 3 use OrderedDict.update)
        updates = OrderedDict(updates_b1.items() +
                              lasagne.updates.adam(loss_or_grads=loss, params=Wr_list,
                                                   learning_rate=LR, beta1=aBeta1).items())
    else:
        Wr_list = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=Wr_list,
                                       learning_rate=LR, beta1=aBeta1)
    return updates
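# A minimal, self-contained sketch (plain Python, no Theano/Lasagne required) of the update
# merging done above. OrderedDict(a.items() + b.items()) only works on Python 2, where
# items() returns lists; on Python 3, items() returns views, so update() is the portable
# way to obtain the same ordered result. The parameter names below are placeholders.
from collections import OrderedDict

def merge_updates(binary_updates, real_updates):
    """Return one OrderedDict with the binary-parameter updates first."""
    merged = OrderedDict(binary_updates)
    merged.update(real_updates)
    return merged

u_binary = OrderedDict([('Wb0', 0.1), ('Wb1', -0.2)])   # toy 'param: new value' pairs
u_real = OrderedDict([('beta', 0.0), ('gamma', 1.0)])
print(list(merge_updates(u_binary, u_real).items()))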
# if this does not work, try:
# train_0_when_0 = batch_size - T.sum(T.or_(T.argmax(train_output, axis=1), T.argmax(target, axis=1)), dtype=theano.config.floatX)
train_precision = train_0_when_0 / (train_0_when_0 + train_0_when_1)  # TP/(TP+FP)
train_recall = train_0_when_0 / (train_0_when_0 + train_1_when_0)     # TP/(TP+FN)

if binary:
    # W updates
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)
    # other parameters updates
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())
else:
    params = lasagne.layers.get_all_params(cnn, trainable=True)
    updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

test_output = lasagne.layers.get_output(cnn, deterministic=True)
    cnn, epsilon=epsilon, alpha=alpha)

train_output = lasagne.layers.get_output(cnn, deterministic=False)

# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

if binary:
    # W updates
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)
    # other parameters updates
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())
else:
    params = lasagne.layers.get_all_params(cnn, trainable=True)
    updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

test_output = lasagne.layers.get_output(cnn, deterministic=True)
test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                  dtype=theano.config.floatX)

# Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
# and returning the corresponding training loss:
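# Self-contained numpy sketch (not part of the original script) of the squared hinge loss
# used throughout these snippets, mean(max(0, 1 - t*y)^2), with targets encoded as
# +/-1 one-hot rows. Values here are toy data.
import numpy as np

def squared_hinge(output, target):
    return np.mean(np.square(np.maximum(0., 1. - target * output)))

target = 2. * np.eye(10)[np.array([3, 7])] - 1.               # two samples, classes 3 and 7
output = np.random.RandomState(0).uniform(-1., 1., (2, 10))   # stand-in network outputs
print(squared_hinge(output, target))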
def main():
    # BN parameters
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    W_LR_scale = 1.
    # W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 50
    print("num_epochs = " + str(num_epochs))

    # Decaying LR
    LR_start = 0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.000003
    # LR_start = 0.01
    # print("LR_start = " + str(LR_start))
    # LR_fin = 1e-6
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    networkType = 'google'
    dataType = 'TCDTIMIT'
    # networkType = 'cifar10'
    # dataType = 'cifar10'

    # these batch sizes work for a GTX 1060 (6GB)
    if dataType == 'TCDTIMIT':
        if networkType == 'google':
            batch_size = 100
        else:
            batch_size = 24
    elif dataType == 'cifar10' and networkType == 'cifar10':
        batch_size = 400
    elif dataType == 'cifar10' and networkType == 'google':
        batch_size = 1000

    model_name = os.path.expanduser('~/TCDTIMIT/lipreading/TCDTIMIT/results/CNN_binaryNet/lipspeakers_') \
                 + networkType + "_phoneme39_binary" + "_" + dataType
    model_path = model_name + ".npz"
    if not os.path.exists(os.path.dirname(model_path)):
        os.makedirs(os.path.dirname(model_path))

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = buildCNN(networkType, dataType, input, epsilon, alpha, activation,
                   binary, stochastic, H, W_LR_scale)

    # restore network weights
    if os.path.exists(model_path):
        with np.load(model_path) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            try:
                lasagne.layers.set_all_param_values(cnn, *param_values)
            except:
                lasagne.layers.set_all_param_values(cnn, param_values)
            print("\n\n\t Loaded model " + model_path)

    print('Loading ' + dataType + ' dataset...')
    X_train, y_train, X_val, y_val, X_test, y_test = loadDataset(dataType)

    print("Building Functions...")
    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    # W updates
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # other parameters updates
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')
    binary_net.train(train_fn, val_fn, cnn, batch_size, LR_start, LR_decay, num_epochs,
                     X_train, y_train, X_val, y_val, X_test, y_test,
                     save_name=model_name, shuffle_parts=shuffle_parts, justTest=justTest)
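# Small self-contained sketch (plain Python, toy values) of the LR schedule used above:
# with LR_decay = (LR_fin / LR_start) ** (1 / num_epochs) and one multiplication per epoch,
# the learning rate reaches LR_fin (up to float rounding) after num_epochs epochs.
LR_start, LR_fin, num_epochs = 0.01, 0.000003, 50
LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)

LR = LR_start
for epoch in range(num_epochs):
    LR *= LR_decay
print(LR_decay, LR, LR_fin)   # LR ends up ~= LR_fin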
train_output = lasagne.layers.get_output(cnn, deterministic=False)

# squared hinge loss
loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

if args.train:
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)  # weight scaling disabled
        # other parameters updates
        params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=loss, params=params, learning_rate=LR).items())
    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    train_fn = theano.function([input, target, LR], loss, updates=updates)
def run(binary=False, noise=None, nalpha=0, result_path=None):
    # BN parameters
    batch_size = 128
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # Training parameters
    num_epochs = 150
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # default: .2
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5  # default: .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    if binary:
        activation = binary_net.binary_tanh_unit
        print("activation = binary_net.binary_tanh_unit")
    else:
        activation = lasagne.nonlinearities.tanh
        print("activation = lasagne.nonlinearities.tanh")

    # BinaryConnect
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    LR_start = 0.005
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000005  # 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    train_set_size = 40000
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))
    print("noise = " + str(noise))
    print("nalpha = " + str(nalpha))

    print('Loading CIFAR-10 dataset...')
    cifar = CifarReader("./data/cifar-10-batches-py/")
    train_X, train_y = cifar.get_train_data(n_samples=train_set_size, noise=noise, alpha=nalpha)
    valid_X, valid_y = cifar.get_validation_data()
    test_X, test_y = cifar.get_test_data()
    print("train_set_size = " + str(train_y.shape[0]))
    print("validation_set_size = " + str(valid_y.shape[0]))
    print("test_set_size = " + str(test_y.shape[0]))

    # Log output
    with open(result_path + "params.txt", "a+") as l:
        print("batch_size = " + str(batch_size), file=l)
        print("alpha = " + str(alpha), file=l)
        print("epsilon = " + str(epsilon), file=l)
        print("num_epochs = " + str(num_epochs), file=l)
        print("dropout_in = " + str(dropout_in), file=l)
        print("dropout_hidden = " + str(dropout_hidden), file=l)
        if binary:
            print("activation = binary_net.binary_tanh_unit", file=l)
        else:
            print("activation = lasagne.nonlinearities.tanh", file=l)
        print("binary = " + str(binary), file=l)
        print("stochastic = " + str(stochastic), file=l)
        print("H = " + str(H), file=l)
        print("W_LR_scale = " + str(W_LR_scale), file=l)
        print("LR_start = " + str(LR_start), file=l)
        print("LR_fin = " + str(LR_fin), file=l)
        print("LR_decay = " + str(LR_decay), file=l)
        print("shuffle_parts = " + str(shuffle_parts), file=l)
        print("noise = " + str(noise), file=l)
        print("nalpha = " + str(nalpha), file=l)
        print("train_set_size = " + str(train_y.shape[0]), file=l)
        print("validation_set_size = " + str(valid_y.shape[0]), file=l)
        print("test_set_size = " + str(test_y.shape[0]), file=l)

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_X = np.reshape(np.subtract(np.multiply(2. / 255., train_X), 1.), (-1, 3, 32, 32))
    valid_X = np.reshape(np.subtract(np.multiply(2. / 255., valid_X), 1.), (-1, 3, 32, 32))
    test_X = np.reshape(np.subtract(np.multiply(2. / 255., test_X), 1.), (-1, 3, 32, 32))

    # flatten targets
    train_y = np.hstack(train_y)
    valid_y = np.hstack(valid_y)
    test_y = np.hstack(test_y)

    # Onehot the targets
    train_y = np.float32(np.eye(10)[train_y])
    valid_y = np.float32(np.eye(10)[valid_y])
    test_y = np.float32(np.eye(10)[test_y])

    # for hinge loss
    train_y = 2 * train_y - 1.
    valid_y = 2 * valid_y - 1.
    test_y = 2 * test_y - 1.

    print('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = lasagne.layers.InputLayer(shape=(None, 3, 32, 32), input_var=input)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_in)

    # 32C3-64C3-P2
    cnn = binary_net.Conv2DLayer(cnn, binary=binary, stochastic=stochastic, H=H,
                                 W_LR_scale=W_LR_scale, num_filters=32, filter_size=(3, 3),
                                 pad=1, nonlinearity=lasagne.nonlinearities.identity)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)

    cnn = binary_net.Conv2DLayer(cnn, binary=binary, stochastic=stochastic, H=H,
                                 W_LR_scale=W_LR_scale, num_filters=64, filter_size=(3, 3),
                                 pad=1, nonlinearity=lasagne.nonlinearities.identity)
    cnn = lasagne.layers.MaxPool2DLayer(cnn, pool_size=(2, 2))
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    # 128FP-10FP
    cnn = binary_net.DenseLayer(cnn, binary=binary, stochastic=stochastic, H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity, num_units=128)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=activation)
    cnn = lasagne.layers.DropoutLayer(cnn, p=dropout_hidden)

    cnn = binary_net.DenseLayer(cnn, binary=binary, stochastic=stochastic, H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity, num_units=10)
    cnn = lasagne.layers.BatchNormLayer(cnn, epsilon=epsilon, alpha=alpha)
    cnn = lasagne.layers.NonlinearityLayer(cnn, nonlinearity=lasagne.nonlinearities.softmax)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:
        # W updates
        W = lasagne.layers.get_all_params(cnn, binary=True)
        W_grads = binary_net.compute_grads(loss, cnn)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, cnn)
        # other parameters updates
        params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
        updates.update(lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR))
    else:
        params = lasagne.layers.get_all_params(cnn, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')
    binary_net.train(train_fn, val_fn, cnn, batch_size, LR_start, LR_decay, num_epochs,
                     train_X, train_y, valid_X, valid_y, test_X, test_y,
                     shuffle_parts=shuffle_parts, result_path=result_path)
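# Self-contained numpy sketch (not part of the original script) of the preprocessing above:
# scale uint8 pixels from [0, 255] to [-1, +1], reshape to bc01, and encode integer labels
# as +/-1 one-hot rows for the hinge loss. Shapes and data are toy values.
import numpy as np

def preprocess(images_uint8, labels, n_classes=10, shape=(-1, 3, 32, 32)):
    X = np.reshape(2. / 255. * images_uint8 - 1., shape).astype(np.float32)
    y = (2. * np.eye(n_classes)[np.asarray(labels).ravel()] - 1.).astype(np.float32)
    return X, y

X, y = preprocess(np.random.randint(0, 256, (4, 3 * 32 * 32)), [0, 1, 2, 3])
print(X.shape, float(X.min()), float(X.max()), y[0])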
def main():
    # BN parameters
    batch_size = 200
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 500
    print("num_epochs = " + str(num_epochs))

    # Decaying LR
    LR_start = 0.01
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    train_set_size = 45000
    print("train_set_size = " + str(train_set_size))
    shuffle_parts = 1
    print("shuffle_parts = " + str(shuffle_parts))

    print('\nLoading CIFAR-10 dataset...')
    train_set = CIFAR10(which_set="train", start=0, stop=train_set_size)
    valid_set = CIFAR10(which_set="train", start=train_set_size, stop=50000)
    test_set = CIFAR10(which_set="test")

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_set.X = np.reshape(np.subtract(np.multiply(2. / 255., train_set.X), 1.), (-1, 3, 32, 32))
    valid_set.X = np.reshape(np.subtract(np.multiply(2. / 255., valid_set.X), 1.), (-1, 3, 32, 32))
    test_set.X = np.reshape(np.subtract(np.multiply(2. / 255., test_set.X), 1.), (-1, 3, 32, 32))

    # flatten targets
    train_set.y = np.hstack(train_set.y)
    valid_set.y = np.hstack(valid_set.y)
    test_set.y = np.hstack(test_set.y)

    if oneHot:
        # Onehot the targets
        train_set.y = np.float32(np.eye(10)[train_set.y])
        valid_set.y = np.float32(np.eye(10)[valid_set.y])
        test_set.y = np.float32(np.eye(10)[test_set.y])
        # for hinge loss
        train_set.y = 2 * train_set.y - 1.
        valid_set.y = 2 * valid_set.y - 1.
        test_set.y = 2 * test_set.y - 1.
    else:
        train_set.y = np.int32(train_set.y)
        valid_set.y = np.int32(valid_set.y)
        test_set.y = np.int32(test_set.y)

    # import pdb; pdb.set_trace()
    print('\nBuilding the CNN...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    if oneHot:
        target = T.matrix('targets')
    else:
        target = T.ivector('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    cnn = buildCNN(dataType='cifar10', networkType='cifar10', oneHot=oneHot, input=input,
                   epsilon=epsilon, alpha=alpha, activation=activation, binary=binary,
                   stochastic=stochastic, H=H, W_LR_scale=W_LR_scale)

    train_output = lasagne.layers.get_output(cnn, deterministic=False)

    if oneHot:
        # squared hinge loss
        loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    else:
        loss = LO.categorical_crossentropy(train_output, target)
        loss = loss.mean()

    # W updates
    W = lasagne.layers.get_all_params(cnn, binary=True)
    W_grads = binary_net.compute_grads(loss, cnn)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, cnn)

    # other parameters updates
    params = lasagne.layers.get_all_params(cnn, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())

    test_output = lasagne.layers.get_output(cnn, deterministic=True)
    if oneHot:
        test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                          dtype=theano.config.floatX)
    else:
        test_loss = LO.categorical_crossentropy(test_output, target)
        test_loss = test_loss.mean()
        # target is already a vector of class indices here, so compare against it directly
        test_err = T.mean(T.neq(T.argmax(test_output, axis=1), target),
                          dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')
    binary_net.train(train_fn, val_fn, cnn, batch_size, LR_start, LR_decay, num_epochs,
                     train_set.X, train_set.y, valid_set.X, valid_set.y,
                     test_set.X, test_set.y, shuffle_parts=shuffle_parts)
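# Self-contained numpy sketch (toy data) of the two target encodings selected by oneHot above:
# the squared hinge loss expects a float32 matrix of +/-1 rows, while categorical
# cross-entropy expects an int32 vector of class indices.
import numpy as np

labels = np.array([3, 1, 4])
targets_hinge = np.float32(2. * np.eye(10)[labels] - 1.)   # shape (3, 10), values in {-1, +1}
targets_xent = np.int32(labels)                            # shape (3,)
print(targets_hinge.shape, targets_hinge.dtype, targets_xent.shape, targets_xent.dtype)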
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA, FOLD,
          FINTUNE_SNAPSHOT, FINTUNE_SCALE):
    # BN parameters
    batch_size = 97
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    # NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    # N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    # LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    # LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x, train_y, val_x, val_y] = SFEW2.load_train_val()
    print(train_x.shape)
    print(train_y.shape)
    print(val_x.shape)
    print(val_y.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] / batch_size * batch_size) +
          ' / ' + str(batch_size))
    print('the last training minibatch should not be too small (unless it is 0); '
          'try decreasing batch_size rather than adding more minibatches.')
    print('minibatches size: ' + str(batch_size))
    print('suggested minibatches size: ' +
          str(math.ceil(float(train_x.shape[0]) / math.ceil(float(train_x.shape[0]) / 100))))

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]), input_var=input)
    mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_in)

    for k in range(N_HIDDEN_LAYERS):
        # pretrain-finetune
        if (k == 0):
            # fixed num_units
            mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                        W_LR_scale=W_LR_scale,
                                        nonlinearity=lasagne.nonlinearities.identity,
                                        num_units=1500)
            # scale down the LR of the transferred dense layer
            print('scale down the LR of the transferred dense layer from', str(mlp.W_LR_scale))
            mlp.W_LR_scale *= np.float32(FINTUNE_SCALE)
            print('to', str(mlp.W_LR_scale))
        else:
            mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                        W_LR_scale=W_LR_scale,
                                        nonlinearity=lasagne.nonlinearities.identity,
                                        num_units=NUM_UNITS)

        mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
        mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            if (FINTUNE_SNAPSHOT != 0):
                print('Load ./W-%d.npz' % FINTUNE_SNAPSHOT)
                with np.load('./W-%d.npz' % FINTUNE_SNAPSHOT) as f:
                    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
                    param_values = param_values[0:6]
                    lasagne.layers.set_all_param_values(mlp, param_values)

    mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity, num_units=7)
    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output BN or SGN
    if OUTPUT_TYPE == 'C':
        pass
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)

    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x == -1)).mean(0)
    LOSS_WEIGHT_1 = 1. + input * (2. * SPARSITY - 1)
    LOSS_WEIGHT_1 /= 4 * SPARSITY * (1 - SPARSITY)  # fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1. + input * (2. * SPARSITY_MAP - 1)
    LOSS_WEIGHT_2 /= 4 * SPARSITY_MAP * (1 - SPARSITY_MAP)  # weights considering the element's prior probability

    # train loss nodes
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE == 'SH':
        train_loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0., 1. - target * train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_2)
    else:
        assert (False)

    # + sparse penalty
    if LAMBDA > 0:
        train_pixel_wise_density = T.mean(
            T.reshape((train_output + 1.) / 2.,
                      [train_output.shape[0], train_output.shape[1] / 10, 10]),
            axis=2)
        train_penalty = LAMBDA * T.mean(T.sqr(train_pixel_wise_density - (1. - SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty

    # acc
    train_acc = T.mean(T.eq(T.argmax(train_output, axis=1), T.argmax(target, axis=1)),
                       dtype=theano.config.floatX)

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)
        # other parameters updates
        params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=train_loss, params=params, learning_rate=LR).items())
    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss, params=params, learning_rate=LR)

    # val loss nodes
    # must be created after grad nodes
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE == 'SH':
        val_loss = T.mean(T.sqr(T.maximum(0., 1. - target * val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0., 1. - target * val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_2)

    # + sparse penalty
    if LAMBDA > 0:
        val_pixel_wise_density = T.mean(
            T.reshape((val_output + 1.) / 2.,
                      [val_output.shape[0], val_output.shape[1] / 10, 10]),
            axis=2)
        val_penalty = LAMBDA * T.mean(T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    # acc
    val_acc = T.mean(T.eq(T.argmax(val_output, axis=1), T.argmax(target, axis=1)),
                     dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training train_loss:
    train_fn = theano.function([input, target, LR],
                               [train_loss, train_penalty, train_acc, train_output],
                               updates=updates)

    # Compile a second function computing the validation train_loss and accuracy:
    val_fn = theano.function([input, target],
                             [val_loss, val_penalty, val_acc, val_output])

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay, num_epochs,
                     train_x, train_y, val_x, val_y)
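# Self-contained numpy sketch (not part of the original script) of the sparsity penalty above:
# outputs in {-1, +1} are mapped to {0, 1}, averaged over groups of 10 units, and the group
# densities are pulled towards the target density 1 - SPARSITY by a squared penalty times LAMBDA.
import numpy as np

def sparsity_penalty(output, lam=1.0, sparsity=0.9, group=10):
    density = np.mean(((output + 1.) / 2.).reshape(output.shape[0], -1, group), axis=2)
    return lam * np.mean(np.square(density - (1. - sparsity)))

output = np.sign(np.random.RandomState(0).randn(4, 50))   # 4 samples, 50 binary units
print(sparsity_penalty(output))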
def main():
    # BN parameters
    batch_size = 100
    logger_lip.info("batch_size = %s", batch_size)
    # alpha is the exponential moving average factor
    alpha = .1
    logger_lip.info("alpha = %s", alpha)
    epsilon = 1e-4
    logger_lip.info("epsilon = %s", epsilon)

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_tanh_unit")
    stochastic = True
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Training parameters
    num_epochs = 50
    logger_lip.info("num_epochs = %s", num_epochs)

    # Decaying LR
    LR_start = 0.1
    logger_lip.info("LR_start = %s", LR_start)
    LR_fin = 0.0000003
    logger_lip.info("LR_fin = %s", LR_fin)
    # LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    LR_decay = 0.5  # sqrt(0.5)
    logger_lip.info("LR_decay = %s", LR_decay)
    # BTW, LR decay might be good for the BN moving average...

    shuffle_parts = 1
    logger_lip.info("shuffle_parts = %s", shuffle_parts)

    if binary:
        oneHot = True
    else:
        oneHot = False

    ##############################################
    network_type = "google"
    viseme = False  # will set nbClasses and store path (vis: 6,498,828 params, phn: 7,176,231 params)

    if viseme:
        nbClasses = 12
    else:
        nbClasses = 39

    # get the database
    # If it's small (lipspeakers) -> generate X_train, y_train etc. here
    # otherwise we need to load and generate each speaker separately in the training loop
    dataset = "TCDTIMIT"
    root_dir = os.path.join(os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset))
    results_dir = root_dir + "/results/CNN_binaryNet"
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    if viseme:
        database_binaryDir = root_dir + '/binaryViseme'
    else:
        database_binaryDir = root_dir + '/binary'
    datasetType = "lipspeakers"  # "lipspeakers" or "volunteers"
    ##############################################

    if datasetType == "lipspeakers":
        loadPerSpeaker = False  # only the lipspeaker set is small enough to fit in CPU RAM, so generate X_train etc. here
        storeProcessed = True
        processedDir = database_binaryDir + "_allLipspeakersProcessed"

        # TODO: prepLip_all can be used to generate a pkl containing all the lipspeaker data.
        # Not sure if this still works, so use with care!
        if not oneHot:
            pkl_path = processedDir + os.sep + datasetType + ".pkl"
        else:
            pkl_path = processedDir + os.sep + datasetType + "_oneHot" + ".pkl"
        if not os.path.exists(pkl_path):
            logger_lip.info("dataset not yet processed. Processing...")
            code.lipreading.preprocessLipreading.prepLip_all(
                data_path=database_binaryDir, store_path=pkl_path,
                trainFraction=0.7, validFraction=0.1, testFraction=0.2,
                nbClasses=nbClasses, onehot=oneHot, type=datasetType, verbose=True)

        datasetFiles = code.lipreading.general_tools.unpickle(pkl_path)
        X_train, y_train, X_val, y_val, X_test, y_test = datasetFiles
        dtypeX = 'float32'
        dtypeY = 'float32'
        X_train = X_train.astype(dtypeX)
        y_train = y_train.astype(dtypeY)
        X_val = X_val.astype(dtypeX)
        y_val = y_val.astype(dtypeY)
        X_test = X_test.astype(dtypeX)
        y_test = y_test.astype(dtypeY)
        datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test]

        # These files have been generated with datasetToPkl_fromCombined, so that the
        # train/val/test sets are the same as for combinedSR.
        # X_train, y_train = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTrain.pkl"))
        # X_val, y_val = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersVal.pkl"))
        # X_test, y_test = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTest.pkl"))
        # datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test]

    else:
        # we need to load and preprocess each speaker before we evaluate,
        # because the dataset is too large and doesn't fit in CPU RAM
        loadPerSpeaker = True
        storeProcessed = True  # if you have about 10GB hdd space, you can increase the speed by not reprocessing it each iteration
        processedDir = database_binaryDir + "_finalProcessed"
        # you can just run this program and it will generate the files the first time it encounters them,
        # or generate them manually with datasetToPkl.py

        # just get the names
        testVolunteerNumbers = ["13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F",
                                "31F", "34M", "36F", "37F", "43F", "47M", "51F", "54M"]
        testVolunteers = [str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers]
        lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"]
        allSpeakers = [f for f in os.listdir(database_binaryDir)
                       if os.path.isfile(os.path.join(database_binaryDir, f))
                       and os.path.splitext(f)[1] == ".pkl"]
        trainVolunteers = [f for f in allSpeakers
                           if not (f in testVolunteers or f in lipspeakers)]
        trainVolunteers = [vol for vol in trainVolunteers if vol is not None]

        if datasetType == "combined":
            trainingSpeakerFiles = trainVolunteers + lipspeakers
            testSpeakerFiles = testVolunteers
        elif datasetType == "volunteers":
            trainingSpeakerFiles = trainVolunteers
            testSpeakerFiles = testVolunteers
        else:
            raise Exception("invalid dataset entered")
        datasetFiles = [trainingSpeakerFiles, testSpeakerFiles]

    model_name = datasetType + "_" + network_type + "_" + ("viseme" if viseme else "phoneme") + str(nbClasses) \
                 + ("_binary" if binary else "")
    model_save_name = os.path.join(results_dir, model_name)

    # log file
    logFile = results_dir + os.sep + model_name + '.log'
    # if os.path.exists(logFile):
    #     fh = logging.FileHandler(logFileT)  # append to existing log
    # else:
    fh = logging.FileHandler(logFile, 'w')  # create new logFile
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger_lip.addHandler(fh)

    logger_lip.info('Building the CNN...')

    # Prepare Theano variables for inputs and targets
    inputs = T.tensor4('inputs')
    if oneHot:
        targets = T.matrix('targets')
    else:
        targets = T.ivector('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    # get the network structure
    l_out = code.lipreading.buildNetworks.build_network_google_binary(
        activation, alpha, epsilon, inputs, binary, stochastic, H, W_LR_scale)  # 7176231 params
    for layer in L.get_all_layers(l_out):
        print(layer)

    # print the number of network parameters
    logger_lip.info("Using the %s network", network_type)
    logger_lip.info("The number of parameters of this network: %s", L.count_params(l_out))

    logger_lip.info("loading %s", model_save_name + '.npz')
    load_model(model_save_name + '.npz', l_out)

    logger_lip.info("* COMPILING FUNCTIONS...")
    train_output = lasagne.layers.get_output(l_out, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - targets * train_output)))

    # W updates
    W = lasagne.layers.get_all_params(l_out, binary=True)
    W_grads = binary_net.compute_grads(loss, l_out)
    updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
    updates = binary_net.clipping_scaling(updates, l_out)

    # other parameters updates
    params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False)
    updates = OrderedDict(updates.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=params, learning_rate=LR).items())

    test_output = lasagne.layers.get_output(l_out, deterministic=True)
    out_fn = theano.function([inputs], test_output)

    test_loss = T.mean(T.sqr(T.maximum(0., 1. - targets * test_output)))
    test_acc = T.mean(T.eq(T.argmax(test_output, axis=1), T.argmax(targets, axis=1)),
                      dtype=theano.config.floatX)
    k = 3
    test_top3_acc = T.zeros((1,))
    topk_acc_fn = theano.function([], test_top3_acc)
    val_fn = theano.function([inputs, targets], [test_loss, test_acc, test_top3_acc])

    if debug:
        nb = 3
        debugX = X_train[0:nb]
        debugY = y_train[0:nb]
        out = out_fn(debugX)
        val = val_fn(debugX, debugY)
        import pdb
        pdb.set_trace()

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([inputs, targets, LR], loss, updates=updates)

    logger_lip.info('Training...')
    import code.lipreading.train_lipreading
    code.lipreading.train_lipreading.train(
        train_fn=train_fn, val_fn=val_fn, out_fn=out_fn, topk_acc_fn=topk_acc_fn, k=k,
        network_output_layer=l_out, batch_size=batch_size,
        LR_start=LR_start, LR_decay=LR_decay, num_epochs=num_epochs,
        dataset=datasetFiles, database_binaryDir=database_binaryDir,
        storeProcessed=storeProcessed, processedDir=processedDir,
        loadPerSpeaker=loadPerSpeaker, justTest=justTest,
        save_name=model_save_name, shuffleEnabled=True)
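# The script above stubs out test_top3_acc with T.zeros((1,)). For comparison, a self-contained
# numpy sketch (a hypothetical helper, not the project's topk_acc_fn) of top-k accuracy:
import numpy as np

def topk_accuracy(outputs, labels, k=3):
    topk = np.argsort(outputs, axis=1)[:, -k:]   # indices of the k largest outputs per sample
    return np.mean([labels[i] in topk[i] for i in range(len(labels))])

outputs = np.random.RandomState(0).randn(5, 39)  # 5 samples, 39 phoneme classes
labels = np.array([0, 1, 2, 3, 4])
print(topk_accuracy(outputs, labels, k=3))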
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA, FOLD):
    # BN parameters
    batch_size = 100
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    # NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    # N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    # LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    # LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x] = SFEW2.load_lfw()
    assert (train_x.shape[0] == 26404)
    train_x = train_x[0:26400, :]
    [val_x, _, _, _] = SFEW2.load_train_val()
    print(train_x.shape)
    print(val_x.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] / batch_size * batch_size) +
          ' / ' + str(batch_size))
    print('the last training minibatch should not be too small (unless it is 0); '
          'try decreasing batch_size rather than adding more minibatches.')
    print('minibatches size: ' + str(batch_size))
    print('suggested minibatches size: ' +
          str(math.ceil(float(train_x.shape[0]) / math.ceil(float(train_x.shape[0]) / 100))))

    ##############################################################################################
    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]), input_var=input)
    mlp = lasagne.layers.DropoutLayer(mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer

    for k in range(N_HIDDEN_LAYERS):
        if (k == 0):
            mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                        W_LR_scale=W_LR_scale,
                                        nonlinearity=lasagne.nonlinearities.identity,
                                        num_units=NUM_UNITS)
        elif (k == 1):
            mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                        W_LR_scale=W_LR_scale,
                                        nonlinearity=lasagne.nonlinearities.identity,
                                        num_units=NUM_UNITS * 2)
        else:
            assert (False)

        # if(k==0):
        #     print('scale down the LR of transferred dense layer from', str(mlp.W_LR_scale))
        #     mlp.W_LR_scale = 0
        #     print('to', str(mlp.W_LR_scale))

        if (k == 0):
            # BAE-1 encoder: BN
            mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        elif (k == 1):
            # BAE-2 encoder: do not use BN, to encourage sparsity
            pass
        else:
            # further layers use BN
            mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

        # the midactivation is taken before the hard tanh
        # encoder and decoder should not use BatchNorm
        # "l1 reg" on midactivation
        if (k == 1):
            mlp_midactivation = mlp

        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

        if (k == 0):
            mlp = lasagne.layers.DropoutLayer(mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer
        else:
            mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            print('Load ./W-1168.npz')
            with np.load('./W-1168.npz') as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
                param_values = param_values[0:6]
                lasagne.layers.set_all_param_values(mlp, param_values)

    mlp_groundtruth = mlp
    mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity, num_units=1500)
    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output BN or SGN
    if OUTPUT_TYPE == 'C':
        pass
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)

    '''
    # equal transform validation
    # 1 set AE transform to I
    # 1 modify AE DenseLayer.get_output_for() to use W (0/1) instead of Wb (+1/-1)
    # 2 set encoder's dropout=0
    # 3 comment out encoder's and decoder's BatchNormLayer, modify set_all_param_values
    # will see train loss = 0
    pv = lasagne.layers.get_all_param_values(mlp)
    pv[2] = np.identity(1500, np.float64)
    pv[4] = np.identity(1500, np.float64)
    lasagne.layers.set_all_param_values(mlp, pv)
    '''

    '''
    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x==-1)).mean(0)
    LOSS_WEIGHT_1 = 1.+input*(2.*SPARSITY-1)
    LOSS_WEIGHT_1 /= 4*SPARSITY*(1 - SPARSITY)  # fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1.+input*(2.*SPARSITY_MAP-1)
    LOSS_WEIGHT_2 /= 4*SPARSITY_MAP*(1 - SPARSITY_MAP)  # weights considering the element's prior probability
    '''

    # train loss nodes
    '''
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE=='SH':
        train_loss = T.mean(T.sqr(T.maximum(0.,1.-input*train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0.,1.-input*train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_2)
    else:
        assert(False)
    '''
    [train_output_mlp_groundtruth, train_output_mlp_midactivation, train_output] = \
        lasagne.layers.get_output([mlp_groundtruth, mlp_midactivation, mlp], deterministic=False)
    train_loss = T.mean(T.maximum(0., 1. - train_output_mlp_groundtruth * train_output))

    # + sparse penalty
    '''
    if LAMBDA>0:
        train_pixel_wise_density = T.mean(T.reshape((train_output+1.)/2.,
            [train_output.shape[0], train_output.shape[1]/10, 10]), axis=2)
        train_penalty = LAMBDA*T.mean(T.sqr(train_pixel_wise_density - (1.-SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty
    '''
    if LAMBDA > 0:
        train_penalty = LAMBDA * T.mean(T.maximum(0., 1. + train_output_mlp_midactivation))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)
        # untrainable W1
        assert (len(W) == 3)
        assert (len(W_grads) == 3)
        W = W[1:len(W)]
        W_grads = W_grads[1:len(W_grads)]
        assert (len(W) == 2)
        assert (len(W_grads) == 2)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)
        # other parameters updates
        params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)
        # untrainable b1 bn1
        assert (len(params) == 7)
        assert (params[0].name == 'b')      # fix
        assert (params[1].name == 'beta')   # fix
        assert (params[2].name == 'gamma')  # fix
        assert (params[3].name == 'b')
        assert (params[4].name == 'b')
        assert (params[5].name == 'beta')
        assert (params[6].name == 'gamma')
        params = params[3:len(params)]
        assert (len(params) == 4)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=train_loss, params=params, learning_rate=LR).items())
    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss, params=params, learning_rate=LR)

    ##############################################################################################
    # val loss nodes
    # must be created after grad nodes
    '''
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE=='SH':
        val_loss = T.mean(T.sqr(T.maximum(0.,1.-input*val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0.,1.-input*val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_2)
    '''
    [val_output_mlp_groundtruth, val_output_mlp_midactivation, val_output] = \
        lasagne.layers.get_output([mlp_groundtruth, mlp_midactivation, mlp], deterministic=True)
    val_loss = T.mean(T.maximum(0., 1. - val_output_mlp_groundtruth * val_output))

    # + sparse penalty
    '''
    if LAMBDA>0:
        val_pixel_wise_density = T.mean(T.reshape((val_output + 1.) / 2.,
            [val_output.shape[0], val_output.shape[1] / 10, 10]), axis=2)
        val_penalty = LAMBDA*T.mean(T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty
    '''
    if LAMBDA > 0:
        val_penalty = LAMBDA * T.mean(T.maximum(0., 1. + val_output_mlp_midactivation))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    ##############################################################################################
    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training train_loss:
    train_fn = theano.function(
        [input, LR],
        [train_loss, train_penalty, train_output_mlp_groundtruth,
         train_output_mlp_midactivation, train_output],
        updates=updates)

    ##############################################################################################
    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function(
        [input],
        [val_loss, val_penalty, val_output_mlp_groundtruth,
         val_output_mlp_midactivation, val_output])

    ##############################################################################################
    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay, num_epochs,
                     train_x, val_x, mlp)

    print('Save W')
    np.savez('./W.npz', *lasagne.layers.get_all_param_values(mlp))  # W b BN BN BN BN W b BN BN BN BN
def run(binary=False, noise=None, nalpha=0, result_path=None):
    # BN parameters
    batch_size = 128  # default: 100
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    alpha = .1  # default: .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4  # default: 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    num_units = 300  # default: 4096
    print("num_units = " + str(num_units))
    n_hidden_layers = 1  # default: 3
    print("n_hidden_layers = " + str(n_hidden_layers))

    # Training parameters
    num_epochs = 500  # default: 1000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # default: .2
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5  # default: .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    if binary:
        activation = binary_net.binary_tanh_unit
        print("activation = binary_net.binary_tanh_unit")
    else:
        activation = lasagne.nonlinearities.tanh
        print("activation = lasagne.nonlinearities.tanh")

    # BinaryConnect
    print("binary = " + str(binary))
    stochastic = False  # default: False
    print("stochastic = " + str(stochastic))
    # (-H,+H) are the two binary values
    # H = "Glorot"
    H = 1.  # default: 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # default: "Glorot"
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    LR_start = 0.005  # default: .003
    print("LR_start = " + str(LR_start))
    LR_fin = 0.0000005  # default: 0.0000003
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start) ** (1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    save_path = None  # default: "mnist_parameters.npz"
    print("save_path = " + str(save_path))

    # Load the dataset (https://github.com/mnielsen/neural-networks-and-deep-learning)
    print('Loading MNIST dataset...')
    mnist = MnistReader("./data/mnist.pkl.gz")

    shuffle_parts = 1  # default: 1
    print("shuffle_parts = " + str(shuffle_parts))
    print("noise = " + str(noise))
    print("nalpha = " + str(nalpha))

    train_set_size = 50000  # default: 50000
    train_X, train_y = mnist.get_train_data(n_samples=train_set_size, noise=noise, alpha=nalpha)
    validation_X, validation_y = mnist.get_validation_data()
    test_X, test_y = mnist.get_test_data()
    print("train_set_size = " + str(train_y.shape[0]))
    print("validation_set_size = " + str(validation_y.shape[0]))
    print("test_set_size = " + str(test_y.shape[0]))

    # Log output
    with open(result_path + "params.txt", "a+") as l:
        print("batch_size = " + str(batch_size), file=l)
        print("alpha = " + str(alpha), file=l)
        print("epsilon = " + str(epsilon), file=l)
        print("num_units = " + str(num_units), file=l)
        print("n_hidden_layers = " + str(n_hidden_layers), file=l)
        print("num_epochs = " + str(num_epochs), file=l)
        print("dropout_in = " + str(dropout_in), file=l)
        print("dropout_hidden = " + str(dropout_hidden), file=l)
        if binary:
            print("activation = binary_net.binary_tanh_unit", file=l)
        else:
            print("activation = lasagne.nonlinearities.tanh", file=l)
        print("binary = " + str(binary), file=l)
        print("stochastic = " + str(stochastic), file=l)
        print("H = " + str(H), file=l)
        print("W_LR_scale = " + str(W_LR_scale), file=l)
        print("LR_start = " + str(LR_start), file=l)
        print("LR_fin = " + str(LR_fin), file=l)
        print("LR_decay = " + str(LR_decay), file=l)
        print("save_path = " + str(save_path), file=l)
        print("shuffle_parts = " + str(shuffle_parts), file=l)
        print("noise = " + str(noise), file=l)
        print("nalpha = " + str(nalpha), file=l)
        print("train_set_size = " + str(train_y.shape[0]), file=l)
        print("validation_set_size = " + str(validation_y.shape[0]), file=l)
        print("test_set_size = " + str(test_y.shape[0]), file=l)

    # bc01 format
    # Inputs in the range [-1,+1]
    # print("Inputs in the range [-1,+1]")
    train_X = 2 * train_X.reshape(-1, 1, 28, 28) - 1.
    validation_X = 2 * validation_X.reshape(-1, 1, 28, 28) - 1.
    test_X = 2 * test_X.reshape(-1, 1, 28, 28) - 1.

    # flatten targets
    train_y = np.hstack(train_y)
    validation_y = np.hstack(validation_y)
    test_y = np.hstack(test_y)

    # Onehot the targets
    train_y = np.float32(np.eye(10)[train_y])
    validation_y = np.float32(np.eye(10)[validation_y])
    test_y = np.float32(np.eye(10)[test_y])

    # for hinge loss
    train_y = 2 * train_y - 1.
    validation_y = 2 * validation_y - 1.
    test_y = 2 * test_y - 1.

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.tensor4('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, 1, 28, 28), input_var=input)
    mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_in)

    for k in range(n_hidden_layers):
        mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                    W_LR_scale=W_LR_scale,
                                    nonlinearity=lasagne.nonlinearities.identity,
                                    num_units=num_units)
        mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
        mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

    mlp = binary_net.DenseLayer(mlp, binary=binary, stochastic=stochastic, H=H,
                                W_LR_scale=W_LR_scale,
                                nonlinearity=lasagne.nonlinearities.identity, num_units=10)
    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    train_output = lasagne.layers.get_output(mlp, deterministic=False)

    # squared hinge loss
    loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))

    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(loss, mlp)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)
        # other parameters updates
        params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)
        updates.update(lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR))
    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR)

    test_output = lasagne.layers.get_output(mlp, deterministic=True)
    test_loss = T.mean(T.sqr(T.maximum(0., 1. - target * test_output)))
    test_err = T.mean(T.neq(T.argmax(test_output, axis=1), T.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the updates dictionary)
    # and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR], loss, updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target], [test_loss, test_err])

    print('Training...')
    binary_net.train(train_fn, val_fn, mlp, batch_size, LR_start, LR_decay, num_epochs,
                     train_X, train_y, validation_X, validation_y, test_X, test_y,
                     save_path, shuffle_parts, result_path)
    # extract only the 'binary' parameters
    Wb_list = lasagne.layers.get_all_params(mlp, binary=True)
    for eW in Wb_list:
        print('eW:', type(eW), eW)

    # compute and list the gradients w.r.t. the binary parameters only
    W_grad_list = binary_net.compute_grads(loss, mlp)
    print('W_grad_list', type(W_grad_list), W_grad_list)

    # update map (OrderedDict) produced by the Adam learning rule
    updates_b0 = lasagne.updates.adam(loss_or_grads=W_grad_list, params=Wb_list, learning_rate=LR)

    # clipping & scaling for binarization
    updates_b1 = binary_net.clipping_scaling(updates_b0, mlp)

    # other parameters updates
    # update rule for the non-binary parameters
    Wr_list = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)

    # merge the binary and non-binary parameter updates
    updates = OrderedDict(updates_b1.items() + lasagne.updates.adam(
        loss_or_grads=loss, params=Wr_list, learning_rate=LR).items())
else:
    Wr_list = lasagne.layers.get_all_params(mlp, trainable=True)
    updates = lasagne.updates.adam(loss_or_grads=loss, params=Wr_list,