def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA, FOLD,
          FINTUNE_SNAPSHOT, FINTUNE_SCALE):
    # BN parameters
    batch_size = 97
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    # NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    # N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H, +H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    # LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    # LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x, train_y, val_x, val_y] = SFEW2.load_train_val()
    print(train_x.shape)
    print(train_y.shape)
    print(val_x.shape)
    print(val_y.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] / batch_size * batch_size) +
          ' / ' + str(batch_size))
    print('last training minibatch size should not be too small (except 0). '
          'try decreasing the batch_size rather than adding more minibatches.')
    print('minibatch size: ' + str(batch_size))
    print('suggested minibatch size: ' + str(
        math.ceil(float(train_x.shape[0]) / math.ceil(float(train_x.shape[0]) / 100))))

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    target = T.matrix('targets')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]), input_var=input)
    mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_in)

    for k in range(N_HIDDEN_LAYERS):

        # pretrain-finetune
        if (k == 0):
            # fixed num_units
            mlp = binary_net.DenseLayer(
                mlp, binary=binary, stochastic=stochastic, H=H, W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity, num_units=1500)
            # scale down the LR of the transferred dense layer
            print('scale down the LR of the transferred dense layer from',
                  str(mlp.W_LR_scale))
            mlp.W_LR_scale *= np.float32(FINTUNE_SCALE)
            print('to', str(mlp.W_LR_scale))
        else:
            mlp = binary_net.DenseLayer(
                mlp, binary=binary, stochastic=stochastic, H=H, W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity, num_units=NUM_UNITS)

        mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
        mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            if (FINTUNE_SNAPSHOT != 0):
                print('Load ./W-%d.npz' % FINTUNE_SNAPSHOT)
                with np.load('./W-%d.npz' % FINTUNE_SNAPSHOT) as f:
                    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
                    param_values = param_values[0:6]
                    lasagne.layers.set_all_param_values(mlp, param_values)

    mlp = binary_net.DenseLayer(
        mlp, binary=binary, stochastic=stochastic, H=H, W_LR_scale=W_LR_scale,
        nonlinearity=lasagne.nonlinearities.identity, num_units=7)
    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output: BN or SGN
    if OUTPUT_TYPE == 'C':
        pass
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)

    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x == -1)).mean(0)
    LOSS_WEIGHT_1 = 1. + input * (2. * SPARSITY - 1)
    LOSS_WEIGHT_1 /= 4 * SPARSITY * (1 - SPARSITY)  # fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1. + input * (2. * SPARSITY_MAP - 1)
    LOSS_WEIGHT_2 /= 4 * SPARSITY_MAP * (1 - SPARSITY_MAP)  # weights considering element's prior probability

    # train loss nodes
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE == 'SH':
        train_loss = T.mean(T.sqr(T.maximum(0., 1. - target * train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - target * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0., 1. - target * train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(T.maximum(0., (1. - target * train_output)) * LOSS_WEIGHT_2)
    else:
        assert (False)

    # + sparse penalty
    if LAMBDA > 0:
        train_pixel_wise_density = T.mean(T.reshape(
            (train_output + 1.) / 2.,
            [train_output.shape[0], train_output.shape[1] / 10, 10]), axis=2)
        train_penalty = LAMBDA * T.mean(T.sqr(train_pixel_wise_density - (1. - SPARSITY)))
    else:
        train_penalty = T.constant(0.)
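    # Descriptive note on the loss terms above: with targets and binary_tanh outputs both
    # in {-1, +1}, 'H' is the elementwise hinge loss mean(max(0, 1 - target * output)) and
    # 'SH' its squared variant; the 'W1*' / 'W2*' branches additionally multiply by the
    # LOSS_WEIGHT_* maps defined above (with SPARSITY = 0.9 that is weight 5 where the
    # input element is +1 and 5/9 where it is -1). The optional LAMBDA penalty pushes the
    # mean "+1 density" of each group of 10 output units toward 1 - SPARSITY = 0.1.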
    train_loss = train_loss + train_penalty

    # acc
    train_acc = T.mean(T.eq(T.argmax(train_output, axis=1), T.argmax(target, axis=1)),
                       dtype=theano.config.floatX)

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)
        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)
        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=train_loss, params=params, learning_rate=LR).items())
    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss, params=params,
                                       learning_rate=LR)

    # val loss nodes
    # must be created after the grad nodes
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE == 'SH':
        val_loss = T.mean(T.sqr(T.maximum(0., 1. - target * val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - target * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0., 1. - target * val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(T.maximum(0., (1. - target * val_output)) * LOSS_WEIGHT_2)

    # + sparse penalty
    if LAMBDA > 0:
        val_pixel_wise_density = T.mean(T.reshape(
            (val_output + 1.) / 2.,
            [val_output.shape[0], val_output.shape[1] / 10, 10]), axis=2)
        val_penalty = LAMBDA * T.mean(T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    # acc
    val_acc = T.mean(T.eq(T.argmax(val_output, axis=1), T.argmax(target, axis=1)),
                     dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving the
    # updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input, target, LR],
                               [train_loss, train_penalty, train_acc, train_output],
                               updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input, target],
                             [val_loss, val_penalty, val_acc, val_output])

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay, num_epochs,
                     train_x, train_y, val_x, val_y)
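
# A minimal, hypothetical invocation of the classifier trial above. The argument values are
# illustrative assumptions, not taken from this file; the only constraints visible in the
# code are OUTPUT_TYPE in {'C', 'D'}, MAIN_LOSS_TYPE in {'SH', 'W1SH', 'W2SH', 'H', 'W1H',
# 'W2H'}, and FINTUNE_SNAPSHOT == 0 to skip restoring ./W-<snapshot>.npz.
#
# if __name__ == '__main__':
#     trial(N_HIDDEN_LAYERS=2, NUM_UNITS=1500, OUTPUT_TYPE='D', MAIN_LOSS_TYPE='SH',
#           LAMBDA=0., FOLD=0, FINTUNE_SNAPSHOT=0, FINTUNE_SCALE=1.)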
def trial(N_HIDDEN_LAYERS, NUM_UNITS, OUTPUT_TYPE, MAIN_LOSS_TYPE, LAMBDA, FOLD):
    # BN parameters
    batch_size = 100
    print("batch_size = " + str(batch_size))
    # alpha is the exponential moving average factor
    # alpha = .15
    alpha = .1
    print("alpha = " + str(alpha))
    epsilon = 1e-4
    print("epsilon = " + str(epsilon))

    # MLP parameters
    # NUM_UNITS = 25
    print("NUM_UNITS = " + str(NUM_UNITS))
    # N_HIDDEN_LAYERS = 1
    print("N_HIDDEN_LAYERS = " + str(N_HIDDEN_LAYERS))

    # Training parameters
    num_epochs = 1000000
    print("num_epochs = " + str(num_epochs))

    # Dropout parameters
    dropout_in = .2  # 0. means no dropout
    print("dropout_in = " + str(dropout_in))
    dropout_hidden = .5
    print("dropout_hidden = " + str(dropout_hidden))

    # BinaryOut
    activation = binary_net.binary_tanh_unit
    print("activation = binary_net.binary_tanh_unit")
    # activation = binary_net.binary_sigmoid_unit
    # print("activation = binary_net.binary_sigmoid_unit")

    # BinaryConnect
    binary = True
    print("binary = " + str(binary))
    stochastic = False
    print("stochastic = " + str(stochastic))
    # (-H, +H) are the two binary values
    # H = "Glorot"
    H = 1.
    print("H = " + str(H))
    # W_LR_scale = 1.
    W_LR_scale = "Glorot"  # "Glorot" means we are using the coefficients from Glorot's paper
    print("W_LR_scale = " + str(W_LR_scale))

    # Decaying LR
    # LR_start = .003
    LR_start = 0.000003
    print("LR_start = " + str(LR_start))
    # LR_fin = 0.0000003
    LR_fin = LR_start
    print("LR_fin = " + str(LR_fin))
    LR_decay = (LR_fin / LR_start)**(1. / num_epochs)
    print("LR_decay = " + str(LR_decay))
    # BTW, LR decay might be good for the BN moving average...

    # replace the dataset
    print('Loading SFEW2 dataset...')
    [train_x] = SFEW2.load_lfw()
    assert (train_x.shape[0] == 26404)
    train_x = train_x[0:26400, :]
    [val_x, _, _, _] = SFEW2.load_train_val()
    print(train_x.shape)
    print(val_x.shape)
    print('last training minibatch size: ' +
          str(train_x.shape[0] - train_x.shape[0] / batch_size * batch_size) +
          ' / ' + str(batch_size))
    print('last training minibatch size should not be too small (except 0). '
          'try decreasing the batch_size rather than adding more minibatches.')
    print('minibatch size: ' + str(batch_size))
    print('suggested minibatch size: ' + str(
        math.ceil(float(train_x.shape[0]) / math.ceil(float(train_x.shape[0]) / 100))))

    ##############################################################################################

    print('Building the MLP...')

    # Prepare Theano variables for inputs and targets
    input = T.matrix('inputs')
    LR = T.scalar('LR', dtype=theano.config.floatX)

    mlp = lasagne.layers.InputLayer(shape=(None, train_x.shape[1]), input_var=input)
    mlp = lasagne.layers.DropoutLayer(mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer

    for k in range(N_HIDDEN_LAYERS):

        if (k == 0):
            mlp = binary_net.DenseLayer(
                mlp, binary=binary, stochastic=stochastic, H=H, W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity, num_units=NUM_UNITS)
        elif (k == 1):
            mlp = binary_net.DenseLayer(
                mlp, binary=binary, stochastic=stochastic, H=H, W_LR_scale=W_LR_scale,
                nonlinearity=lasagne.nonlinearities.identity, num_units=NUM_UNITS * 2)
        else:
            assert (False)

        # if (k == 0):
        #     print('scale down the LR of the transferred dense layer from', str(mlp.W_LR_scale))
        #     mlp.W_LR_scale = 0
        #     print('to', str(mlp.W_LR_scale))

        if (k == 0):
            # BAE-1 encoder: BN
            mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)
        elif (k == 1):
            # BAE-2 encoder: do not use BN, to encourage sparsity
            pass
        else:
            # further layers use BN
            mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

        # midactivation is taken before the hard tanh
        # encoder and decoder should not use BatchNorm
        # "l1 reg" on midactivation
        if (k == 1):
            mlp_midactivation = mlp

        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)

        if (k == 0):
            mlp = lasagne.layers.DropoutLayer(mlp, p=0)  # train BAE-2: no dropout on input & BAE-1 layer
        else:
            mlp = lasagne.layers.DropoutLayer(mlp, p=dropout_hidden)

        # pretrain-finetune
        # only restore the first layer group
        if (k == 0):
            print('Load ./W-1168.npz')
            with np.load('./W-1168.npz') as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
                param_values = param_values[0:6]
                lasagne.layers.set_all_param_values(mlp, param_values)
            mlp_groundtruth = mlp

    mlp = binary_net.DenseLayer(
        mlp, binary=binary, stochastic=stochastic, H=H, W_LR_scale=W_LR_scale,
        nonlinearity=lasagne.nonlinearities.identity, num_units=1500)
    mlp = lasagne.layers.BatchNormLayer(mlp, epsilon=epsilon, alpha=alpha)

    # network output: BN or SGN
    if OUTPUT_TYPE == 'C':
        pass
    elif OUTPUT_TYPE == 'D':
        mlp = lasagne.layers.NonlinearityLayer(mlp, nonlinearity=activation)
    else:
        assert (False)

    '''
    # equal transform validation
    # 1 set AE transform to I
    # 1 modify AE DenseLayer.get_output_for() to use W (0/1) instead of Wb (+1/-1)
    # 2 set encoder's dropout=0
    # 3 comment out encoder's and decoder's BatchNormLayer, modify set_all_param_values
    # will see train loss = 0
    pv = lasagne.layers.get_all_param_values(mlp)
    pv[2] = np.identity(1500, np.float64)
    pv[4] = np.identity(1500, np.float64)
    lasagne.layers.set_all_param_values(mlp, pv)
    '''

    '''
    # loss weight nodes
    SPARSITY = 0.9
    SPARSITY_MAP = (np.float32(train_x == -1)).mean(0)
    LOSS_WEIGHT_1 = 1. + input * (2. * SPARSITY - 1)
    LOSS_WEIGHT_1 /= 4 * SPARSITY * (1 - SPARSITY)  # fixed 1->-1:5 -1->1:5/9 weights
    LOSS_WEIGHT_2 = 1. + input * (2. * SPARSITY_MAP - 1)
    LOSS_WEIGHT_2 /= 4 * SPARSITY_MAP * (1 - SPARSITY_MAP)  # weights considering element's prior probability
    '''

    # train loss nodes
    '''
    train_output = lasagne.layers.get_output(mlp, deterministic=False)
    if MAIN_LOSS_TYPE == 'SH':
        train_loss = T.mean(T.sqr(T.maximum(0., 1. - input * train_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        train_loss = T.mean(T.sqr(T.maximum(0., (1. - input * train_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        train_loss = T.mean(T.maximum(0., 1. - input * train_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        train_loss = T.mean(T.maximum(0., (1. - input * train_output)) * LOSS_WEIGHT_2)
    else:
        assert (False)
    '''
    [train_output_mlp_groundtruth, train_output_mlp_midactivation,
     train_output] = lasagne.layers.get_output(
         [mlp_groundtruth, mlp_midactivation, mlp], deterministic=False)
    train_loss = T.mean(T.maximum(0., 1. - train_output_mlp_groundtruth * train_output))

    # + sparse penalty
    '''
    if LAMBDA > 0:
        train_pixel_wise_density = T.mean(T.reshape(
            (train_output + 1.) / 2.,
            [train_output.shape[0], train_output.shape[1] / 10, 10]), axis=2)
        train_penalty = LAMBDA * T.mean(T.sqr(train_pixel_wise_density - (1. - SPARSITY)))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty
    '''
    if LAMBDA > 0:
        train_penalty = LAMBDA * T.mean(T.maximum(0., 1. + train_output_mlp_midactivation))
    else:
        train_penalty = T.constant(0.)
    train_loss = train_loss + train_penalty

    # grad nodes
    if binary:
        # W updates
        W = lasagne.layers.get_all_params(mlp, binary=True)
        W_grads = binary_net.compute_grads(train_loss, mlp)

        # untrainable W1
        assert (len(W) == 3)
        assert (len(W_grads) == 3)
        W = W[1:len(W)]
        W_grads = W_grads[1:len(W_grads)]
        assert (len(W) == 2)
        assert (len(W_grads) == 2)

        updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR)
        updates = binary_net.clipping_scaling(updates, mlp)

        # other parameters updates
        params = lasagne.layers.get_all_params(mlp, trainable=True, binary=False)

        # untrainable b1 bn1
        assert (len(params) == 7)
        assert (params[0].name == 'b')  # fixed
        assert (params[1].name == 'beta')  # fixed
        assert (params[2].name == 'gamma')  # fixed
        assert (params[3].name == 'b')
        assert (params[4].name == 'b')
        assert (params[5].name == 'beta')
        assert (params[6].name == 'gamma')
        params = params[3:len(params)]
        assert (len(params) == 4)

        updates = OrderedDict(updates.items() + lasagne.updates.adam(
            loss_or_grads=train_loss, params=params, learning_rate=LR).items())
    else:
        params = lasagne.layers.get_all_params(mlp, trainable=True)
        updates = lasagne.updates.adam(loss_or_grads=train_loss, params=params,
                                       learning_rate=LR)

    ##############################################################################################

    # val loss nodes
    # must be created after the grad nodes
    '''
    val_output = lasagne.layers.get_output(mlp, deterministic=True)
    if MAIN_LOSS_TYPE == 'SH':
        val_loss = T.mean(T.sqr(T.maximum(0., 1. - input * val_output)))
    elif MAIN_LOSS_TYPE == 'W1SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2SH':
        val_loss = T.mean(T.sqr(T.maximum(0., (1. - input * val_output))) * LOSS_WEIGHT_2)
    elif MAIN_LOSS_TYPE == 'H':
        val_loss = T.mean(T.maximum(0., 1. - input * val_output))
    elif MAIN_LOSS_TYPE == 'W1H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_1)
    elif MAIN_LOSS_TYPE == 'W2H':
        val_loss = T.mean(T.maximum(0., (1. - input * val_output)) * LOSS_WEIGHT_2)
    '''
    [val_output_mlp_groundtruth, val_output_mlp_midactivation,
     val_output] = lasagne.layers.get_output(
         [mlp_groundtruth, mlp_midactivation, mlp], deterministic=True)
    val_loss = T.mean(T.maximum(0., 1. - val_output_mlp_groundtruth * val_output))

    # + sparse penalty
    '''
    if LAMBDA > 0:
        val_pixel_wise_density = T.mean(T.reshape(
            (val_output + 1.) / 2.,
            [val_output.shape[0], val_output.shape[1] / 10, 10]), axis=2)
        val_penalty = LAMBDA * T.mean(T.sqr(val_pixel_wise_density - (1. - SPARSITY)))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty
    '''
    if LAMBDA > 0:
        val_penalty = LAMBDA * T.mean(T.maximum(0., 1. + val_output_mlp_midactivation))
    else:
        val_penalty = T.constant(0.)
    val_loss = val_loss + val_penalty

    ##############################################################################################

    # Compile a function performing a training step on a mini-batch (by giving the
    # updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function(
        [input, LR],
        [train_loss, train_penalty, train_output_mlp_groundtruth,
         train_output_mlp_midactivation, train_output],
        updates=updates)

    ##############################################################################################

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function(
        [input],
        [val_loss, val_penalty, val_output_mlp_groundtruth,
         val_output_mlp_midactivation, val_output])

    ##############################################################################################

    print('Training...')
    train_x = binary_net.MoveParameter(train_x)
    binary_net.train(train_fn, val_fn, batch_size, LR_start, LR_decay, num_epochs,
                     train_x, val_x, mlp)

    print('Save W')
    np.savez('./W.npz', *lasagne.layers.get_all_param_values(mlp))  # W b BN BN BN BN W b BN BN BN BN
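
# A minimal, hypothetical invocation of the BAE-2 trial above; the values are illustrative
# assumptions, not taken from this file. The asserts in the layer loop only accept k == 0
# or k == 1, and the elementwise product between the restored first layer group
# (mlp_groundtruth) and the 1500-unit decoder output implies NUM_UNITS == 1500.
# MAIN_LOSS_TYPE is only referenced in the commented-out blocks here, so its value is moot.
#
# if __name__ == '__main__':
#     trial(N_HIDDEN_LAYERS=2, NUM_UNITS=1500, OUTPUT_TYPE='D', MAIN_LOSS_TYPE='H',
#           LAMBDA=1., FOLD=0)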