def test_get_all_params(self):
    from lasagne.layers import (InputLayer, DenseLayer, count_params)

    l1 = InputLayer((10, 20))
    l2 = DenseLayer(l1, 30)
    l3 = DenseLayer(l2, 40)

    num_weights = 20 * 30 + 30 * 40
    num_biases = 30 + 40

    assert count_params(l3, regularizable=True) == num_weights
    assert count_params(l3, regularizable=False) == num_biases
    assert count_params(l3) == num_weights + num_biases
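# NOTE: a minimal sketch (not part of the original test suite) showing what the
# counts above are summing over. count_params() filters by parameter tags --
# weight matrices are tagged `regularizable`, bias vectors are not -- and
# get_all_params() exposes the underlying shared variables for the same tags.
# The layer sizes are the same assumed ones as in the test.
import numpy as np
from lasagne.layers import InputLayer, DenseLayer, get_all_params, count_params

l_in = InputLayer((10, 20))
l_hid = DenseLayer(l_in, 30)
l_out = DenseLayer(l_hid, 40)

for tags in ({}, {'regularizable': True}, {'regularizable': False}):
    params = get_all_params(l_out, **tags)
    shapes = [p.get_value().shape for p in params]
    total = sum(int(np.prod(s)) for s in shapes)
    print('%s -> shapes %s, total %d' % (tags, shapes, total))

# The untagged total matches count_params(l_out).
assert sum(int(np.prod(p.get_value().shape))
           for p in get_all_params(l_out)) == count_params(l_out)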
def buildModel():
    # this is our input layer with the inputs (None, dimensions, width, height)
    l_input = layers.InputLayer((None, 3, 64, 64))

    # first convolutional layer, has l_input layer as incoming and is followed by a pooling layer
    l_conv1 = layers.Conv2DLayer(l_input, num_filters=32, filter_size=3, pad='same', nonlinearity=tanh)
    l_pool1 = layers.MaxPool2DLayer(l_conv1, pool_size=2)

    # second convolution (l_pool1 is incoming), let's increase the number of filters
    l_conv2 = layers.Conv2DLayer(l_pool1, num_filters=64, filter_size=3, pad='same', nonlinearity=tanh)
    l_pool2 = layers.MaxPool2DLayer(l_conv2, pool_size=2)

    # third convolution (l_pool2 is incoming), even more filters
    l_conv3 = layers.Conv2DLayer(l_pool2, num_filters=128, filter_size=3, pad='same', nonlinearity=tanh)
    l_pool3 = layers.MaxPool2DLayer(l_conv3, pool_size=2)

    # fourth and final convolution
    l_conv4 = layers.Conv2DLayer(l_pool3, num_filters=256, filter_size=3, pad='same', nonlinearity=tanh)
    l_pool4 = layers.MaxPool2DLayer(l_conv4, pool_size=2)

    # our cnn contains 3 dense layers, one of them is our output layer
    l_dense1 = layers.DenseLayer(l_pool4, num_units=128, nonlinearity=tanh)
    l_dense2 = layers.DenseLayer(l_dense1, num_units=128, nonlinearity=tanh)

    # the output layer has 6 units, which is exactly the count of our class labels
    # it has a softmax activation function; its values represent class probabilities
    l_output = layers.DenseLayer(l_dense2, num_units=6, nonlinearity=softmax)

    # let's see how many params our net has
    print("MODEL HAS " + str(layers.count_params(l_output)) + " PARAMS")

    # we return the layer stack as our network by returning the last layer
    return l_output
def buildModel(mtype=1):
    print "BUILDING MODEL TYPE", mtype, "..."

    # default settings (Model 1)
    filters = 64
    first_stride = 2
    last_filter_multiplier = 16

    # specific model type settings (see working notes for details)
    if mtype == 2:
        first_stride = 1
    elif mtype == 3:
        filters = 32
        last_filter_multiplier = 8

    # input layer
    net = l.InputLayer((None, IM_DIM, IM_SIZE[1], IM_SIZE[0]))

    # conv layers
    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters, filter_size=7, pad='same', stride=first_stride, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    if mtype == 2:
        net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters, filter_size=5, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
        net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 2, filter_size=5, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 4, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 8, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * last_filter_multiplier, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    print "\tFINAL POOL OUT SHAPE:", l.get_output_shape(net)

    # dense layers
    net = l.batch_norm(l.DenseLayer(net, 512, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.batch_norm(l.DenseLayer(net, 512, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))

    # Classification Layer
    if MULTI_LABEL:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.sigmoid, W=init.HeNormal(gain=1))
    else:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.softmax, W=init.HeNormal(gain=1))

    print "...DONE!"

    # model stats
    print "MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS"
    print "MODEL HAS", l.count_params(net), "PARAMS"

    return net
def build_pi_model():
    log.i('BUILDING RASPBERRY PI MODEL...')

    # Random Seed
    lasagne_random.set_rng(cfg.getRandomState())

    # Input layer for images
    net = l.InputLayer((None, cfg.IM_DIM, cfg.IM_SIZE[1], cfg.IM_SIZE[0]))

    # Convolutional layer groups
    for i in range(len(cfg.FILTERS)):
        # 3x3 Convolution + Stride
        net = batch_norm(
            l.Conv2DLayer(net,
                          num_filters=cfg.FILTERS[i],
                          filter_size=cfg.KERNEL_SIZES[i],
                          num_groups=cfg.NUM_OF_GROUPS[i],
                          pad='same',
                          stride=2,
                          W=initialization(cfg.NONLINEARITY),
                          nonlinearity=nonlinearity(cfg.NONLINEARITY)))
        log.i(('\tGROUP', i + 1, 'OUT SHAPE:', l.get_output_shape(net)))

    # Fully connected layers + dropout layers
    net = l.DenseLayer(net, cfg.DENSE_UNITS, nonlinearity=nonlinearity(cfg.NONLINEARITY), W=initialization(cfg.NONLINEARITY))
    net = l.DropoutLayer(net, p=cfg.DROPOUT)

    net = l.DenseLayer(net, cfg.DENSE_UNITS, nonlinearity=nonlinearity(cfg.NONLINEARITY), W=initialization(cfg.NONLINEARITY))
    net = l.DropoutLayer(net, p=cfg.DROPOUT)

    # Classification Layer (Softmax)
    net = l.DenseLayer(net, len(cfg.CLASSES), nonlinearity=nonlinearity('softmax'), W=initialization('softmax'))

    log.i(("\tFINAL NET OUT SHAPE:", l.get_output_shape(net)))
    log.i("...DONE!")

    # Model stats
    log.i(("MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS"))
    log.i(("MODEL HAS", l.count_params(net), "PARAMS"))

    return net
def buildModel():
    print "BUILDING MODEL TYPE..."

    # default settings
    filters = 32
    first_stride = 2
    last_filter_multiplier = 4

    # input layer
    net = l.InputLayer((None, IM_DIM, IM_SIZE[1], IM_SIZE[0]))

    # conv layers
    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters, filter_size=7, pad='same', stride=first_stride, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 2, filter_size=5, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 4, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 8, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 16, filter_size=3, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.MaxPool2DLayer(net, pool_size=2)

    #net = l.batch_norm(l.Conv2DLayer(net, num_filters=filters * 32, filter_size=7, pad='same', stride=1, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    #net = l.MaxPool2DLayer(net, pool_size=2)

    #print "\tFINAL POOL OUT SHAPE:", l.get_output_shape(net)

    # dense layers
    net = l.batch_norm(l.DenseLayer(net, 256, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.DropoutLayer(net, DROPOUT)
    net = l.batch_norm(l.DenseLayer(net, 256, W=init.HeNormal(gain=INIT_GAIN), nonlinearity=NONLINEARITY))
    net = l.DropoutLayer(net, DROPOUT)

    # Classification Layer
    if MULTI_LABEL:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.sigmoid, W=init.HeNormal(gain=1))
    else:
        net = l.DenseLayer(net, NUM_CLASSES, nonlinearity=nonlinearities.sigmoid, W=init.HeNormal(gain=1))

    print "...DONE!"

    # model stats
    print "MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS"
    print "MODEL HAS", l.count_params(net), "PARAMS"

    return net
def print_layers(l_out):
    all_layers = layers.get_all_layers(l_out)
    print('this network has %d learnable parameters' %
          (layers.count_params(l_out)))
    for layer in all_layers:
        if hasattr(layer, 'W') and hasattr(layer, 'b'):
            num_params = np.prod(layer.W.get_value().shape) + \
                np.prod(layer.b.get_value().shape)
            print('layer %s has output shape %r with %d parameters' %
                  (layer.name, layer.output_shape, num_params))
        else:
            print('layer %s has output shape %r' %
                  (layer.name, layer.output_shape))
def train_setup():

    x = T.tensor3('input')
    y = T.matrix('output')

    encoding, decoding = cnn(x, config.input_length, config.output_length,
                             config.encoding_length)

    print 'Number of Parameters {0}'.format(count_params(decoding))

    if config.init_model is not None:
        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        set_all_param_values(decoding, param_values)

    # training tasks in sequence
    prediction = get_output(decoding)
    error = squared_error(y, prediction)
    error = error.mean()

    l1_norm = config.l1_weight * regularize_network_params(decoding, l1)
    l2_norm = config.l2_weight * regularize_network_params(decoding, l2)
    total_error = error + l1_norm + l2_norm

    params = get_all_params(decoding, trainable=True)
    updates = adadelta(total_error, params, config.learning_rate,
                       config.rho, config.eps)

    train_fn = function([x, y], [error, l1_norm, l2_norm],
                        updates=updates,
                        allow_input_downcast=True)

    val_prediction = get_output(decoding, deterministic=True)
    val_error = squared_error(y, val_prediction)
    val_error = val_error.mean()

    val_fn = function([x, y], val_error, allow_input_downcast=True)

    return encoding, decoding, train_fn, val_fn
def train_setup():

    x = T.tensor3('input')
    y = T.lvector('output')

    network = cnn(x, config.input_length, config.output_length)

    print 'Number of Parameters {0}'.format(count_params(network))

    if config.init_model is not None:
        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        set_all_param_values(network, param_values)

    # training tasks in sequence
    prediction = get_output(network)
    ent = categorical_crossentropy(prediction, y)
    ent = ent.mean()

    l1_norm = config.l1_weight * regularize_network_params(network, l1)
    l2_norm = config.l2_weight * regularize_network_params(network, l2)
    total_error = ent + l1_norm + l2_norm

    params = get_all_params(network, trainable=True)
    updates = adadelta(total_error, params, config.learning_rate,
                       config.rho, config.eps)

    train_fn = function([x, y], [ent, l1_norm, l2_norm, prediction],
                        updates=updates,
                        allow_input_downcast=True)

    val_prediction = get_output(network, deterministic=True)
    val_ent = categorical_crossentropy(val_prediction, y)
    val_ent = val_ent.mean()

    val_fn = function([x, y], [val_ent, val_prediction],
                      allow_input_downcast=True)

    return network, train_fn, val_fn
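# NOTE: a minimal sketch (not part of the original training code). Both
# train_setup() variants above restore weights from an .npz file whose entries
# are named 'arr_0', 'arr_1', ... -- that is exactly what numpy.savez() produces
# when the parameter arrays are passed positionally, so a compatible checkpoint
# can be written like this:
import numpy as np
from lasagne.layers import get_all_param_values

def save_checkpoint(network, path):
    # get_all_param_values() returns the parameter arrays in a fixed order,
    # so set_all_param_values() can restore them from the same ordering.
    np.savez(path, *get_all_param_values(network))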
def main(config=None, init_path='', out_path='', batchsize=64,
         dataset='C10', n=31, growth=40, bottleneck=True, neck_size=None,
         compression=1, dropout=0):
    # network
    assert dataset in ('C10', 'C100', 'SVHN')
    classes = 100 if dataset == 'C100' else 10
    model = DenseNet.cifar_model(n=n, growth=growth, bottleneck=bottleneck,
                                 neck_size=neck_size, compression=compression,
                                 dropout=dropout, classes=classes)

    # trainer
    if dataset == 'SVHN':
        trainer_cls = SVHN_DenseNetTrainer
    else:
        trainer_cls = CIFAR_DenseNetTrainer

    if init_path:
        trainer = trainer_cls.load_state(model, init_path, batchsize=batchsize)
    else:
        trainer = trainer_cls(model, batchsize=batchsize)

    # dataset
    if not trainer.dataset:
        if dataset == 'C10':
            trainer.dataset = CIFAR10(testsplit=0.1)
        elif dataset == 'C100':
            trainer.dataset = CIFAR100(testsplit=0.1)
        else:
            raise NotImplementedError(
                'The SVHN dataset is not yet implemented.')

    # training the network
    print('Training model ({} parameters) ...'.format(
        count_params(model, trainable=True)))
    trainer.train(config)

    # save the network, the updates and the journal
    if not out_path:
        _, acc = trainer.validate()
        date = datetime.now().strftime('%Y-%m-%d_%H:%M')
        bn_str = 'bottleneck' if bottleneck else 'no_bottleneck'
        tmpl = 'densenet-{}_-_n_{}_-_k_{}_-_{}_-_t_{:.2f}_-_acc_{:.2f}_{}'
        out_path = tmpl.format(dataset, n, growth, bn_str, compression,
                               acc * 100, date)
    trainer.save_state(out_path, resume=True)
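# NOTE: a small illustrative sketch (not from the original trainer). Passing
# trainable=True to count_params(), as above, excludes parameters the optimizer
# never updates -- for example the running mean/inv_std statistics added by
# lasagne.layers.batch_norm -- so the printed number covers optimized
# parameters only. A toy stack (assumed sizes) makes the difference visible:
from lasagne.layers import InputLayer, DenseLayer, batch_norm, count_params

l = batch_norm(DenseLayer(InputLayer((None, 20)), 30))
print(count_params(l))                  # includes the running mean / inv_std
print(count_params(l, trainable=True))  # only parameters the optimizer updates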
def test_setup():
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    print(" with input dimension {0},{1},{2}".format(config.image_height,
                                                     config.image_width,
                                                     config.image_channel))
    network = cnn_archi(input_var,
                        config.image_channel,
                        config.image_height, config.image_width,
                        config.output_length)
    print('Number of parameters : {0}'.format(count_params(network)))

    with np.load(config.model_file) as f:
        param_values = [f['arr_%d' % i] for i in range(len(f.files))]
    set_all_param_values(network, param_values)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_classes = T.argmax(test_prediction, axis=1)
    test_loss = categorical_crossentropy(test_prediction, target_var)

    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.eq(test_classes, target_var)

    # Compile a function computing the test loss, predictions and accuracy:
    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_prediction, test_acc],
                             allow_input_downcast=True)

    return val_fn
def main(options): print 'Build and compile network' input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network_outputs = build_network( input_data=input_data, input_mask=input_mask, num_inputs=options['num_inputs'], num_outputs=options['num_outputs'], num_inner_units_list=options['num_inner_units_list'], num_outer_units_list=options['num_outer_units_list'], use_peepholes=options['use_peepholes'], use_layer_norm=options['use_layer_norm'], learn_init=options['learn_init'], grad_clipping=options['grad_clip']) network = network_outputs[-1] inner_loop_layers = network_outputs[:-1] network_params = get_all_params(network, trainable=True) print("number of parameters in model: %d" % count_params(network, trainable=True)) if options['reload_model']: print('Loading Parameters...') [ pretrain_network_params_val, pretrain_update_params_val, pretrain_total_batch_cnt ] = pickle.load(open(options['reload_model'], 'rb')) print('Applying Parameters...') set_model_param_value(network_params, pretrain_network_params_val) else: pretrain_update_params_val = None pretrain_total_batch_cnt = 0 print 'Build network trainer' train_lr = theano.shared(convert_to_floatX(options['lr'])) training_fn, trainer_params = set_network_trainer( input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network, inner_loop_layers=inner_loop_layers, updater=options['updater'], learning_rate=train_lr, grad_max_norm=options['grad_norm'], l2_lambda=options['l2_lambda'], load_updater_params=pretrain_update_params_val) print 'Build network predictor' predict_fn = set_network_predictor(input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network) # evaluation if options['reload_model']: train_eval_datastream = get_datastream( path=options['data_path'], norm_path=options['norm_data_path'], which_set='train_si84', batch_size=options['eval_batch_size']) valid_eval_datastream = get_datastream( path=options['data_path'], norm_path=options['norm_data_path'], which_set='test_dev93', batch_size=options['eval_batch_size']) train_nll, train_bpc, train_fer = network_evaluation( predict_fn, train_eval_datastream) valid_nll, valid_bpc, valid_fer = network_evaluation( predict_fn, valid_eval_datastream) print '=======================================================' print 'Train NLL: ', str(train_nll), ', FER: ', str(train_fer) print 'Valid NLL: ', str(valid_nll), ', FER: ', str(valid_fer) print '=======================================================' print 'Load data stream' train_datastream = get_datastream(path=options['data_path'], norm_path=options['norm_data_path'], which_set='train_si84', batch_size=options['batch_size']) print 'Start training' if os.path.exists(options['save_path'] + '_eval_history.npz'): evaluation_history = numpy.load( options['save_path'] + '_eval_history.npz')['eval_history'].tolist() else: evaluation_history = [[[100.0, 100.0, 1.0], [100.0, 100.0, 1.0]]] total_batch_cnt = 0 start_time = time.time() try: # for each epoch for e_idx in range(options['num_epochs']): # for each batch for b_idx, data in enumerate( train_datastream.get_epoch_iterator()): total_batch_cnt += 1 if pretrain_total_batch_cnt >= total_batch_cnt: continue # get input, target data input_data = data[0].astype(floatX) input_mask = data[1].astype(floatX) # get target data target_data = 
data[2] target_mask = data[3].astype(floatX) # get output train_output = training_fn(input_data, input_mask, target_data, target_mask) train_predict_cost = train_output[0] network_grads_norm = train_output[1] train_sf_cost0 = train_output[2] train_sf_cost1 = train_output[3] train_sf_cost2 = train_output[4] print('=====================================================') print(total_batch_cnt, train_predict_cost, network_grads_norm) print(train_sf_cost0, train_sf_cost1, train_sf_cost2) if numpy.isnan(train_predict_cost) or numpy.isnan( network_grads_norm): print('update cnt: ', total_batch_cnt) print('NaN detected: ', train_predict_cost, network_grads_norm) raw_input() # show intermediate result if total_batch_cnt % options[ 'train_disp_freq'] == 0 and total_batch_cnt != 0: best_idx = numpy.asarray(evaluation_history)[:, 1, 2].argmin() print '============================================================================================' print 'Model Name: ', options['save_path'].split('/')[-1] print '============================================================================================' print 'Epoch: ', str(e_idx), ', Update: ', str( total_batch_cnt), ', Time: ', str(time.time() - start_time) print '--------------------------------------------------------------------------------------------' print 'Prediction Cost: ', str(train_predict_cost) print 'Gradient Norm: ', str(network_grads_norm) print '--------------------------------------------------------------------------------------------' print 'Learn Rate: ', str(train_lr.get_value()) print '--------------------------------------------------------------------------------------------' print 'Train NLL: ', str( evaluation_history[-1][0][0]), ', BPC: ', str( evaluation_history[-1][0][1]), ', FER: ', str( evaluation_history[-1][0][2]) print 'Valid NLL: ', str( evaluation_history[-1][1][0]), ', BPC: ', str( evaluation_history[-1][1][1]), ', FER: ', str( evaluation_history[-1][1][2]) print '--------------------------------------------------------------------------------------------' print 'Best NLL: ', str( evaluation_history[best_idx][1][0]), ', BPC: ', str( evaluation_history[best_idx][1] [1]), ', FER: ', str( evaluation_history[best_idx][1][2]) start_time = time.time() # # evaluation # if total_batch_cnt%options['train_eval_freq'] == 0 and total_batch_cnt!=0: # train_eval_datastream = get_datastream(path=options['data_path'], # norm_path=options['norm_data_path'], # which_set='train_si84', # batch_size=options['eval_batch_size']) # valid_eval_datastream = get_datastream(path=options['data_path'], # norm_path=options['norm_data_path'], # which_set='test_dev93', # batch_size=options['eval_batch_size']) # train_nll, train_bpc, train_fer = network_evaluation(predict_fn, # train_eval_datastream) # valid_nll, valid_bpc, valid_fer = network_evaluation(predict_fn, # valid_eval_datastream) # # # check over-fitting # if valid_fer<numpy.asarray(evaluation_history)[:, 1, 2].min(): # best_network_params_vals = get_model_param_values(network_params) # pickle.dump(best_network_params_vals, # open(options['save_path'] + '_best_model.pkl', 'wb')) # # # save results # evaluation_history.append([[train_nll, train_bpc, train_fer], # [valid_nll, valid_bpc, valid_fer]]) # numpy.savez(options['save_path'] + '_eval_history', # eval_history=evaluation_history) # save network if total_batch_cnt % options[ 'train_save_freq'] == 0 and total_batch_cnt != 0: cur_network_params_val = get_model_param_values( network_params) cur_trainer_params_val = 
get_update_params_values( trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([ cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt ], open( options['save_path'] + str(total_batch_cnt).zfill(10) + '_model.pkl', 'wb')) except KeyboardInterrupt: print 'Training Interrupted' cur_network_params_val = get_model_param_values(network_params) cur_trainer_params_val = get_update_params_values(trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([ cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt ], open(options['save_path'] + '_last_model.pkl', 'wb'))
def main(): # BN parameters batch_size = 100 logger_lip.info("batch_size = %s", batch_size) # alpha is the exponential moving average factor alpha = .1 logger_lip.info("alpha = %s", alpha) epsilon = 1e-4 logger_lip.info("epsilon = %s", epsilon) # BinaryOut activation = binary_net.binary_tanh_unit print("activation = binary_tanh_unit") stochastic = True print("stochastic = " + str(stochastic)) # (-H,+H) are the two binary values #H = "Glorot" H = 1. print("H = " + str(H)) # W_LR_scale = 1. W_LR_scale = "Glorot" # "Glorot" means we are using the coefficients from Glorot's paper print("W_LR_scale = " + str(W_LR_scale)) # Training parameters num_epochs = 50 logger_lip.info("num_epochs = %s", num_epochs) # Decaying LR LR_start = 0.1 logger_lip.info("LR_start = %s", LR_start) LR_fin = 0.0000003 logger_lip.info("LR_fin = %s", LR_fin) # LR_decay = (LR_fin / LR_start) ** (1. / num_epochs) LR_decay = 0.5 # sqrt(0.5) logger_lip.info("LR_decay = %s", LR_decay) # BTW, LR decay might good for the BN moving average... shuffle_parts = 1 logger_lip.info("shuffle_parts = %s", shuffle_parts) if binary: oneHot = True else: oneHot = False ############################################## network_type = "google" viseme = False # will set nbClasses and store path vis: 6.498.828 phn: 7.176.231 if viseme: nbClasses = 12 else: nbClasses = 39 # get the database # If it's small (lipspeakers) -> generate X_train, y_train etc here # otherwise we need to load and generate each speaker seperately in the training loop dataset = "TCDTIMIT" root_dir = os.path.join( os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset)) results_dir = root_dir + "/results/CNN_binaryNet" if not os.path.exists(results_dir): os.makedirs(results_dir) if viseme: database_binaryDir = root_dir + '/binaryViseme' else: database_binaryDir = root_dir + '/binary' datasetType = "lipspeakers" # "lipspeakers" #"volunteers" #"volunteers" # lipspeakers or volunteers" ############################################## if datasetType == "lipspeakers": loadPerSpeaker = False # only lipspeakers small enough to fit in CPU RAM, generate X_train etc here storeProcessed = True processedDir = database_binaryDir + "_allLipspeakersProcessed" # TODO: prepLip_all can be used to generate pkl containing all the lipspeaker data. Not sure if this stil works, so use with care! if not oneHot: pkl_path = processedDir + os.sep + datasetType + ".pkl" else: pkl_path = processedDir + os.sep + datasetType + "_oneHot" + ".pkl" if not os.path.exists(pkl_path): logger_lip.info("dataset not yet processed. Processing...") code.lipreading.preprocessLipreading.prepLip_all( data_path=database_binaryDir, store_path=pkl_path, trainFraction=0.7, validFraction=0.1, testFraction=0.2, nbClasses=nbClasses, onehot=oneHot, type=datasetType, verbose=True) datasetFiles = code.lipreading.general_tools.unpickle(pkl_path) X_train, y_train, X_val, y_val, X_test, y_test = datasetFiles dtypeX = 'float32' dtypeY = 'float32' X_train = X_train.astype(dtypeX) y_train = y_train.astype(dtypeY) X_val = X_val.astype(dtypeX) y_val = y_val.astype(dtypeY) X_test = X_test.astype(dtypeX) y_test = y_test.astype(dtypeY) datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test] # These files have been generated with datasetToPkl_fromCombined, so that the train/val/test set are the same as for combinedSR. 
# X_train, y_train = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTrain.pkl")) # X_val, y_val = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersVal.pkl")) # X_test, y_test = unpickle(os.path.expanduser("~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTest.pkl")) # datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test] else: # we need to load and preprocess each speaker before we evaluate, because dataset is too large and doesn't fit in CPU RAM loadPerSpeaker = True storeProcessed = True # if you have about 10GB hdd space, you can increase the speed by not reprocessing it each iteration processedDir = database_binaryDir + "_finalProcessed" # you can just run this program and it will generate the files the first time it encounters them, or generate them manually with datasetToPkl.py # just get the names testVolunteerNumbers = [ "13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F", "31F", "34M", "36F", "37F", "43F", "47M", "51F", "54M" ] testVolunteers = [ str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers ] lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"] allSpeakers = [ f for f in os.listdir(database_binaryDir) if os.path.isfile(os.path.join(database_binaryDir, f)) and os.path.splitext(f)[1] == ".pkl" ] trainVolunteers = [ f for f in allSpeakers if not (f in testVolunteers or f in lipspeakers) ] trainVolunteers = [vol for vol in trainVolunteers if vol is not None] if datasetType == "combined": trainingSpeakerFiles = trainVolunteers + lipspeakers testSpeakerFiles = testVolunteers elif datasetType == "volunteers": trainingSpeakerFiles = trainVolunteers testSpeakerFiles = testVolunteers else: raise Exception("invalid dataset entered") datasetFiles = [trainingSpeakerFiles, testSpeakerFiles] model_name = datasetType + "_" + network_type + "_" + ("viseme" if viseme else "phoneme") + str(nbClasses) \ + ("_binary" if binary else "") model_save_name = os.path.join(results_dir, model_name) # log file logFile = results_dir + os.sep + model_name + '.log' # if os.path.exists(logFile): # fh = logging.FileHandler(logFileT) # append to existing log # else: fh = logging.FileHandler(logFile, 'w') # create new logFile fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logger_lip.addHandler(fh) logger_lip.info('Building the CNN...') # Prepare Theano variables for inputs and targets inputs = T.tensor4('inputs') if oneHot: targets = T.matrix('targets') else: targets = T.ivector('targets') LR = T.scalar('LR', dtype=theano.config.floatX) # get the network structure l_out = code.lipreading.buildNetworks.build_network_google_binary( activation, alpha, epsilon, inputs, binary, stochastic, H, W_LR_scale) # 7176231 params for layer in L.get_all_layers(l_out): print(layer) # print het amount of network parameters logger_lip.info("Using the %s network", network_type) logger_lip.info("The number of parameters of this network: %s", L.count_params(l_out)) logger_lip.info("loading %s", model_save_name + '.npz') load_model(model_save_name + '.npz', l_out) logger_lip.info("* COMPILING FUNCTIONS...") train_output = lasagne.layers.get_output(l_out, deterministic=False) # squared hinge loss loss = T.mean(T.sqr(T.maximum(0., 1. 
- targets * train_output))) # W updates W = lasagne.layers.get_all_params(l_out, binary=True) W_grads = binary_net.compute_grads(loss, l_out) updates = lasagne.updates.adam(loss_or_grads=W_grads, params=W, learning_rate=LR) updates = binary_net.clipping_scaling(updates, l_out) # other parameters updates params = lasagne.layers.get_all_params(l_out, trainable=True, binary=False) updates = OrderedDict(updates.items() + lasagne.updates.adam( loss_or_grads=loss, params=params, learning_rate=LR).items()) test_output = lasagne.layers.get_output(l_out, deterministic=True) out_fn = theano.function([inputs], test_output) test_loss = T.mean(T.sqr(T.maximum(0., 1. - targets * test_output))) test_acc = T.mean(T.eq(T.argmax(test_output, axis=1), T.argmax(targets, axis=1)), dtype=theano.config.floatX) k = 3 test_top3_acc = T.zeros((1, )) topk_acc_fn = theano.function([], test_top3_acc) val_fn = theano.function([inputs, targets], [test_loss, test_acc, test_top3_acc]) if debug: nb = 3 debugX = X_train[0:nb] debugY = y_train[0:nb] out = out_fn(debugX) val = val_fn(debugX, debugY) import pdb pdb.set_trace() # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) # and returning the corresponding training loss: train_fn = theano.function([inputs, targets, LR], loss, updates=updates) logger_lip.info('Training...') import code.lipreading.train_lipreading code.lipreading.train_lipreading.train( train_fn=train_fn, val_fn=val_fn, out_fn=out_fn, topk_acc_fn=topk_acc_fn, k=k, network_output_layer=l_out, batch_size=batch_size, LR_start=LR_start, LR_decay=LR_decay, num_epochs=num_epochs, dataset=datasetFiles, database_binaryDir=database_binaryDir, storeProcessed=storeProcessed, processedDir=processedDir, loadPerSpeaker=loadPerSpeaker, justTest=justTest, save_name=model_save_name, shuffleEnabled=True)
# print 'extract input var \n'
# X = get_all_layers(network)[0].input_var

#####################################################
# # VGG run
# net = build_model_vanila_CNN(X=X, channel=n_ch, stride=1)
# network = net['prob']
#####################################################

# FULLCCN run
network = build_CNN_nopool(in_shape=(None, n_ch, 96, 96),
                           num_filter=[64, 64, 128, 128, 128, 128],
                           fil_size=[3, 1, 3, 3, 3, 12],
                           strides=[1, 1, 2, 2, 2, 1],
                           num_out=30,
                           nlin_func=rectify,
                           in_var=X)

print "num_params", count_params(network)

#####################################################
train_fn, val_fn = build_update_functions(train_set_x=train_set_x, train_set_y=train_set_y,
                                          valid_set_x=valid_set_x, valid_set_y=valid_set_y,
                                          y=y, X=X, network=network,
                                          val_MASK=val_MASK, train_MASK=train_MASK,
                                          learning_rate=learn_rate, batch_size=batch_size,
                                          l2_reg=l2)
print 'compile done successfully \n'

# call early_stop_train function
early_stop_train(train_set_x, train_set_y,
                 valid_set_x, valid_set_y,
                 network, train_fn, val_fn,
                 batch_size=batch_size)
def main(): for batch_size, network_type in zip(batch_sizes, networks): print(batch_size, network_type) # BN parameters # batch_size = 100 logger_lip.info("batch_size = %s", batch_size) # alpha is the exponential moving average factor alpha = .1 logger_lip.info("alpha = %s", alpha) epsilon = 1e-4 logger_lip.info("epsilon = %s", epsilon) # activation activation = T.nnet.relu logger_lip.info("activation = T.nnet.relu") # Training parameters num_epochs = 20 logger_lip.info("num_epochs = %s", num_epochs) # Decaying LR LR_start = 0.001 logger_lip.info("LR_start = %s", LR_start) LR_fin = 0.0000003 logger_lip.info("LR_fin = %s", LR_fin) #LR_decay = (LR_fin / LR_start) ** (1. / num_epochs) LR_decay = 0.5 # sqrt(0.5) logger_lip.info("LR_decay = %s", LR_decay) # BTW, LR decay might good for the BN moving average... shuffle_parts = 1 logger_lip.info("shuffle_parts = %s", shuffle_parts) oneHot = False ############################################## if viseme: nbClasses = 12 else: nbClasses = 39 # get the database # If it's small (lipspeakers) -> generate X_train, y_train etc here # otherwise we need to load and generate each speaker seperately in the training loop dataset = "TCDTIMIT" root_dir = os.path.join( os.path.expanduser('~/TCDTIMIT/lipreading/' + dataset)) results_dir = root_dir + "/results/CNN" if not os.path.exists(results_dir): os.makedirs(results_dir) if viseme: database_binaryDir = root_dir + '/binaryViseme' else: database_binaryDir = root_dir + '/binary' datasetType = "lipspeakers" #"volunteers" # lipspeakers or volunteers" ############################################## if datasetType == "lipspeakers": loadPerSpeaker = False # only lipspeakers small enough to fit in CPU RAM, generate X_train etc here storeProcessed = True processedDir = database_binaryDir + "_allLipspeakersProcessed" # pkl_path = processedDir + os.sep + datasetType + ".pkl" # if not os.path.exists(pkl_path): # logger_lip.info("dataset not yet processed. 
Processing...") # preprocessLipreading.prepLip_all(data_path=database_binaryDir, store_path=pkl_path, trainFraction=0.7, validFraction=0.1, # testFraction=0.2, # nbClasses=nbClasses, onehot=oneHot, type=datasetType, verbose=True) #datasetFiles = general_tools.unpickle(pkl_path) # if this doesn't succeed, you probably have to generate the files with datasetToPkl_fromCombined.py X_train, y_train = unpickle( os.path.expanduser( "~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTrain.pkl" )) X_val, y_val = unpickle( os.path.expanduser( "~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersVal.pkl" )) X_test, y_test = unpickle( os.path.expanduser( "~/TCDTIMIT/lipreading/TCDTIMIT/binary/allLipspeakersTest.pkl" )) datasetFiles = [X_train, y_train, X_val, y_val, X_test, y_test] else: # we need to load and preprocess each speaker before we evaluate, because dataset is too large and doesn't fit in CPU RAM loadPerSpeaker = True storeProcessed = True #if you have about 10GB hdd space, you can increase the speed by not reprocessing it each iteration processedDir = database_binaryDir + "_finalProcessed" # you can just run this program and it will generate the files the first time it encounters them, or generate them manually with datasetToPkl.py # just get the names testVolunteerNumbers = [ "13F", "15F", "21M", "23M", "24M", "25M", "28M", "29M", "30F", "31F", "34M", "36F", "37F", "43F", "47M", "51F", "54M" ] testVolunteers = [ str(testNumber) + ".pkl" for testNumber in testVolunteerNumbers ] lipspeakers = ["Lipspkr1.pkl", "Lipspkr2.pkl", "Lipspkr3.pkl"] allSpeakers = [ f for f in os.listdir(database_binaryDir) if os.path.isfile(os.path.join(database_binaryDir, f)) and os.path.splitext(f)[1] == ".pkl" ] trainVolunteers = [ f for f in allSpeakers if not (f in testVolunteers or f in lipspeakers) ] trainVolunteers = [ vol for vol in trainVolunteers if vol is not None ] if datasetType == "combined": trainingSpeakerFiles = trainVolunteers + lipspeakers testSpeakerFiles = testVolunteers elif datasetType == "volunteers": trainingSpeakerFiles = trainVolunteers testSpeakerFiles = testVolunteers else: raise Exception("invalid dataset entered") datasetFiles = [trainingSpeakerFiles, testSpeakerFiles] model_name = datasetType + "_" + network_type + "_" + ( "viseme" if viseme else "phoneme") + str(nbClasses) model_save_name = os.path.join(results_dir, model_name) # log file logFile = results_dir + os.sep + model_name + '.log' # if os.path.exists(logFile): # fh = logging.FileHandler(logFileT) # append to existing log # else: fh = logging.FileHandler(logFile, 'w') # create new logFile fh.setLevel(logging.DEBUG) fh.setFormatter(formatter) logger_lip.addHandler(fh) logger_lip.info('Building the CNN...') # Prepare Theano variables for inputs and targets inputs = T.tensor4('inputs') if oneHot: targets = T.matrix('targets') else: targets = T.ivector('targets') LR = T.scalar('LR', dtype=theano.config.floatX) # get the network structure if network_type == "google": cnnDict, l_out = buildNetworks.build_network_google( activation, alpha, epsilon, inputs, nbClasses) # 7.176.231 params elif network_type == "cifar10": cnn, l_out = buildNetworks.build_network_cifar10( input=inputs, nbClasses=nbClasses, activation=activation, alpha=alpha, epsilon=epsilon) elif network_type == "cifar10_v2": cnn, l_out = buildNetworks.build_network_cifar10_v2( input=inputs, nbClasses=nbClasses) elif network_type == "resnet50": cnn, l_out = buildNetworks.build_network_resnet50( inputs, nbClasses) # print het amount of network parameters 
logger_lip.info("Using the %s network", network_type) logger_lip.info("The number of parameters of this network: %s", L.count_params(l_out)) logger_lip.info("loading %s", model_save_name + '.npz') load_model(model_save_name + '.npz', l_out) # a = '/home/matthijs/TCDTIMIT/lipreading/TCDTIMIT/results/thirty.npz' # logger_lip.info("loading %s", a) # load_model(a, l_out) logger_lip.info("* COMPILING FUNCTIONS...") # for validation: disable dropout etc layers -> deterministic test_network_output = L.get_output(l_out, deterministic=True) test_acc = T.mean(T.eq(T.argmax(test_network_output, axis=1), targets), dtype=theano.config.floatX) # T.zeros((1,)) test_loss = LO.categorical_crossentropy(test_network_output, targets) test_loss = test_loss.mean() # Top k accuracy k = 3 # topk_acc = T.mean( T.any(T.eq(T.argsort(test_network_output, axis=1)[:, -k:], targets.dimshuffle(0, 'x')), axis=1), # dtype=theano.config.floatX) topk_acc = T.mean( lasagne.objectives.categorical_accuracy(test_network_output, targets.flatten(), top_k=k)) topk_acc_fn = theano.function([inputs, targets], topk_acc) val_fn = theano.function([inputs, targets], [test_loss, test_acc, topk_acc]) # For training, use nondeterministic output network_output = L.get_output(l_out, deterministic=False) out_fn = theano.function([inputs], network_output) # cross-entropy loss loss = LO.categorical_crossentropy(network_output, targets) loss = loss.mean() # # Also add weight decay to the cost function weight_decay = 1e-5 weightsl2 = lasagne.regularization.regularize_network_params( l_out, lasagne.regularization.l2) loss += weight_decay * weightsl2 # acc err = T.mean(T.eq(T.argmax(network_output, axis=1), targets), dtype=theano.config.floatX) # set all params to trainable params = L.get_all_params(l_out, trainable=True) updates = lasagne.updates.adam(loss_or_grads=loss, params=params, learning_rate=LR) # Compile a function performing a training step on a mini-batch (by giving the updates dictionary) # and returning the corresponding training loss: train_fn = theano.function([inputs, targets, LR], loss, updates=updates) logger_lip.info('Training...') train_lipreading.train(train_fn=train_fn, val_fn=val_fn, out_fn=out_fn, topk_acc_fn=topk_acc_fn, k=k, network_output_layer=l_out, batch_size=batch_size, LR_start=LR_start, LR_decay=LR_decay, num_epochs=num_epochs, dataset=datasetFiles, database_binaryDir=database_binaryDir, storeProcessed=storeProcessed, processedDir=processedDir, loadPerSpeaker=loadPerSpeaker, justTest=justTest, save_name=model_save_name, shuffleEnabled=True)
def print_layers(network):
    for layer in get_all_layers(network):
        print str(type(layer)).split(".")[-1][:-2] + ': ' + str(layer.output_shape)
        print count_params(layer)
def train_model(window_size, max_epochs, patience): root_dir = join('data', 'nets') # the file from which to load pre-trained weights #init_file = join(root_dir, # 'subj%d_weights_deep_nocsp_wide.pickle' % ( # 4)) #init_file = join(root_dir, # 'weights_super_deeper.pickle') init_file = None # the file to which the learned weights will be written weights_file = join(root_dir, 'weights.pickle') temp_weights_file = join(root_dir, 'epoch_%d.pickle') train_data, train_events = [], [] valid_data, valid_events = [], [] for subj_id in range(1, 13): print('loading time series for subject %d...' % (subj_id)) subj_data_list, subj_events_list = utils.load_subject_train(subj_id) print(' creating train and validation sets...') subj_train_data, subj_train_events, subj_valid_data, subj_valid_events = \ utils.split_train_test_data(subj_data_list, subj_events_list, val_size=2, rand=False) train_data += subj_train_data train_events += subj_train_events valid_data += subj_valid_data valid_events += subj_valid_events print('using %d time series for training' % (len(train_data))) print('using %d time series for validation' % (len(valid_data))) print('creating fixed-size time-windows of size %d' % (window_size)) # the training windows should be in random order train_slices = batching.get_permuted_windows(train_data, window_size, rand=True) valid_slices = batching.get_permuted_windows(valid_data, window_size, rand=True) print('there are %d windows for training' % (len(train_slices))) print('there are %d windows for validation' % (len(valid_slices))) #batch_size = 64 batch_size = 512 num_channels = 32 num_actions = 6 train_data, valid_data = \ utils.preprocess(train_data, valid_data) print('building model %s...' % (sys.modules[build_model.__module__].__name__)) l_out = build_model(None, num_channels, window_size, num_actions) all_layers = layers.get_all_layers(l_out) print('this network has %d learnable parameters' % (layers.count_params(l_out))) for layer in all_layers: print('Layer %s has output shape %r' % (layer.name, layer.output_shape)) if init_file is not None: print('loading model weights from %s' % (init_file)) with open(init_file, 'rb') as ifile: src_layers = pickle.load(ifile) dst_layers = layers.get_all_params(l_out) for i, (src_weights, dst_layer) in enumerate(zip(src_layers, dst_layers)): print('loading pretrained weights for %s' % (dst_layer.name)) dst_layer.set_value(src_weights) else: print('all layers will be trained from random initialization') #1r = theano.shared(np.cast['float32'](0.001)) lr = theano.shared(np.cast['float32'](0.01)) mntm = 0.9 print('compiling theano functions...') train_iter = iter_funcs.create_iter_funcs_train(lr, mntm, l_out) valid_iter = iter_funcs.create_iter_funcs_valid(l_out) best_weights = None best_valid_loss = np.inf best_epoch = 0 print('starting training for all subjects at %s' % (utils.get_current_time())) try: for epoch in range(max_epochs): print('epoch: %d' % (epoch)) train_losses, training_outputs, training_inputs = [], [], [] num_batches = (len(train_slices) + batch_size - 1) / batch_size t_train_start = time() for i, (Xb, yb) in enumerate( batching.batch_iterator(batch_size, train_slices, train_data, train_events, window_norm=False)): t_batch_start = time() # hack for faster debugging #if i < 70000: # continue train_loss, train_output = \ train_iter(Xb, yb) if np.isnan(train_loss): print('nan loss encountered in minibatch %d' % (i)) continue train_losses.append(train_loss) assert len(yb) == len(train_output) for input, output in zip(yb, train_output): 
training_inputs.append(input) training_outputs.append(output) batch_duration = time() - t_batch_start if i % 10 == 0: eta = batch_duration * (num_batches - i) m, s = divmod(eta, 60) h, m = divmod(m, 60) print(' training... (ETA = %d:%02d:%02d)\r' % (h, m, s)), sys.stdout.flush() avg_train_loss = np.mean(train_losses) training_inputs = np.vstack(training_inputs) training_outputs = np.vstack(training_outputs) train_roc = roc_auc_score(training_inputs, training_outputs) train_duration = time() - t_train_start print('') print(' train loss: %.6f' % (avg_train_loss)) print(' train roc: %.6f' % (train_roc)) print(' duration: %.2f s' % (train_duration)) valid_losses, valid_outputs, valid_inputs = [], [], [] num_batches = (len(valid_slices) + batch_size - 1) / batch_size t_valid_start = time() for i, (Xb, yb) in enumerate( batching.batch_iterator(batch_size, valid_slices, valid_data, valid_events, window_norm=False)): t_batch_start = time() valid_loss, valid_output = \ valid_iter(Xb, yb) if np.isnan(valid_loss): print('nan loss encountered in minibatch %d' % (i)) continue valid_losses.append(valid_loss) assert len(yb) == len(valid_output) for input, output in zip(yb, valid_output): valid_inputs.append(input) valid_outputs.append(output) batch_duration = time() - t_batch_start if i % 10 == 0: eta = batch_duration * (num_batches - i) m, s = divmod(eta, 60) h, m = divmod(m, 60) print(' validation... (ETA = %d:%02d:%02d)\r' % (h, m, s)), sys.stdout.flush() # allow training without validation if valid_losses: avg_valid_loss = np.mean(valid_losses) valid_inputs = np.vstack(valid_inputs) valid_outputs = np.vstack(valid_outputs) valid_roc = roc_auc_score(valid_inputs, valid_outputs) valid_duration = time() - t_valid_start print('') print(' valid loss: %.6f' % (avg_valid_loss)) print(' valid roc: %.6f' % (valid_roc)) print(' duration: %.2f s' % (valid_duration)) else: print(' no validation...') # if we are not doing validation we always want the latest weights if not valid_losses: best_epoch = epoch model_train_loss = avg_train_loss model_train_roc = train_roc model_valid_roc = -1. best_valid_loss = -1. best_weights = layers.get_all_param_values(l_out) elif avg_valid_loss < best_valid_loss: best_epoch = epoch model_train_roc = train_roc model_valid_roc = valid_roc model_train_loss = avg_train_loss best_valid_loss = avg_valid_loss best_weights = layers.get_all_param_values(l_out) temp_file = temp_weights_file % (epoch) print('saving temporary best weights to %s' % (temp_file)) with open(temp_file, 'wb') as ofile: pickle.dump(best_weights, ofile, protocol=pickle.HIGHEST_PROTOCOL) if epoch > best_epoch + patience: break best_epoch = epoch new_lr = 0.5 * lr.get_value() lr.set_value(np.cast['float32'](new_lr)) print('setting learning rate to %.6f' % (new_lr)) except KeyboardInterrupt: print('caught Ctrl-C, stopping training...') with open(weights_file, 'wb') as ofile: print('saving best weights to %s' % (weights_file)) pickle.dump(best_weights, ofile, protocol=pickle.HIGHEST_PROTOCOL) print('finished training for all subjects at %s' % (utils.get_current_time())) return model_train_loss, best_valid_loss, model_train_roc, model_valid_roc
def __init__(self, config, use_noise=False): self.width = config['width'] self.height = config['height'] self.channels = config['channels'] self.actions = config['actions'] self.history = config['history'] gru_units = config['gru_units'] att_units = config['att_units'] l_action = L.InputLayer((None, )) l_input, l_cnn = build_cnn(config, use_noise) l_gru = GRULayer([l_action, l_cnn], num_steps=self.history, num_units=gru_units, att_units=att_units) l_attention = L.InputLayer((None, l_gru.num_pixel)) l_hidden1 = L.InputLayer((None, gru_units)) l_hidden2 = L.InputLayer((None, gru_units)) l_gru_step = l_gru.get_step_layer( [l_action, l_cnn, l_attention, l_hidden1, l_hidden2]) l_out_step = L.DenseLayer(l_gru_step, num_units=self.actions, nonlinearity=lasagne.nonlinearities.softmax) l_out_batch = L.DenseLayer(l_gru, num_units=l_out_step.num_units, nonlinearity=l_out_step.nonlinearity, W=l_out_step.W, b=l_out_step.b) self.l_attention = l_attention self.l_action = l_action self.l_input = l_input self.num_params_all = L.count_params(l_out_batch, trainable=True) self.params_all = L.get_all_params(l_out_batch, trainable=True) #shapes_all = [p.get_value(borrow=True).shape for p in params_all] self.num_params_cnn = L.count_params(l_cnn, trainable=True) self.params_cnn = L.get_all_params(l_cnn, trainable=True) #shapes_cnn = [p.get_value(borrow=True).shape for p in params_cnn] self.num_params_gru = self.num_params_all - self.num_params_cnn self.params_gru = [ p for p in self.params_all if p not in self.params_cnn ] #shapes_gru = [p.get_value(borrow=True).shape for p in params_gru] print 'Number of policy parameters: {} > {}({}) = {}({}) + {}({})'.format( L.count_params(l_out_step), self.num_params_all, len(self.params_all), self.num_params_cnn, len(self.params_cnn), self.num_params_gru, len(self.params_gru)) self.cnn = l_cnn self.gru = l_gru self.gru_step = l_out_step self.gru_batch = l_out_batch self.batch_history_shape = (-1, self.history, self.channels, self.height, self.width) self.batch_flatten_shape = (-1, self.channels, self.height, self.width) self.t_action = T.ivector('action') self.t_state = T.tensor4('state') self.t_attention = T.matrix('attention') self.t_hidden1 = T.matrix('hidden1') self.t_hidden2 = T.matrix('hidden2') step_hidden2, step_output = L.get_output( [l_gru_step, l_out_step], { l_action: self.t_action, l_input: self.t_state, l_attention: self.t_attention, l_hidden1: self.t_hidden1, l_hidden2: self.t_hidden2 }, deterministic=True) step_hidden1 = l_gru.hidden1 step_attention = l_gru.attention print 'Compile policy one step output' self._output_step = theano.function([ self.t_action, self.t_state, self.t_attention, self.t_hidden1, self.t_hidden2 ], [step_attention, step_hidden1, step_hidden2, step_output])
def main(config=None, init_path='', out_path='', batchsize=128,
         dataset='C10', pre=False, stoch_depth=False, n=5, stype='A',
         bottleneck=False, dim_inc_meth='1x1'):
    """Train a resnet on the CIFAR-10 data set.

    Parameters
    ----------
    config : list of dictionaries or ``None`` (``None``)
        The configuration for the training.
    init_path : string (``''``)
        The path (prefix) to the initial model parameters, updates and
        journal files. This is for continuing a previous training.
    out_path : string (``''``)
        The path (prefix) for the output file. This function will save the
        trained model, a journal with training statistics, the updates for
        the optimizer and the created data set (this is needed to continue
        the training).
    batchsize : integer (``128``)
        The batch size for training.
    dataset : ``'C10'``, ``'C100'`` or ``'SVHN'`` (``'C10'``)
        The data set to use for training. The options are: CIFAR-10,
        CIFAR-100 and the "Street View House Number" data sets.
    pre : boolean (``False``)
        If ``True`` use the pre-activation order.
    stoch_depth : boolean (``False``)
        If ``True`` use the stochastic depth approach, with a linear decay
        and $p_L = 0.5$.
    n : integer (``5``)
        The parameter 'n' from the paper.
    stype : ``'A'``, ``'B'`` or ``'C'`` (``'A'``)
        The type of shortcut.
    bottleneck : boolean (``False``)
        Use bottleneck approach with 3 layers per stack.
    dim_inc_meth : ``'1x1'``, ``'2x2'``, ``'max'``, ``'sum'`` or ``'avg'``
        The method to deal with the increase in dimensions. '1x1' will
        perform a 1x1 convolution and ignore 3/4 of the input. '2x2' will
        perform a 2x2 convolution; this will add some parameters, but won't
        ignore any inputs. 'max', 'sum' and 'avg' will perform the
        corresponding pooling operation, followed by a 1x1 convolution; this
        will not ignore any inputs nor add any parameters to the model.
        NOTE: This argument is ignored if the shortcut type is 'A'.
    """
    # network
    assert dataset in ('C10', 'C100', 'SVHN')
    classes = 100 if dataset == 'C100' else 10
    bases = (PreResNet, ResNet) if pre else (ResNet, )
    if stoch_depth:
        bases = (StochasticDepth, ) + bases
    model_cls = type('ModelClass', bases, {})
    model = model_cls.cifar_model(n=n, type=stype, bottleneck=bottleneck,
                                  dim_inc_meth=dim_inc_meth, classes=classes)

    # trainer
    if dataset == 'SVHN':
        trainer_cls = SVHN_SDTrainer
    else:
        trainer_cls = CIFAR_SDTrainer if stoch_depth else CIFAR_ResNetTrainer

    if init_path:
        trainer = trainer_cls.load_state(model, init_path, batchsize=batchsize)
    else:
        trainer = trainer_cls(model, batchsize=batchsize)

    # dataset
    if not trainer.dataset:
        if dataset == 'SVHN':
            raise NotImplementedError(
                'The SVHN dataset is not yet implemented.')
        elif dataset == 'C10':
            trainer.dataset = CIFAR10(testsplit=0.1)
        elif dataset == 'C100':
            trainer.dataset = CIFAR100(testsplit=0.1)

    # training the network
    print('Training model ({} parameters) ...'.format(
        count_params(model, trainable=True)))
    trainer.train(config)

    # save the network, the updates and the journal
    if not out_path:
        _, acc = trainer.validate()
        date = datetime.now().strftime('%Y-%m-%d_%H:%M')
        bn_str = 'bottleneck' if bottleneck else 'no_bottleneck'
        _type = 'A' if stype == 'A' else '{}_{}'.format(stype, dim_inc_meth)
        mdl_str = 'pre-resnet' if pre else 'resnet'
        if stoch_depth:
            mdl_str += '-sd'
        tmpl = '{}-{}__-__n_{}_-_{}_-_{}__-__acc_{:.2f}_{}'
        out_path = tmpl.format(mdl_str, dataset, n, _type, bn_str,
                               acc * 100, date)
    trainer.save_state(out_path, resume=True)
def train_setup():
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    print(" with input dimension {0},{1},{2}".format(config.image_height,
                                                     config.image_width,
                                                     config.image_channel))
    network = cnn_archi(input_var,
                        config.image_channel,
                        config.image_height, config.image_width,
                        config.output_length)
    print('Number of parameters : {0}'.format(count_params(network)))

    if config.init_model is not None:
        with np.load(config.init_model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        set_all_param_values(network, param_values)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    ent_loss = categorical_crossentropy(prediction, target_var)
    ent_loss = ent_loss.mean()

    l1_regu = config.l1_regu * regularize_network_params(network, l1)
    l2_regu = config.l2_regu * regularize_network_params(network, l2)
    loss = ent_loss + l1_regu + l2_regu

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = get_all_params(network, trainable=True)
    #grads = T.grad(loss, params)
    #scaled_grads = norm_constraint(grads, 5.)
    updates = nesterov_momentum(loss, params,
                                learning_rate=config.learning_rate,
                                momentum=config.momentum)
    #updates = rmsprop(loss, params, learning_rate=config.learning_rate)

    # Constrain the norm of each parameter update.
    for param in get_all_params(network, regularizable=True):
        norm_axis = None
        if param.ndim == 1:
            norm_axis = [0]
        updates[param] = norm_constraint(updates[param],
                                         5. * compute_norms(param.get_value()).mean(),
                                         norm_axes=norm_axis)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = get_output(network, deterministic=True)
    test_classes = T.argmax(test_prediction, axis=1)
    test_loss = categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()

    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.eq(test_classes, target_var)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var],
                               ent_loss,
                               updates=updates,
                               allow_input_downcast=True)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var],
                             [test_loss, test_prediction, test_acc],
                             allow_input_downcast=True)

    return network, train_fn, val_fn
def _print_network(network):
    kwargs['logger'].info("\n")
    for layer in get_all_layers(network):
        kwargs['logger'].info(str(layer) + ' : ' + str(layer.output_shape))
    kwargs['logger'].info("Total Parameters: " + str(count_params(layer)))
    kwargs['logger'].info("\n")
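# NOTE: a minimal sketch (not one of the original helpers), assuming a simple
# linear stack of standard Lasagne layers. count_params() sums the parameters
# of the given layer *and every layer below it* in the graph, so calling it on
# the output layer (as the helpers above do) yields the total for the whole
# network, while calling it on an intermediate layer yields a running total up
# to that point:
from lasagne.layers import get_all_layers, count_params

def summarize(network):
    running = 0
    for layer in get_all_layers(network):
        cumulative = count_params(layer)
        print('%-30s out=%r params_so_far=%d (+%d)'
              % (layer.__class__.__name__, layer.output_shape,
                 cumulative, cumulative - running))
        running = cumulative
    print('Total parameters: %d' % count_params(network))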
def main(): # Where we'll save sample data to fname = sys.argv[0].split('.py')[0] curr_time = datetime.now().strftime('%d%H%M') save_dir = '../output/segmentation/images-' + fname + curr_time image_path = '../data/seg-tomato-leaf-stem-images.npz' label_path = '../data/seg-tomato-leaf-stem-labels.npz' #image_path = '../data/seg-tomato-images.npz' #label_path = '../data/seg-tomato-labels.npz' pretrained = '../data/vgg16.pkl' class_dict_path = '../data/segmentation-label-dict.json' num_epochs = 300 lrate = 1e-3 batch_size = 1 seed = 1234 crop_size = 700 # theano symbolic tensors input_var = T.tensor4('x') target_var = T.itensor3('y') input_shape = (None, 3, None, None) # Load data. # x_train and x_valid contains the images; # y_train and y_test contains the labels for the corresponding images # Number of validation/test samples is hard coded to 10 samples. # TO DO: change this to a percentage of the total size of the data X_train, y_train, X_valid, y_valid = load_data(image_path, label_path, seed, 10) unique_classes = np.unique(y_train) print "class: ", unique_classes # Slightly increase importance of segmenting the tomato border class. Note # that we can do this for all other classes as well by changing the index. with open(class_dict_path) as data_file: class_labels = json.load(data_file) print "class labels:", class_labels class_weights = np.ones(len(class_labels)) class_weights[class_labels['tomato border']] *= 1.05 class_weights[class_labels['leaf border']] *= 1.05 class_weights[class_labels['stem border']] *= 1.05 # Compute class weights to balance dataset. We first find the value to get # an equal contribution from each class, then transform weights to [0, 1] # First get the frequency of each class in the training set print "bgd count: ", [np.sum(y_train == 5)] counts = [np.sum(y_train == class_) for class_ in unique_classes] print "counts: ", counts for class_name in class_labels.keys(): print "class: ", class_labels[ class_name], "class name: ", class_name, " freq: ", counts[ class_labels[class_name]] counts = np.asarray(counts).astype(theano.config.floatX) # The factor used to adjust the weights for each freq in counts is (minimum_freq in counts)/freq counts = np.min(counts) * (1. / counts) # Since 'stem' is the least frequent class in the training set # class_weights[class_labels['stem']] *= 1.0 counts = counts * class_weights counts = T.as_tensor_variable(counts) print 'Building model' softmax, network, network_crf, vgg_layers = \ build_network(input_var, len(unique_classes)) print 'Number of parameters: ', nn.count_params(softmax) # If training, initialize weights with ImageNet pretrained weights. 
# otherwise, we can load full network weights from file and set all # by commenting this and uncommenting subsequent lines of code param_values = pickle.load(file(pretrained, mode='r'))['param values'] nn.set_all_param_values(vgg_layers, param_values[:13 * 2]) #with np.load('../data/trained-weights.npz') as f: # param_values = [f['arr_%d' % i] for i in range(len(f.files))] #lasagne.layers.set_all_param_values(softmax, param_values) # When building the loss, we'll weight class loss by frequency due to # consistency of labels and their overall desireability output = nn.get_output(softmax, deterministic=False) loss = categorical_crossentropy(output, target_var.flatten()) loss = loss * counts[target_var.flatten()] loss = T.mean(loss) # When training, we only want to update the newly added layers pretrained_layers = nn.get_all_layers(vgg_layers) layers = nn.get_all_layers(softmax, treat_as_input=pretrained_layers) params = [l.get_params(trainable=True) for l in layers] #[[W,b], [W,b] ... ] trainable_params = [p for a in params for p in a] updates = lasagne.updates.adamax(loss, trainable_params, lrate) #updates = lasagne.updates.rmsprop(loss, trainable_params, lrate) # Take the most likely class and compare it to the provided labels train_acc = T.mean(T.eq(T.argmax(output, axis=1), target_var.flatten())) train_fn = theano.function([input_var, target_var], [loss, train_acc], updates=updates, allow_input_downcast=True) # Validation function output, preds = nn.get_output([softmax, network_crf], deterministic=True) loss = categorical_crossentropy(output, target_var.flatten()) loss = loss * counts[target_var.flatten()] loss = T.mean(loss) test_acc = T.mean(T.eq(T.argmax(output, axis=1), target_var.flatten())) valid_fn = theano.function([input_var, target_var], [loss, test_acc, preds], allow_input_downcast=True) # Early stopping best_params = None count, best_err = 0, np.inf # Train the network: iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_acc = 0 train_batches = 0 start_time = time.time() for inputs, targets in iterate_minibatches(X_train, y_train, batch_size, shuffle=True): inputs, targets = random_crop(inputs, targets, X_train.shape[2], X_train.shape[3]) err, acc = train_fn(inputs, targets) train_err += err train_acc += acc train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 valid_iou = np.zeros((len(unique_classes), )) valid_found = np.zeros((len(unique_classes), )) val_preds, val_inputs, val_targets = [], [], [] for inputs, targets in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False): err, acc, preds = valid_fn(inputs, targets) val_err += err val_acc += acc val_batches += 1 val_preds.append(preds) val_inputs.append(inputs) val_targets.append(targets) iou, found = meanIOU(preds, targets, len(unique_classes)) valid_iou += iou valid_found += found if epoch % 3 == 0: val_preds = np.vstack(val_preds) val_inputs = np.vstack(val_inputs) / 255.0 val_targets = np.vstack(val_targets) plot_segmentations(val_inputs, val_preds, val_targets, epoch, save_dir) ''' output_dir = 'output' if not os.path.exists(output_dir): os.makedirs(output_dir) np.savez(os.path.join(output_dir, 'predictions.npz'), val_preds) np.savez(os.path.join(output_dir, 'targets.npz'), val_targets) np.savez(os.path.join(output_dir, 'rgb.npz'), val_inputs) ''' confusion(val_preds, val_targets) # Then we print the results for this epoch: print "Epoch {} of {} took {:.3f}s".format(epoch + 1, 
num_epochs, time.time() - start_time) print " training loss:\t\t{:.6f}".format(train_err / train_batches) print " validation loss:\t\t{:.6f}".format(val_err / val_batches) print " validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100) print " validation IOU:\t\t{}".format(valid_iou / valid_found) # Early stopping val_err = val_err / float(val_batches) if val_err > best_err * 0.99: count += 1 else: count = 0 best_err = val_err best_params = nn.get_all_param_values(softmax) if count >= 6: nn.set_all_param_values(softmax, best_params) break # And a full pass over the validation data: val_preds, val_inputs, val_targets = [], [], [] for batch in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False): inputs, targets = batch err, acc, preds = valid_fn(inputs, targets) val_preds.append(preds) val_inputs.append(inputs) val_targets.append(targets) val_preds = np.vstack(val_preds) val_inputs = np.vstack(val_inputs) / 255.0 val_targets = np.vstack(val_targets) print 'Final confusion matrix: ' confusion(val_preds, val_targets) plot_segmentations(val_inputs, val_preds, val_targets, 'final', save_dir) np.savez('../data/trained-weights.npz', *lasagne.layers.get_all_param_values(softmax))
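The loop above implements patience-based early stopping: snapshot the parameters whenever the validation error improves by at least 1%, and restore the best snapshot after six stagnant epochs. A minimal sketch of that pattern in isolation; val_err_for_epoch is a hypothetical stand-in for the validation pass.

# Sketch: patience-based early stopping with parameter snapshots.
import numpy as np
import lasagne.layers as nn

def train_with_early_stopping(network, num_epochs, val_err_for_epoch, patience=6):
    best_params, best_err, count = None, np.inf, 0
    for epoch in range(num_epochs):
        val_err = val_err_for_epoch(epoch)
        if val_err > best_err * 0.99:      # not enough improvement
            count += 1
        else:                              # new best: reset patience, snapshot
            count = 0
            best_err = val_err
            best_params = nn.get_all_param_values(network)
        if count >= patience:
            nn.set_all_param_values(network, best_params)
            break
    return best_err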
def main(): # Where we'll save data to fname = sys.argv[0].split('.py')[0] curr_time = datetime.now().strftime('%d%H%M') save_dir = 'sample-' + fname + curr_time lrate = 5e-4 batch_size = 1 num_epochs = 100 crop_size = 360 input_var = T.tensor4('x') target_var = T.itensor4('y') images = np.load('images.npz')['arr_0'].astype( theano.config.floatX) / 255.0 labels = np.load('labels.npz')['arr_0'].astype(np.int32) num_classes = labels.shape[1] idx = np.arange(num_classes) idx = idx.reshape(1, num_classes, 1, 1) labels = labels / 255 labels = labels.astype(np.int32) * idx labels = np.sum(labels, axis=1, keepdims=True) np.random.seed(1234) idx = np.arange(images.shape[0]) np.random.shuffle(idx) X_train = images[idx[:-10]] y_train = labels[idx[:-10]] X_valid = images[idx[-10:]] y_valid = labels[idx[-10:]] # Compute class weights to balance dataset counts = [] for cl in xrange(num_classes): class_counts = 0 for img in y_train: class_counts += np.sum(img == cl) counts.append(class_counts) counts = np.array(counts).astype(theano.config.floatX) # We can either upscale the loss (i.e. multiply by a factor > 1), or # downscale the loss (multiply by a factor < 1). Here we do the latter counts = np.max(counts) / counts counts = counts / np.max(counts) counts[0] = counts[0] * 1.1 # stem counts[1] = counts[1] * 1.1 # tomato counts = T.as_tensor_variable(counts) # Build DenseNetwork input_shape = (None, 3, crop_size, crop_size) softmax, network = build_network(input_var, input_shape, num_classes) print 'Number of paramters: ', nn.count_params(network) preds = nn.get_output(softmax, deterministic=False) loss = lasagne.objectives.categorical_crossentropy(preds, target_var.flatten()) loss = loss * counts[target_var.flatten()] loss = T.mean(loss) + regularize_network_params(softmax, l2) * 0.0001 acc = T.mean(T.eq(T.argmax(preds, axis=1), target_var.flatten())) params = nn.get_all_params(softmax, trainable=True) updates = lasagne.updates.adam(loss, params, lrate) train_fn = theano.function([input_var, target_var], [loss, acc], updates=updates, allow_input_downcast=True) probs, preds = nn.get_output([softmax, network], deterministic=True) loss = lasagne.objectives.categorical_crossentropy(probs, target_var.flatten()) loss = loss * counts[target_var.flatten()] loss = T.mean(loss) + regularize_network_params(softmax, l2) * 0.0001 acc = T.mean(T.eq(T.argmax(probs, axis=1), target_var.flatten())) valid_fn = theano.function([input_var, target_var], [loss, acc, preds], allow_input_downcast=True) # We iterate over epochs: for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_acc = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True): inputs, targets = batch inputs, targets = random_crop(inputs, targets, crop_size, crop_size) err, acc = train_fn(inputs, targets) train_err += err train_acc += acc train_batches += 1 # And a full pass over the validation data: val_err = 0 val_acc = 0 val_batches = 0 valid_iou = np.zeros((num_classes, )) val_preds, val_inputs, val_targets = [], [], [] for batch in iterate_minibatches(X_valid, y_valid, batch_size, shuffle=False): inputs, targets = batch input_crop, target_crop = random_crop(inputs, targets, crop_size, crop_size) err, acc, preds = valid_fn(input_crop, target_crop) val_err += err val_acc += acc val_batches += 1 val_preds.append(preds) val_inputs.append(input_crop) val_targets.append(target_crop) valid_iou += meanIOU(preds, target_crop, num_classes) if 
epoch % 2 == 0: val_preds = np.vstack(val_preds) val_inputs = np.vstack(val_inputs) val_targets = np.vstack(val_targets) plot_predictions(val_inputs, val_preds, val_targets, epoch, save_dir) # Then we print the results for this epoch: print "Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time) print " training loss:\t\t{:.6f}".format(train_err / train_batches) print " validation loss:\t\t{:.6f}".format(val_err / val_batches) print " validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100) print " validation IOU:\t\t{}".format(valid_iou / val_batches)
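Both segmentation scripts weight the per-pixel cross-entropy by an inverse-frequency factor looked up from the flattened integer targets. A minimal sketch of that weighting on its own; shapes and class counts are illustrative.

# Sketch: frequency-balanced cross-entropy, with per-class weights derived
# from training-set counts, normalised into (0, 1], and indexed by the
# flattened integer targets.
import numpy as np
import theano
import theano.tensor as T
from lasagne.objectives import categorical_crossentropy

def weighted_xent(probs, flat_targets, class_counts):
    # class_counts: 1-D array with the pixel count of each class
    w = np.asarray(class_counts, dtype=theano.config.floatX)
    w = np.max(w) / w          # rarer classes get larger weights
    w = w / np.max(w)          # rescale into (0, 1]
    w = T.as_tensor_variable(w)
    loss = categorical_crossentropy(probs, flat_targets)
    return T.mean(loss * w[flat_targets])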
def main(options): print 'Build and compile network' input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') skip_scale = theano.shared(convert_to_floatX(options['skip_scale'])) network, rand_layer_list = build_network( input_data=input_data, input_mask=input_mask, num_inputs=options['num_inputs'], num_units_list=options['num_units_list'], num_outputs=options['num_outputs'], skip_scale=skip_scale, dropout_ratio=options['dropout_ratio'], weight_noise=options['weight_noise'], use_layer_norm=options['use_layer_norm'], peepholes=options['peepholes'], learn_init=options['learn_init'], grad_clipping=options['grad_clipping'], gradient_steps=options['gradient_steps'], use_projection=options['use_projection']) network_params = get_all_params(network, trainable=True) print("number of parameters in model: %d" % count_params(network, trainable=True)) if options['reload_model']: print('Loading Parameters...') pretrain_network_params_val, pretrain_update_params_val, pretrain_total_batch_cnt = pickle.load( open(options['reload_model'], 'rb')) print('Applying Parameters...') set_model_param_value(network_params, pretrain_network_params_val) else: pretrain_update_params_val = None pretrain_total_batch_cnt = 0 print 'Build network trainer' training_fn, trainer_params = set_network_trainer( input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network, rand_layer_list=rand_layer_list, updater=options['updater'], learning_rate=options['lr'], grad_max_norm=options['grad_norm'], l2_lambda=options['l2_lambda'], load_updater_params=pretrain_update_params_val) print 'Build network predictor' predict_fn = set_network_predictor(input_data=input_data, input_mask=input_mask, target_data=target_data, target_mask=target_mask, num_outputs=options['num_outputs'], network=network) print 'Load data stream' train_datastream = get_datastream(path=options['data_path'], norm_path=options['norm_data_path'], which_set='train_si84', batch_size=options['batch_size']) print 'Start training' if os.path.exists(options['save_path'] + '_eval_history.npz'): evaluation_history = numpy.load( options['save_path'] + '_eval_history.npz')['eval_history'].tolist() else: evaluation_history = [[[10.0, 10.0, 1.0], [10.0, 10.0, 1.0]]] early_stop_flag = False early_stop_cnt = 0 total_batch_cnt = 0 try: # for each epoch for e_idx in range(options['num_epochs']): # for each batch for b_idx, data in enumerate( train_datastream.get_epoch_iterator()): total_batch_cnt += 1 if pretrain_total_batch_cnt >= total_batch_cnt: continue # get input, target data input_data = data[0].astype(floatX) input_mask = data[1].astype(floatX) # get target data target_data = data[2] target_mask = data[3].astype(floatX) # get output train_output = training_fn(input_data, input_mask, target_data, target_mask) train_predict_cost = train_output[0] network_grads_norm = train_output[1] skip_means = train_output[2:] # show intermediate result if total_batch_cnt % options[ 'train_disp_freq'] == 0 and total_batch_cnt != 0: # pdb.set_trace() best_idx = numpy.asarray(evaluation_history)[:, 1, 2].argmin() print '============================================================================================' print 'Model Name: ', options['save_path'].split('/')[-1] print '============================================================================================' print 'Epoch: ', str(e_idx), ', Update: ', str( 
total_batch_cnt) print '--------------------------------------------------------------------------------------------' print 'Prediction Cost: ', str(train_predict_cost) print 'Gradient Norm: ', str(network_grads_norm) print '--------------------------------------------------------------------------------------------' print 'Skip Ratio: ', skip_means print 'Skip Scale: ', str(skip_scale.get_value()) print '--------------------------------------------------------------------------------------------' print 'Train NLL: ', str( evaluation_history[-1][0][0]), ', BPC: ', str( evaluation_history[-1][0][1]), ', FER: ', str( evaluation_history[-1][0][2]) print 'Valid NLL: ', str( evaluation_history[-1][1][0]), ', BPC: ', str( evaluation_history[-1][1][1]), ', FER: ', str( evaluation_history[-1][1][2]) print '--------------------------------------------------------------------------------------------' print 'Best NLL: ', str( evaluation_history[best_idx][1][0]), ', BPC: ', str( evaluation_history[best_idx][1] [1]), ', FER: ', str( evaluation_history[best_idx][1][2]) # evaluation if total_batch_cnt % options[ 'train_eval_freq'] == 0 and total_batch_cnt != 0: train_eval_datastream = get_datastream( path=options['data_path'], norm_path=options['norm_data_path'], which_set='train_si84', batch_size=options['eval_batch_size']) valid_eval_datastream = get_datastream( path=options['data_path'], norm_path=options['norm_data_path'], which_set='test_dev93', batch_size=options['eval_batch_size']) train_nll, train_bpc, train_fer = network_evaluation( predict_fn, train_eval_datastream) valid_nll, valid_bpc, valid_fer = network_evaluation( predict_fn, valid_eval_datastream) # check over-fitting if valid_fer > numpy.asarray(evaluation_history)[:, 1, 2].min(): early_stop_cnt += 1. else: early_stop_cnt = 0. best_network_params_vals = get_model_param_values( network_params) pickle.dump( best_network_params_vals, open(options['save_path'] + '_best_model.pkl', 'wb')) if early_stop_cnt > 10: early_stop_flag = True break # save results evaluation_history.append( [[train_nll, train_bpc, train_fer], [valid_nll, valid_bpc, valid_fer]]) numpy.savez(options['save_path'] + '_eval_history', eval_history=evaluation_history) # save network if total_batch_cnt % options[ 'train_save_freq'] == 0 and total_batch_cnt != 0: cur_network_params_val = get_model_param_values( network_params) cur_trainer_params_val = get_update_params_values( trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([ cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt ], open(options['save_path'] + '_last_model.pkl', 'wb')) if total_batch_cnt % 1000 == 0 and total_batch_cnt != 0: skip_scale.set_value( convert_to_floatX(skip_scale.get_value() * 1.01)) if early_stop_flag: break except KeyboardInterrupt: print 'Training Interrupted' cur_network_params_val = get_model_param_values(network_params) cur_trainer_params_val = get_update_params_values(trainer_params) cur_total_batch_cnt = total_batch_cnt pickle.dump([ cur_network_params_val, cur_trainer_params_val, cur_total_batch_cnt ], open(options['save_path'] + '_last_model.pkl', 'wb'))
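The script above checkpoints through project-specific helpers (get_model_param_values, set_model_param_value, get_update_params_values). A rough equivalent using only stock Lasagne calls, sketched under that assumption; the updater state is omitted here.

# Sketch: dump the numeric parameter values plus a batch counter, and load
# them back into a freshly built network of the same architecture.
import pickle
import lasagne.layers as nn

def save_checkpoint(network, total_batch_cnt, path):
    with open(path, 'wb') as f:
        pickle.dump([nn.get_all_param_values(network), total_batch_cnt], f)

def load_checkpoint(network, path):
    with open(path, 'rb') as f:
        param_values, total_batch_cnt = pickle.load(f)
    nn.set_all_param_values(network, param_values)
    return total_batch_cnt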
def build_resnet_model(): log.i('BUILDING RESNET MODEL...') # Random Seed lasagne_random.set_rng(cfg.getRandomState()) # Input layer for images net = l.InputLayer((None, cfg.IM_DIM, cfg.IM_SIZE[1], cfg.IM_SIZE[0])) # First Convolution net = l.Conv2DLayer(net, num_filters=cfg.FILTERS[0], filter_size=cfg.KERNEL_SIZES[0], pad='same', W=initialization(cfg.NONLINEARITY), nonlinearity=None) log.i(("\tFIRST CONV OUT SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)) - 1)) # Residual Stacks for i in range(0, len(cfg.FILTERS)): net = resblock(net, filters=cfg.FILTERS[i] * cfg.RESNET_K, kernel_size=cfg.KERNEL_SIZES[i], stride=2, num_groups=cfg.NUM_OF_GROUPS[i]) for _ in range(1, cfg.RESNET_N): net = resblock(net, filters=cfg.FILTERS[i] * cfg.RESNET_K, kernel_size=cfg.KERNEL_SIZES[i], num_groups=cfg.NUM_OF_GROUPS[i], preactivated=False) log.i(("\tRES STACK", i + 1, "OUT SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)) - 1)) # Post Activation net = batch_norm(net) net = l.NonlinearityLayer(net, nonlinearity=nonlinearity(cfg.NONLINEARITY)) # Pooling net = l.GlobalPoolLayer(net) log.i(("\tFINAL POOLING SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)) - 1)) # Classification Layer net = l.DenseLayer(net, len(cfg.CLASSES), nonlinearity=nonlinearity('identity'), W=initialization('identity')) net = l.NonlinearityLayer(net, nonlinearity=nonlinearity('softmax')) log.i(("\tFINAL NET OUT SHAPE:", l.get_output_shape(net), "LAYER:", len(l.get_all_layers(net)))) log.i("...DONE!") # Model stats log.i(("MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS")) log.i(("MODEL HAS", l.count_params(net), "PARAMS")) return net
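The resblock() helper used above is defined elsewhere; as a point of reference, here is a simplified, hypothetical pre-activation residual block in the same spirit (no grouped convolutions and no preactivated switch), for illustration only.

# Sketch: pre-activation residual block with a 1x1 projection shortcut.
import lasagne.layers as l
from lasagne.nonlinearities import rectify
from lasagne.init import HeNormal

def simple_resblock(net, filters, kernel_size=3, stride=1):
    # pre-activation: BN -> ReLU before each convolution
    pre1 = l.NonlinearityLayer(l.BatchNormLayer(net), rectify)
    conv1 = l.Conv2DLayer(pre1, filters, kernel_size, stride=stride,
                          pad='same', W=HeNormal(gain='relu'),
                          nonlinearity=None)
    pre2 = l.NonlinearityLayer(l.BatchNormLayer(conv1), rectify)
    conv2 = l.Conv2DLayer(pre2, filters, kernel_size, pad='same',
                          W=HeNormal(gain='relu'), nonlinearity=None)
    # 1x1 projection shortcut so shapes match when stride or width changes
    shortcut = l.Conv2DLayer(net, filters, 1, stride=stride,
                             b=None, nonlinearity=None)
    return l.ElemwiseSumLayer([conv2, shortcut])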
def build_vgg16(input_var=None, preload_vgg=False):
    # VGG-16, 16-layer model from the paper:
    # "Very Deep Convolutional Networks for Large-Scale Image Recognition"
    net = {}
    net['input'] = InputLayer((None, 3, 224, 224), input_var=input_var)
    net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1, flip_filters=False)
    net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1, flip_filters=False)
    net['pool1'] = PoolLayer(net['conv1_2'], 2)
    net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1, flip_filters=False)
    net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1, flip_filters=False)
    net['pool2'] = PoolLayer(net['conv2_2'], 2)
    net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1, flip_filters=False)
    net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1, flip_filters=False)
    net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1, flip_filters=False)
    net['pool3'] = PoolLayer(net['conv3_3'], 2)
    net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1, flip_filters=False)
    net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1, flip_filters=False)
    net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1, flip_filters=False)
    net['pool4'] = PoolLayer(net['conv4_3'], 2)
    net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1, flip_filters=False)
    net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1, flip_filters=False)
    net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1, flip_filters=False)
    net['pool5'] = PoolLayer(net['conv5_3'], 2)
    net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
    net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
    net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
    net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
    net['fc8'] = DenseLayer(net['fc7_dropout'], num_units=101, nonlinearity=None)
    net['prob'] = NonlinearityLayer(net['fc8'], softmax)

    if preload_vgg is True:
        # Preload pretrained VGG-16 weights for all layers up to fc7
        # (the last two entries in the pickle are the original 1000-way fc8).
        with open('vgg16.pkl', 'rb') as f:
            params = pickle.load(f)
        set_all_param_values(net['fc7_dropout'], params['param values'][:-2])

    print("VGG-16 has {} parameters".format(count_params(net['prob'])))

    return net
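A quick usage sketch for build_vgg16: compile a deterministic forward pass on the 'prob' output and run a dummy batch (preload_vgg=False, so no vgg16.pkl file is required). The batch size and random input are illustrative only.

# Sketch: run a dummy batch through the VGG-16 softmax output.
import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import get_output

x = T.tensor4('x')
net = build_vgg16(input_var=x, preload_vgg=False)
probs = get_output(net['prob'], deterministic=True)
predict_fn = theano.function([x], probs, allow_input_downcast=True)

dummy = np.random.rand(2, 3, 224, 224).astype('float32')
print(predict_fn(dummy).shape)   # (2, 101)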
def build_baseline_model(): log.i('BUILDING BASELINE MODEL...') # Random Seed lasagne_random.set_rng(cfg.getRandomState()) # Input layer for images net = l.InputLayer((None, cfg.IM_DIM, cfg.IM_SIZE[1], cfg.IM_SIZE[0])) # Stride size (as an alternative to max pooling) if cfg.MAX_POOLING: s = 1 else: s = 2 # Convolutinal layer groups for i in range(len(cfg.FILTERS)): # 3x3 Convolution + Stride net = batch_norm( l.Conv2DLayer(net, num_filters=cfg.FILTERS[i], filter_size=cfg.KERNEL_SIZES[i], num_groups=cfg.NUM_OF_GROUPS[i], pad='same', stride=s, W=initialization(cfg.NONLINEARITY), nonlinearity=nonlinearity(cfg.NONLINEARITY))) # Pooling layer if cfg.MAX_POOLING: net = l.MaxPool2DLayer(net, pool_size=2) # Dropout Layer (we support different types of dropout) if cfg.DROPOUT_TYPE == 'channels' and cfg.DROPOUT > 0.0: net = l.dropout_channels(net, p=cfg.DROPOUT) elif cfg.DROPOUT_TYPE == 'location' and cfg.DROPOUT > 0.0: net = l.dropout_location(net, p=cfg.DROPOUT) elif cfg.DROPOUT > 0.0: net = l.DropoutLayer(net, p=cfg.DROPOUT) log.i(('\tGROUP', i + 1, 'OUT SHAPE:', l.get_output_shape(net))) # Final 1x1 Convolution net = batch_norm( l.Conv2DLayer(net, num_filters=cfg.FILTERS[i] * 2, filter_size=1, W=initialization('identity'), nonlinearity=nonlinearity('identity'))) log.i(('\tFINAL CONV OUT SHAPE:', l.get_output_shape(net))) # Global Pooling layer (default mode = average) net = l.GlobalPoolLayer(net) log.i(("\tFINAL POOLING SHAPE:", l.get_output_shape(net))) # Classification Layer (Softmax) net = l.DenseLayer(net, len(cfg.CLASSES), nonlinearity=nonlinearity('softmax'), W=initialization('softmax')) log.i(("\tFINAL NET OUT SHAPE:", l.get_output_shape(net))) log.i("...DONE!") # Model stats log.i(("MODEL HAS", (sum(hasattr(layer, 'W') for layer in l.get_all_layers(net))), "WEIGHTED LAYERS")) log.i(("MODEL HAS", l.count_params(net), "PARAMS")) return net
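The baseline model switches between three dropout variants. A sketch of the same choices expressed through DropoutLayer's shared_axes argument (available in recent Lasagne versions); the helper name add_dropout is hypothetical.

# Sketch: 'channels' drops whole feature maps, 'location' drops whole spatial
# positions, and the default drops individual activations.
import lasagne.layers as l

def add_dropout(net, p, dropout_type='elementwise'):
    if p <= 0.0:
        return net
    if dropout_type == 'channels':
        # share the mask over the spatial axes -> entire channels are dropped
        return l.DropoutLayer(net, p=p, shared_axes=(2, 3))
    if dropout_type == 'location':
        # share the mask over the channel axis -> entire locations are dropped
        return l.DropoutLayer(net, p=p, shared_axes=(1,))
    return l.DropoutLayer(net, p=p)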
def build_WideResNet(input_var, n=3, k=2): ''' Adapted from https://github.com/Lasagne/Recipes/tree/master/papers/deep_residual_learning. Tweaked to be consistent with 'Identity Mappings in Deep Residual Networks', Kaiming He et al. 2016 (https://arxiv.org/abs/1603.05027) And 'Wide Residual Networks', Sergey Zagoruyko, Nikos Komodakis 2016 (http://arxiv.org/pdf/1605.07146v1.pdf) ''' n_filters = {0: 16, 1: 16 * k, 2: 32 * k, 3: 64 * k} # create a residual learning building block with two stacked 3x3 convlayers and dropout def residual_block(l, increase_dim=False, first=False, filters=16): if increase_dim: first_stride = (2, 2) else: first_stride = (1, 1) if first: # hacky solution to keep layers correct bn_pre_relu = l else: # contains the BN -> ReLU portion, steps 1 to 2 bn_pre_conv = BatchNormLayer(l) bn_pre_relu = NonlinearityLayer(bn_pre_conv, rectify) # contains the weight -> BN -> ReLU portion, steps 3 to 5 conv_1 = batch_norm( ConvLayer(bn_pre_relu, num_filters=filters, filter_size=(3, 3), stride=first_stride, nonlinearity=rectify, pad='same', W=HeNormal(gain='relu'))) dropout = DropoutLayer(conv_1, p=0.3) # contains the last weight portion, step 6 conv_2 = ConvLayer(dropout, num_filters=filters, filter_size=(3, 3), stride=(1, 1), nonlinearity=None, pad='same', W=HeNormal(gain='relu')) # add shortcut connections if increase_dim: # projection shortcut, as option B in paper projection = ConvLayer(l, num_filters=filters, filter_size=(1, 1), stride=(2, 2), nonlinearity=None, pad='same', b=None) block = ElemwiseSumLayer([conv_2, projection]) elif first: # projection shortcut, as option B in paper projection = ConvLayer(l, num_filters=filters, filter_size=(1, 1), stride=(1, 1), nonlinearity=None, pad='same', b=None) block = ElemwiseSumLayer([conv_2, projection]) else: block = ElemwiseSumLayer([conv_2, l]) return block # Building the network l_in = InputLayer(shape=(None, 3, 64, 64), input_var=input_var) # first layer= l = batch_norm( ConvLayer(l_in, num_filters=n_filters[0], filter_size=(3, 3), stride=(1, 1), nonlinearity=rectify, pad='same', W=HeNormal(gain='relu'))) # first stack of residual blocks l = residual_block(l, first=True, filters=n_filters[1]) for _ in range(1, n): l = residual_block(l, filters=n_filters[1]) # second stack of residual blocks l = residual_block(l, increase_dim=True, filters=n_filters[2]) for _ in range(1, n): l = residual_block(l, filters=n_filters[2]) # third stack of residual blocks l = residual_block(l, increase_dim=True, filters=n_filters[3]) for _ in range(1, n): l = residual_block(l, filters=n_filters[3]) bn_post_conv = BatchNormLayer(l) bn_post_relu = NonlinearityLayer(bn_post_conv, rectify) # average pooling avg_pool = GlobalPoolLayer(bn_post_relu) # fully connected layer network = DenseLayer(avg_pool, num_units=101, W=HeNormal(), nonlinearity=softmax) print("WideResNet has {} parameters".format(count_params(network))) return network
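A short usage sketch for build_WideResNet: the n and k values are illustrative, and count_params restricts the count to trainable parameters when called with trainable=True.

# Sketch: build the wide residual network for 64x64 RGB inputs and report
# its trainable parameter count.
import theano.tensor as T
from lasagne.layers import count_params

input_var = T.tensor4('inputs')
network = build_WideResNet(input_var, n=3, k=2)
print("trainable parameters: {}".format(count_params(network, trainable=True)))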
input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network_output = deep_projection_ln_lstm_model_fix( input_var=input_data, mask_var=input_mask, num_inputs=input_dim, num_outputs=output_dim, num_layers=args.num_layers, num_units=args.num_units, grad_clipping=args.grad_clipping, dropout=args.dropout) network = network_output network_params = get_all_params(network, trainable=True) param_count = count_params(network, trainable=True) print('Number of parameters of the network: {:.2f}M'.format( float(param_count) / 1000000)) ###################### # reload model param # ###################### if args.reload_model: print('Loading model: {}'.format(args.reload_model)) with open(args.reload_model, 'rb') as f: [ pretrain_network_params_val, pretrain_update_params_val, pretrain_total_batch_cnt ] = pickle.load(f) set_model_param_value(network_params, pretrain_network_params_val) else:
logging=logging) all_g_layers = ll.get_all_layers(gan_model.g_layers.values()) all_d_layers = ll.get_all_layers(gan_model.d_layers.values()) glayer2name = defaultdict(str) dlayer2name = defaultdict(str) logging.info('~~~~~~~~~~~~~ G model ~~~~~~~~~~~~~~~~~~~~~~') glayer2name.update({v: k for k, v in gan_model.g_layers.iteritems()}) logging.info(get_network_str(all_g_layers, get_network=False, incomings=True, outgoings=True, layer2name=glayer2name)) dlayer2name.update(({v: k for k, v in gan_model.d_layers.iteritems()})) logging.info('G total trainable params: %g' % (ll.count_params(gan_model.l_out_g, trainable=True))) logging.info('~~~~~~~~~~~~~ D model ~~~~~~~~~~~~~~~~~~~~~~') logging.info(get_network_str(all_d_layers, get_network=False, incomings=True, outgoings=True, layer2name=dlayer2name)) logging.info('D total trainable params: %g' % (ll.count_params(gan_model.l_out_d, trainable=True))) ############################################################## gan_model.build_funcs() data_itr = batch(all_data, batch_size) tic = None hist = defaultdict(list)
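The logging above reports generator and discriminator parameter counts separately. A toy sketch of the same count_params calls; the two one-layer stacks are illustrative stand-ins for gan_model.l_out_g and gan_model.l_out_d.

# Sketch: count trainable parameters of two layer stacks independently.
import lasagne.layers as ll

l_out_g = ll.DenseLayer(ll.InputLayer((None, 100)), 784)   # toy "generator"
l_out_d = ll.DenseLayer(ll.InputLayer((None, 784)), 1)     # toy "discriminator"

print('G total trainable params: %g' % ll.count_params(l_out_g, trainable=True))
print('D total trainable params: %g' % ll.count_params(l_out_d, trainable=True))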