################## for training record ######################
stdout_tap = Tap(sys.stdout)
stderr_tap = Tap(sys.stderr)
sys.stdout = stdout_tap
sys.stderr = stderr_tap

result_subdir = create_result_subdir(
    os.path.join(args.base_logdir, args.method, 'TRAIN'), 'exp')
print("Saving logs to {}".format(result_subdir))

# Start dumping stdout and stderr into result directory.
stdout_tap.set_file(open(os.path.join(result_subdir, 'stdout.txt'), 'wt'))
stderr_tap.set_file(open(os.path.join(result_subdir, 'stderr.txt'), 'wt'))

# Saving source files.
export_sources(os.path.join(result_subdir, 'src'))

# Saving model parameters.
model_save_path = os.path.join(result_subdir, 'saved_model')
if not os.path.exists(model_save_path):
    os.mkdir(model_save_path)

# Saving data features.
feat_save_path = os.path.join(args.base_logdir, args.method, 'saved_feat')
if not os.path.exists(feat_save_path):
    os.mkdir(feat_save_path)

for arg in vars(args):
    print('{} -- {}'.format(arg, getattr(args, arg)))

cudnn.deterministic = True
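# `Tap` (used above, defined elsewhere in this repo) tees a stream to an
# optional log file. A minimal sketch of the assumed interface, kept in
# comments for reference only; the real implementation may differ:
#
#   class Tap:
#       def __init__(self, stream):
#           self.stream = stream
#           self.file = None
#       def set_file(self, f):
#           self.file = f
#       def write(self, data):
#           self.stream.write(data)
#           if self.file is not None:
#               self.file.write(data)
#               self.file.flush()
#       def flush(self):
#           self.stream.flush()
#           if self.file is not None:
#               self.file.flush()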
def run_training(monitor_filename=None):
    # Sanity check network type.
    if config.network_type not in ['pi', 'tempens']:
        print("Unknown network type '%s'." % config.network_type)
        exit()

    # Create the result directory and basic run data.
    result_subdir = report.create_result_subdir(config.result_dir, config.run_desc)
    print("Saving results to", result_subdir)

    # Start dumping stdout and stderr into result directory.
    stdout_tap.set_file(open(os.path.join(result_subdir, 'stdout.txt'), 'wt'))
    stderr_tap.set_file(open(os.path.join(result_subdir, 'stderr.txt'), 'wt'))

    # Set window title if on Windows.
    try:
        import ctypes
        ctypes.windll.kernel32.SetConsoleTitleA(
            '%s - Gpu %d' % (os.path.split(result_subdir)[1], config.cuda_device_number))
    except Exception:
        pass

    # Export run information.
    report.export_sources(os.path.join(result_subdir, 'src'))
    report.export_run_details(os.path.join(result_subdir, 'run.txt'))
    report.export_config(os.path.join(result_subdir, 'config.txt'))

    # Load the dataset.
    print("Loading dataset '%s'..." % config.dataset)
    if config.dataset == 'cifar-10':
        X_train, y_train, X_test, y_test = load_cifar_10()
    elif config.dataset == 'cifar-100':
        X_train, y_train, X_test, y_test = load_cifar_100()
    elif config.dataset == 'svhn':
        X_train, y_train, X_test, y_test = load_svhn()
    elif config.dataset == 'imu':
        X_train, y_train, X_test, y_test = load_imu()
    else:
        print("Unknown dataset '%s'." % config.dataset)
        exit()

    # Calculate number of classes and check that all labels are in
    # range [0, num_classes-1].
    num_classes = len(set(y_train))
    assert set(y_train) == set(y_test) == set(range(num_classes))
    print("Found %d classes in training set, %d in test set." %
          (len(set(y_train)), len(set(y_test))))

    # Prepare dataset and print stats.
    X_train, y_train, mask_train, X_test, y_test = prepare_dataset(
        result_subdir, X_train, y_train, X_test, y_test, num_classes)
    print("Got %d training inputs, out of which %d are labeled." %
          (len(X_train), sum(mask_train)))
    print("Got %d test inputs." % len(X_test))

    #----------------------------------------------------------------------------
    # Prepare to train.
    #----------------------------------------------------------------------------

    print("Network type is '%s'." % config.network_type)

    # Prepare Theano variables for inputs and targets.
    input_var = T.tensor3('inputs')
    # input_var = T.matrix('inputs')
    label_var = T.ivector('labels')
    learning_rate_var = T.scalar('learning_rate')
    adam_beta1_var = T.scalar('adam_beta1')
    input_vars = [input_var]

    # Scale the maximum unsupervised weight by the labeled fraction.
    scaled_unsup_weight_max = config.unsup_weight_max
    if config.num_labels != 'all':
        scaled_unsup_weight_max *= 1.0 * config.num_labels / X_train.shape[0]

    if config.network_type == 'pi':
        # Second augmented branch; same rank as input_var. (The original
        # T.tensor4 here was an image-era leftover and would not match the
        # tensor3 inputs fed to this network.)
        input_b_var = T.tensor3('inputs_b')
        mask_var = T.vector('mask')
        unsup_weight_var = T.scalar('unsup_weight')
        input_vars.append(input_b_var)
    elif config.network_type == 'tempens':
        mask_var = T.vector('mask')
        target_var = T.matrix('targets')
        unsup_weight_var = T.scalar('unsup_weight')
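    # `load_network`/`save_network` (used below) are assumed to be pickle
    # round-trip helpers. A hypothetical sketch in comments; the repo's real
    # helpers may instead pickle parameter values rather than the whole graph:
    #
    #   import pickle
    #   def save_network(net, filename):
    #       with open(filename, 'wb') as f:
    #           pickle.dump(net, f, protocol=pickle.HIGHEST_PROTOCOL)
    #   def load_network(filename):
    #       with open(filename, 'rb') as f:
    #           net = pickle.load(f)
    #       input_var = lasagne.layers.get_all_layers(net)[0].input_var
    #       return net, input_var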
    # Load/create the network.
    if config.load_network_filename is not None:
        net, input_var = load_network(config.load_network_filename)
        input_vars = [input_var]
        if config.network_type == 'pi':
            input_vars.append(input_b_var)
    else:
        print("Building network and compiling functions...")
        # X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[2], X_train.shape[1]))
        # X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[2], X_test.shape[1]))
        print('X_train shape: ', X_train.shape)
        net = build_network(input_var, X_train.shape[1], X_train.shape[2], num_classes)
        # net, l1_penalty, l2_penalty = build_network(input_var, X_train.shape[1], X_train.shape[2], num_classes)

    # Export topology report.
    with open(os.path.join(result_subdir, 'network-topology.txt'), 'wt') as fout:
        for line in report.generate_network_topology_info(net):
            print(line)
            fout.write(line + '\n')

    # Initialization updates and function.
    lasagne.layers.get_output(net, init=True)
    init_updates = [u for l in lasagne.layers.get_all_layers(net)
                    for u in getattr(l, 'init_updates', [])]
    init_fn = theano.function(input_vars, [], updates=init_updates,
                              on_unused_input='ignore')

    # Get training predictions, BN updates.
    train_prediction = lasagne.layers.get_output(net)
    if config.network_type == 'pi':
        train_prediction_b = lasagne.layers.get_output(net, inputs=input_b_var)  # Second branch.
    bn_updates = [u for l in lasagne.layers.get_all_layers(net)
                  for u in getattr(l, 'bn_updates', [])]

    train_acc = T.mean(T.eq(T.argmax(train_prediction, axis=1), label_var),
                       dtype=theano.config.floatX,
                       acc_dtype=theano.config.floatX)

    # Training loss.
    if config.network_type == 'pi':
        train_loss = T.mean(lasagne.objectives.categorical_crossentropy(
            train_prediction, label_var) * mask_var,
            dtype=theano.config.floatX, acc_dtype=theano.config.floatX)
        train_loss += unsup_weight_var * T.mean(
            lasagne.objectives.squared_error(train_prediction, train_prediction_b),
            dtype=theano.config.floatX, acc_dtype=theano.config.floatX)
    elif config.network_type == 'tempens':
        train_loss = T.mean(lasagne.objectives.categorical_crossentropy(
            train_prediction, label_var) * mask_var,
            dtype=theano.config.floatX, acc_dtype=theano.config.floatX)
        train_loss += unsup_weight_var * T.mean(
            lasagne.objectives.squared_error(train_prediction, target_var),
            dtype=theano.config.floatX, acc_dtype=theano.config.floatX)
    # train_loss = train_loss + l1_penalty + l2_penalty  # Regularization.

    # ADAM update expressions for training.
    params = lasagne.layers.get_all_params(net, trainable=True)
    updates = robust_adam(train_loss, params,
                          learning_rate=learning_rate_var,
                          beta1=adam_beta1_var,
                          beta2=config.adam_beta2,
                          epsilon=config.adam_epsilon).items()
    # updates = lasagne.updates.sgd(train_loss, params, learning_rate=learning_rate_var)

    # Training function.
    if config.network_type == 'pi':
        train_fn = theano_utils.function(
            [input_var, input_b_var, label_var, mask_var,
             learning_rate_var, adam_beta1_var, unsup_weight_var],
            [train_loss],
            updates=updates + bn_updates,
            on_unused_input='warn')
    elif config.network_type == 'tempens':
        train_fn = theano_utils.function(
            [input_var, label_var, mask_var, target_var,
             learning_rate_var, adam_beta1_var, unsup_weight_var],
            [train_loss, train_prediction, train_acc],
            updates=updates + bn_updates,
            on_unused_input='warn')
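    # For reference, the objective compiled above is (a restatement, not new
    # behavior): with p_i the softmax prediction, m_i the label mask, and
    # w(t) = unsup_weight,
    #
    #   L = mean_i[ m_i * CE(p_i, y_i) ] + w(t) * mean_{i,c}[ (p_{i,c} - q_{i,c})^2 ]
    #
    # where q is the second augmented branch ('pi') or the ensemble target
    # ('tempens'). Both terms average over the whole minibatch, so the
    # cross-entropy term is implicitly down-weighted by the labeled fraction.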
    # Validation prediction, loss, and accuracy.
    test_prediction = lasagne.layers.get_output(net, deterministic=True)
    test_loss = T.mean(lasagne.objectives.categorical_crossentropy(
        test_prediction, label_var),
        dtype=theano.config.floatX, acc_dtype=theano.config.floatX)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), label_var),
                      dtype=theano.config.floatX,
                      acc_dtype=theano.config.floatX)

    # Validation function.
    val_fn = theano_utils.function([input_var, label_var],
                                   [test_loss, test_acc],
                                   on_unused_input='warn')

    #----------------------------------------------------------------------------
    # Start training.
    #----------------------------------------------------------------------------

    print("Starting training.")

    if config.max_unlabeled_per_epoch is not None:
        print("Limiting number of unlabeled inputs per epoch to %d." %
              config.max_unlabeled_per_epoch)

    training_csv = report.GenericCSV(
        os.path.join(result_subdir, 'training.csv'),
        'Epoch', 'EpochTime', 'TrainLoss', 'TestLoss',
        'TrainAccuracy', 'TestAccuracy', 'LearningRate')

    # Initial training variables for temporal ensembling.
    if config.network_type == 'tempens':
        ensemble_prediction = np.zeros((len(X_train), num_classes))
        training_targets = np.zeros((len(X_train), num_classes))

    #----------------------------------------------------------------------------
    # Training loop.
    #----------------------------------------------------------------------------

    for epoch in range(config.start_epoch, config.num_epochs):

        # Export network snapshot every 50 epochs.
        if (epoch % 50) == 0 and epoch != config.start_epoch:
            save_network(net, os.path.join(result_subdir,
                                           'network-snapshot-%03d.pkl' % epoch))

        # Evaluate up/down ramps.
        rampup_value = rampup(epoch)
        rampdown_value = rampdown(epoch)

        # Initialize WN/MOBN layers with a properly augmented minibatch.
        if epoch == 0:
            if config.network_type == 'pi':
                minibatches = iterate_minibatches_augment_pi(
                    X_train, np.zeros((len(X_train),)),
                    np.zeros((len(X_train),)), config.minibatch_size)
                for (n, indices, inputs_a, inputs_b, labels, mask) in minibatches:
                    init_fn(inputs_a, inputs_b)
                    break
            elif config.network_type == 'tempens':
                minibatches = iterate_minibatches_augment_tempens(
                    X_train, np.zeros((len(X_train),)),
                    np.zeros((len(X_train),)), np.zeros((len(X_train),)),
                    config.minibatch_size)
                # minibatches = iterate_minibatches(X_train, y_train, config.minibatch_size)
                for (n, indices, inputs, labels, mask, targets) in minibatches:
                    init_fn(inputs)
                    break

        # Initialize epoch predictions for temporal ensembling.
        if config.network_type == 'tempens':
            epoch_predictions = np.zeros((len(X_train), num_classes))
            epoch_execmask = np.zeros(len(X_train))  # Which inputs were executed.
            training_targets = floatX(training_targets)

        # Training pass.
        start_time = time.time()
        train_err, train_n = 0., 0.
        train_acc = 0.
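        # `rampup`/`rampdown` (evaluated above) are assumed to follow the
        # Gaussian ramps of Laine & Aila (2017); a sketch in comments under
        # that assumption, with `config.rampup_length` and
        # `config.rampdown_length` as hypothetical knobs:
        #
        #   def rampup(epoch):
        #       if epoch < config.rampup_length:
        #           p = 1.0 - float(epoch) / config.rampup_length
        #           return math.exp(-p * p * 5.0)
        #       return 1.0
        #
        #   def rampdown(epoch):
        #       if epoch >= config.num_epochs - config.rampdown_length:
        #           ep = (epoch - (config.num_epochs - config.rampdown_length)) * 0.5
        #           return math.exp(-(ep * ep) / config.rampdown_length)
        #       return 1.0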
        # Evaluate schedules for this epoch.
        learning_rate = rampup_value * rampdown_value * config.learning_rate_max
        adam_beta1 = rampdown_value * config.adam_beta1 + \
                     (1.0 - rampdown_value) * config.rampdown_beta1_target
        unsup_weight = rampup_value * scaled_unsup_weight_max
        if epoch == config.start_epoch:
            unsup_weight = 0.0

        with thread_utils.ThreadPool(8) as thread_pool:
            if config.network_type == 'pi':
                minibatches = iterate_minibatches_augment_pi(
                    X_train, y_train, mask_train, config.minibatch_size)
                minibatches = thread_utils.run_iterator_concurrently(
                    minibatches, thread_pool)
                for (n, indices, inputs_a, inputs_b, labels, mask) in minibatches:
                    (e_train,) = train_fn(inputs_a, inputs_b, labels, mask,
                                          floatX(learning_rate),
                                          floatX(adam_beta1),
                                          floatX(unsup_weight))
                    train_err += e_train * n
                    train_n += n
            elif config.network_type == 'tempens':
                minibatches = iterate_minibatches_augment_tempens(
                    X_train, y_train, mask_train, training_targets,
                    config.minibatch_size)
                minibatches = thread_utils.run_iterator_concurrently(
                    minibatches, thread_pool)
                for (n, indices, inputs, labels, mask, targets) in minibatches:
                    (e_train, prediction, acc) = train_fn(
                        inputs, labels, mask, targets,
                        floatX(learning_rate), floatX(adam_beta1),
                        floatX(unsup_weight))
                    for i, j in enumerate(indices):
                        epoch_predictions[j] = prediction[i]  # Gather epoch predictions.
                        epoch_execmask[j] = 1.0
                    train_err += e_train * n
                    train_n += n
                    train_acc += acc * n

        # Test pass.
        val_err, val_acc, val_n = 0., 0., 0.
        with thread_utils.ThreadPool(8) as thread_pool:
            minibatches = iterate_minibatches(X_test, y_test, config.minibatch_size)
            minibatches = thread_utils.run_iterator_concurrently(
                minibatches, thread_pool)
            for (n, inputs, labels) in minibatches:
                err, acc = val_fn(inputs, labels)
                val_err += err * n
                val_acc += acc * n
                val_n += n

        if config.network_type == 'tempens':
            if config.max_unlabeled_per_epoch is None:
                # Basic mode: exponential moving average of predictions,
                # divided by (1 - decay^t) to correct its startup bias.
                ensemble_prediction = (config.prediction_decay * ensemble_prediction
                                       + (1.0 - config.prediction_decay) * epoch_predictions)
                training_targets = ensemble_prediction / (
                    1.0 - config.prediction_decay ** ((epoch - config.start_epoch) + 1.0))
            else:
                # Sparse updates: only inputs executed this epoch enter the
                # moving average; everything else keeps its previous value.
                epoch_execmask = epoch_execmask.reshape(-1, 1)
                ensemble_prediction = epoch_execmask * (
                    config.prediction_decay * ensemble_prediction
                    + (1.0 - config.prediction_decay) * epoch_predictions) + (
                    1.0 - epoch_execmask) * ensemble_prediction
                training_targets = ensemble_prediction / (
                    np.sum(ensemble_prediction, axis=1, keepdims=True) + 1e-8)  # Normalize.

        # Export stats.
        training_csv.add_data(epoch,
                              time.time() - start_time,
                              train_err / train_n,
                              val_err / val_n,
                              train_acc / train_n * 100.0,
                              val_acc / val_n * 100.0,
                              learning_rate)

        # Export progress monitor data.
        if monitor_filename is not None:
            with open(monitor_filename, 'wt') as f:
                json.dump({"loss": 1.0 - val_acc / val_n,
                           "cur_epoch": (epoch + 1),
                           "max_epoch": config.num_epochs}, f)

        # Print stats.
        print("Epoch %3d of %3d took %6.3fs   Loss %.7f, %.7f   Acc=%5.2f, %5.2f   LR=%.7f" %
              (epoch, config.num_epochs,
               time.time() - start_time,
               train_err / train_n,
               val_err / val_n,
               train_acc / train_n * 100.0,
               val_acc / val_n * 100.0,
               learning_rate))

    #----------------------------------------------------------------------------
    # Save and exit.
    #----------------------------------------------------------------------------

    training_csv.close()
    print("Saving the final network.")
    np.savez(os.path.join(result_subdir, 'network-final.npz'),
             *lasagne.layers.get_all_param_values(net))
    save_network(net, os.path.join(result_subdir, 'network-final.pkl'))
    print("Done.")
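# Hypothetical entry point, added as an assumption for convenience; the
# original driver may invoke run_training() differently, e.g. with a
# progress-monitor file path.
if __name__ == '__main__':
    run_training()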