import os
import numpy as np
from cntk import Trainer, cross_entropy_with_softmax, classification_error, input_variable
from cntk.learners import sgd, learning_rate_schedule, UnitType
from cntk.logging import ProgressPrinter


def entrenar(checkpoint, entrRuedas, entrOperaciones, input_dim, num_output_classes, testRuedas, testOperaciones):
    minibatch_size = 100
    epocs = 900
    minibatchIteraciones = int(len(entrOperaciones) / minibatch_size)

    # Input variables denoting the features and label data
    feature = input_variable((input_dim), np.float32)
    label = input_variable((num_output_classes), np.float32)

    netout = crearRed(input_dim, num_output_classes, feature)

    ce = cross_entropy_with_softmax(netout, label)
    pe = classification_error(netout, label)

    lr_per_minibatch = learning_rate_schedule(0.25, UnitType.minibatch)

    # Instantiate the trainer object to drive the model training
    learner = sgd(netout.parameters, lr=lr_per_minibatch)
    progress_printer = ProgressPrinter(log_to_file=checkpoint + ".log", num_epochs=epocs)
    trainer = Trainer(netout, (ce, pe), learner, progress_printer)

    if os.path.isfile(checkpoint):
        trainer.restore_from_checkpoint(checkpoint)

    npentrRuedas = np.array(entrRuedas).astype(np.float32)
    npentrOperaciones = np.array(entrOperaciones).astype(np.float32)

    # iterate once per epoch
    for i in range(0, epocs):
        # shuffle features and labels with the same permutation
        p = np.random.permutation(len(entrRuedas))
        npentrOperaciones = npentrOperaciones[p]
        npentrRuedas = npentrRuedas[p]

        # now split the data into minibatches and train
        for j in range(0, minibatchIteraciones):
            features = npentrRuedas[j * minibatch_size:(j + 1) * minibatch_size]
            labels = npentrOperaciones[j * minibatch_size:(j + 1) * minibatch_size]
            trainer.train_minibatch({feature: features, label: labels})

        trainer.summarize_training_progress()
        trainer.save_checkpoint(checkpoint)

    # evaluate the trained model on the held-out test set
    minibatchIteraciones = int(len(testOperaciones) / minibatch_size)
    avg_error = 0
    for j in range(0, minibatchIteraciones):
        test_features = np.array(testRuedas[j * minibatch_size:(j + 1) * minibatch_size]).astype(np.float32)
        test_labels = np.array(testOperaciones[j * minibatch_size:(j + 1) * minibatch_size]).astype(np.float32)
        avg_error = avg_error + (trainer.test_minibatch({feature: test_features, label: test_labels}) / minibatchIteraciones)

    return avg_error
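# A minimal sketch of how entrenar() could be called, assuming crearRed()
# builds a dense classifier over input_dim features. The synthetic data,
# shapes, and the "modelo.dnn" checkpoint name are illustrative, not from
# the original code; note entrenar() trains for 900 epochs, so this is a
# contract/shape illustration rather than a quick smoke test.
def _demo_entrenar():
    input_dim, num_classes, n = 10, 3, 1000
    X = np.random.rand(n, input_dim).astype(np.float32)
    y = np.eye(num_classes, dtype=np.float32)[np.random.randint(0, num_classes, n)]
    # reuse the first 200 rows as a stand-in test split
    err = entrenar("modelo.dnn", X, y, input_dim, num_classes, X[:200], y[:200])
    print("average test error: {:.4f}".format(err))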
def main(base_folder, training_mode='majority', model_name='VGG13', max_epochs=100):
    # create the needed folders.
    output_model_path = os.path.join(base_folder, R'models')
    output_model_folder = os.path.join(output_model_path, model_name + '_' + training_mode)
    if not os.path.exists(output_model_folder):
        os.makedirs(output_model_folder)

    # create the logging file
    logging.basicConfig(filename=os.path.join(output_model_folder, "train.log"), filemode='w', level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())

    logging.info("Starting with training mode {} using {} model and max epochs {}.".format(training_mode, model_name, max_epochs))

    # create the model
    num_classes = len(emotion_table)
    model = build_model(num_classes, model_name)

    # set the input variables.
    input_var = ct.input_variable((1, model.input_height, model.input_width), np.float32)
    label_var = ct.input_variable((num_classes), np.float32)

    # read the FER+ dataset.
    logging.info("Loading data...")
    train_params = FERPlusParameters(num_classes, model.input_height, model.input_width, training_mode, False)
    test_and_val_params = FERPlusParameters(num_classes, model.input_height, model.input_width, "majority", True)

    train_data_reader = FERPlusReader.create(base_folder, train_folders, "label.csv", train_params)
    val_data_reader = FERPlusReader.create(base_folder, valid_folders, "label.csv", test_and_val_params)
    test_data_reader = FERPlusReader.create(base_folder, test_folders, "label.csv", test_and_val_params)

    # print a summary of the data.
    display_summary(train_data_reader, val_data_reader, test_data_reader)

    # get the probabilistic output of the model.
    z = model.model(input_var)
    pred = ct.softmax(z)

    epoch_size = train_data_reader.size()
    minibatch_size = 32

    # training config
    lr_per_minibatch = [model.learning_rate] * 20 + [model.learning_rate / 2.0] * 20 + [model.learning_rate / 10.0]
    mm_time_constant = -minibatch_size / np.log(0.9)
    lr_schedule = learning_rate_schedule(lr_per_minibatch, unit=UnitType.minibatch, epoch_size=epoch_size)
    mm_schedule = momentum_as_time_constant_schedule(mm_time_constant)

    # loss and error cost
    train_loss = cost_func(training_mode, pred, label_var)
    pe = classification_error(z, label_var)

    # construct the trainer
    learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule)
    trainer = Trainer(z, (train_loss, pe), learner)

    # get minibatches of images to train with and perform model training
    max_val_accuracy = 0.0
    final_test_accuracy = 0.0
    best_test_accuracy = 0.0

    logging.info("Start training...")
    epoch = 0
    best_epoch = 0
    while epoch < max_epochs:
        train_data_reader.reset()
        val_data_reader.reset()
        test_data_reader.reset()

        # training
        start_time = time.time()
        training_loss = 0
        training_accuracy = 0
        while train_data_reader.has_more():
            images, labels, current_batch_size = train_data_reader.next_minibatch(minibatch_size)

            # specify the mapping of input variables in the model to actual minibatch data to be trained with
            trainer.train_minibatch({input_var: images, label_var: labels})

            # keep track of statistics.
            training_loss += trainer.previous_minibatch_loss_average * current_batch_size
            training_accuracy += trainer.previous_minibatch_evaluation_average * current_batch_size

        training_accuracy /= train_data_reader.size()
        training_accuracy = 1.0 - training_accuracy

        # validation
        val_accuracy = 0
        while val_data_reader.has_more():
            images, labels, current_batch_size = val_data_reader.next_minibatch(minibatch_size)
            val_accuracy += trainer.test_minibatch({input_var: images, label_var: labels}) * current_batch_size

        val_accuracy /= val_data_reader.size()
        val_accuracy = 1.0 - val_accuracy

        # if the validation accuracy improves, compute the test accuracy
        test_run = False
        if val_accuracy > max_val_accuracy:
            best_epoch = epoch
            max_val_accuracy = val_accuracy

            trainer.save_checkpoint(os.path.join(output_model_folder, "model_{}".format(best_epoch)))

            test_run = True
            test_accuracy = 0
            while test_data_reader.has_more():
                images, labels, current_batch_size = test_data_reader.next_minibatch(minibatch_size)
                test_accuracy += trainer.test_minibatch({input_var: images, label_var: labels}) * current_batch_size

            test_accuracy /= test_data_reader.size()
            test_accuracy = 1.0 - test_accuracy
            final_test_accuracy = test_accuracy
            if final_test_accuracy > best_test_accuracy:
                best_test_accuracy = final_test_accuracy

        logging.info("Epoch {}: took {:.3f}s".format(epoch, time.time() - start_time))
        logging.info("  training loss:\t{:e}".format(training_loss))
        logging.info("  training accuracy:\t\t{:.2f} %".format(training_accuracy * 100))
        logging.info("  validation accuracy:\t\t{:.2f} %".format(val_accuracy * 100))
        if test_run:
            logging.info("  test accuracy:\t\t{:.2f} %".format(test_accuracy * 100))

        epoch += 1

    logging.info("")
    logging.info("Best validation accuracy:\t\t{:.2f} %, epoch {}".format(max_val_accuracy * 100, best_epoch))
    logging.info("Test accuracy corresponding to best validation:\t\t{:.2f} %".format(final_test_accuracy * 100))
    logging.info("Best test accuracy:\t\t{:.2f} %".format(best_test_accuracy * 100))
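# Why mm_time_constant = -minibatch_size / np.log(0.9): CNTK's time-constant
# momentum schedule defines a per-sample momentum of exp(-1/tc), so raising it
# to the power of minibatch_size should recover the intended per-minibatch
# momentum of 0.9. A quick standalone check of that identity (plain numpy,
# no CNTK needed; the variable names are illustrative):
import numpy as np

minibatch_size = 32
tc = -minibatch_size / np.log(0.9)            # ~303.9 samples
per_sample_momentum = np.exp(-1.0 / tc)       # ~0.99671
print(per_sample_momentum ** minibatch_size)  # ~0.9, the per-minibatch momentum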
            plot_data['batchindex'].append(batch_index)
            plot_data['loss'].append(trainer.previous_minibatch_loss_average)
            plot_data['error'].append(trainer.previous_minibatch_evaluation_average)

            progress_printer.update_with_trainer(trainer, with_metric=True)  # log progress
            batch_index += 1
            ev_avg += trainer.previous_minibatch_evaluation_average
            i_count += 1

        # early-stopping check, currently disabled:
        # if ev_avg / i_count < 0.02:
        #     break

        progress_printer.epoch_summary(with_metric=True)
        # print(epoch, ' : ', ev_avg / i_count)
        try:
            trainer.save_checkpoint(model_temp_file)
        except Exception:
            print('Problem with saving temp model')

    #
    # Evaluation action
    #
    epoch_size = 2200  # 4000 #5000
    minibatch_size = 32

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    sample_count = 0
    minibatch_index = 0

    input_map = {
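# The plot_data dict collected in the fragment above lends itself to a quick
# loss/error curve. A minimal matplotlib sketch; the figure layout and labels
# below are illustrative choices, not part of the original script.
import matplotlib.pyplot as plt

def plot_training_curves(plot_data):
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(plot_data['batchindex'], plot_data['loss'], 'b-')
    ax1.set_ylabel('loss')
    ax2.plot(plot_data['batchindex'], plot_data['error'], 'r-')
    ax2.set_ylabel('error')
    ax2.set_xlabel('minibatch index')
    fig.suptitle('Training progress per minibatch')
    plt.show()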
def train_and_evaluate(create_train_reader, test_reader, network_name, max_epochs, create_dist_learner, scale_up=False):
    set_computation_network_trace_level(0)

    # Input variables denoting the features and label data
    input_var = input_variable((num_channels, image_height, image_width))
    label_var = input_variable((num_classes))

    # create the model and configure the learning parameters
    if network_name == 'resnet20':
        z = create_cifar10_model(input_var, 3, num_classes)
        lr_per_mb = [1.0] * 80 + [0.1] * 40 + [0.01]
    elif network_name == 'resnet110':
        z = create_cifar10_model(input_var, 18, num_classes)
        lr_per_mb = [0.1] * 1 + [1.0] * 80 + [0.1] * 40 + [0.01]
    else:
        raise RuntimeError("Unknown model name!")

    # loss and metric
    ce = cross_entropy_with_softmax(z, label_var)
    pe = classification_error(z, label_var)

    # shared training parameters
    epoch_size = 50000  # for now we manually specify the epoch size

    # NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
    # ResNet110 samples-per-second is ~7x that of a single GPU, compared to ~3x without
    # scaling up. However, a bigger minibatch size over the same number of samples means
    # fewer updates, and thus a higher training error. This is a trade-off between speed
    # and accuracy.
    minibatch_size = 128 * (distributed.Communicator.num_workers() if scale_up else 1)
    momentum_time_constant = -minibatch_size / np.log(0.9)
    l2_reg_weight = 0.0001

    # set learning parameters
    lr_per_sample = [lr / minibatch_size for lr in lr_per_mb]
    lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
    mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)

    # trainer object
    learner = create_dist_learner(
        momentum_sgd(z.parameters, lr_schedule, mm_schedule, unit_gain=True,
                     l2_regularization_weight=l2_reg_weight))
    trainer = Trainer(z, (ce, pe), learner)

    total_number_of_samples = max_epochs * epoch_size
    train_reader = create_train_reader(total_number_of_samples)

    # define the mapping from reader streams to network inputs
    input_map = {
        input_var: train_reader.streams.features,
        label_var: train_reader.streams.labels
    }

    log_number_of_parameters(z)
    print()
    progress_printer = ProgressPrinter(tag='Training')

    # perform model training
    current_epoch = 0
    updated = True
    while updated:
        data = train_reader.next_minibatch(minibatch_size, input_map=input_map)  # fetch minibatch
        updated = trainer.train_minibatch(data)                                  # update the model with it
        progress_printer.update_with_trainer(trainer, with_metric=True)          # log progress

        epoch_index = int(trainer.total_number_of_samples_seen / epoch_size)
        if current_epoch != epoch_index:  # new epoch reached
            progress_printer.epoch_summary(with_metric=True)
            current_epoch = epoch_index
            trainer.save_checkpoint(os.path.join(model_path, network_name + "_{}.dnn".format(current_epoch)))

    # evaluation parameters
    epoch_size = 10000
    minibatch_size = 16

    # process minibatches and evaluate the model
    metric_numer = 0
    metric_denom = 0
    minibatch_index = 0

    while True:
        data = test_reader.next_minibatch(minibatch_size, input_map=input_map)
        if not data:
            break
        local_mb_samples = data[label_var].num_samples
        metric_numer += trainer.test_minibatch(data) * local_mb_samples
        metric_denom += local_mb_samples
        minibatch_index += 1

    print("")
    print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(
        minibatch_index, (metric_numer * 100.0) / metric_denom, metric_denom))
    print("")

    return metric_numer / metric_denom
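# train_and_evaluate() receives the distributed learner as a factory. A minimal
# sketch of such a factory using CNTK's data-parallel SGD wrapper; the
# num_quantization_bits value is an illustrative choice (32 = no quantization,
# 1 enables 1-bit SGD), and the commented call below is hypothetical.
def create_data_parallel_learner(local_learner):
    return distributed.data_parallel_distributed_learner(
        learner=local_learner,
        num_quantization_bits=32,  # gradient quantization; 32 bits means none
        distributed_after=0)       # start distributing from the first sample

# error = train_and_evaluate(create_train_reader, test_reader, 'resnet20',
#                            max_epochs=160,
#                            create_dist_learner=create_data_parallel_learner)
# distributed.Communicator.finalize()  # required once per process when done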
# Initialize the parameters for the trainer
minibatch_size = 64
num_samples_per_sweep = 60000
num_sweeps_to_train_with = 10
num_minibatches_to_train = (num_samples_per_sweep * num_sweeps_to_train_with) / minibatch_size

# Run the trainer and perform model training
training_progress_output_freq = 500
plotdata = {"batchsize": [], "loss": [], "error": []}

for i in range(0, int(num_minibatches_to_train)):
    mb = mb_source.next_minibatch(minibatch_size)

    # Specify the mapping of input variables in the model to actual minibatch data to be trained
    arguments = {input: mb[features_si], label: mb[labels_si]}
    trainer.train_minibatch(arguments)

    batchsize, loss, error = print_training_progress(trainer, i, training_progress_output_freq, verbose=1)
    if not (loss == "NA" or error == "NA"):
        plotdata["batchsize"].append(batchsize)
        plotdata["loss"].append(loss)
        plotdata["error"].append(error)

trainer.save_checkpoint("../output/model")
save_metrics(trainer, '../output/metrics.txt')
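# print_training_progress() is referenced above but not defined in this
# snippet. A sketch consistent with how the loop consumes it: it returns
# ("NA", "NA") for loss and error except every `frequency` minibatches, and
# its first return value is the minibatch index (stored as `batchsize` by the
# caller). The body below is an assumption modeled on the CNTK tutorials, not
# the original helper.
def print_training_progress(trainer, mb, frequency, verbose=1):
    training_loss = "NA"
    eval_error = "NA"
    if mb % frequency == 0:
        training_loss = trainer.previous_minibatch_loss_average
        eval_error = trainer.previous_minibatch_evaluation_average
        if verbose:
            print("Minibatch: {0}, Loss: {1:.4f}, Error: {2:.2f}%".format(
                mb, training_loss, eval_error * 100))
    return mb, training_loss, eval_error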