def do_test(test_file, batch_size, gpu_mode, num_workers, model_path, print_details):
    """
    Load a saved model and evaluate it on a test set.

    :param test_file: A CSV file containing test image information (forwarded to test())
    :param batch_size: Batch size used during evaluation (forwarded to test())
    :param gpu_mode: If true the model is wrapped in DataParallel and moved to the GPU
    :param num_workers: Number of workers for data loading (forwarded to test())
    :param model_path: Path to a saved model; the process exits with status 1 if it is not a file
    :param print_details: Forwarded to test() as its print_details argument
    :return: None
    """
    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    if not os.path.isfile(model_path):
        # Reset the colour before exiting so the terminal is not left red.
        sys.stderr.write(TextColor.RED + "ERROR: INVALID PATH TO MODEL\n" + TextColor.END)
        # sys.exit is used instead of the exit() builtin, which is injected by the
        # site module for interactive use and is not guaranteed to exist.
        sys.exit(1)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADING\n" + TextColor.END)

    # The loader also returns the stored epoch counter, which evaluation does not need.
    transducer_model, hidden_size, gru_layers, _ = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADED\n" + TextColor.END)

    if gpu_mode:
        transducer_model = torch.nn.DataParallel(transducer_model).cuda()

    # hidden_size and gru_layers come from the checkpoint so that test() is configured
    # consistently with the loaded model.
    test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
         gru_layers, hidden_size,
         num_classes=ImageSizeOptions.TOTAL_LABELS,
         print_details=print_details)

    sys.stderr.write(TextColor.PURPLE + 'DONE\n' + TextColor.END)
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir, train_mode):
    """
    Train the transducer model, evaluating it with test() after every epoch.

    :param train_file: Training data specification, passed to SequenceDataset
    :param test_file: Test data specification, passed to test() after each epoch
    :param batch_size: Batch size for the training DataLoader and for test()
    :param epoch_limit: Number of epochs to run; when retraining in train mode it is added to
                        the epoch counter stored in the checkpoint
    :param gpu_mode: If true the model, criterion and tensors are moved to CUDA
    :param num_workers: Number of workers for data loading
    :param retrain_model: If True (identity check), model and optimizer are loaded from
                          retrain_model_path instead of being created from scratch
    :param retrain_model_path: Path to the checkpoint used when retrain_model is True
    :param gru_layers: GRU layer count for a new model (replaced by the checkpoint value on retrain)
    :param hidden_size: Hidden size for a new model (replaced by the checkpoint value on retrain)
    :param lr: Learning rate for the Adam optimizer
    :param decay: Weight decay for the Adam optimizer
    :param model_dir: Prefix of the checkpoint file names written in train mode
    :param stats_dir: Prefix of the log file names; it is concatenated directly, so it must
                      already end with a path separator if it is a directory
    :param train_mode: If True, losses/confusion matrices are logged and a checkpoint is saved
                       every epoch; otherwise (hyperband setup) nothing is written and training
                       stops early when accuracy is below 98 from epoch 10 onwards
    :return: (transducer_model, model_optimizer, stats) where stats holds the last 'loss' and
             'accuracy' plus the per-epoch 'loss_epoch' and 'accuracy_epoch' lists
    """
    # Validate the checkpoint path before anything is opened, so that a bad path does not
    # truncate the log files of a previous run.
    if retrain_model is True and not os.path.isfile(retrain_model_path):
        sys.stderr.write(TextColor.RED
                         + "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n"
                         + TextColor.END)
        sys.exit(1)

    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None
    # Every file that was successfully opened is tracked here and closed in the finally
    # clause, whichever way the function is left (return, early stop or exception).
    open_loggers = []

    try:
        if train_mode is True:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            open_loggers.append(train_loss_logger)
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            open_loggers.append(test_loss_logger)
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')
            open_loggers.append(confusion_matrix_logger)

        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)
        num_classes = ImageSizeOptions.TOTAL_LABELS

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" + TextColor.END)
            # The checkpoint dictates the architecture, so hidden_size and gru_layers are
            # replaced by the stored values from here on.
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model_for_training(
                    retrain_model_path,
                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                    num_classes=num_classes)

            if train_mode is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" + TextColor.END)
        else:
            transducer_model = ModelHandler.get_new_gru_model(
                input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                image_features=ImageSizeOptions.IMAGE_HEIGHT,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                num_classes=num_classes)
            prev_ite = 0

        param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
        sys.stderr.write(TextColor.RED + "INFO: TOTAL TRAINABLE PARAMETERS:\t"
                         + str(param_count) + "\n" + TextColor.END)

        if gpu_mode:
            transducer_model = torch.nn.DataParallel(transducer_model).cuda()

        class_weights = torch.Tensor(CLASS_WEIGHTS)
        # Loss
        criterion = nn.CrossEntropyLoss(class_weights)

        if gpu_mode is True:
            criterion = criterion.cuda()

        model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr,
                                           weight_decay=decay)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" + TextColor.END)
            model_optimizer = ModelHandler.load_simple_optimizer(model_optimizer,
                                                                 retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" + TextColor.END)

        # The scheduler is created only after the optimizer may have been replaced above,
        # so it always controls the optimizer that is actually stepped.
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, 'min')

        start_epoch = prev_ite

        # Train the Model
        sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        sys.stderr.write(TextColor.BLUE + 'Start: ' + str(start_epoch + 1)
                         + ' End: ' + str(epoch_limit) + "\n")

        for epoch in range(start_epoch, epoch_limit, 1):
            total_loss = 0
            total_images = 0
            sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) + "\n")
            batch_no = 1

            with tqdm(total=len(train_loader), desc='Loss', leave=True,
                      ncols=100) as progress_bar:
                # make sure the model is in train mode. BN is different in train and eval.
                transducer_model.train()

                for images, labels in train_loader:
                    labels = labels.type(torch.LongTensor)
                    images = images.type(torch.FloatTensor)
                    if gpu_mode:
                        images = images.cuda()
                        labels = labels.cuda()

                    # Size the initial hidden state from the effective model configuration
                    # (the arguments, or the checkpoint values on retrain) -- the same values
                    # handed to test() -- rather than from the TrainOptions constants, which
                    # only match when the defaults are in use.
                    hidden = torch.zeros(images.size(0), 2 * gru_layers, hidden_size)

                    if gpu_mode:
                        hidden = hidden.cuda()

                    # Slide a TRAIN_WINDOW-wide window over the sequence; each window is one
                    # optimisation step.
                    for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                        model_optimizer.zero_grad()

                        # A trailing window that would run past the sequence is skipped.
                        if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                            break

                        image_chunk = images[:, i:i + TrainOptions.TRAIN_WINDOW]
                        label_chunk = labels[:, i:i + TrainOptions.TRAIN_WINDOW]
                        output_, hidden = transducer_model(image_chunk, hidden)

                        loss = criterion(output_.contiguous().view(-1, num_classes),
                                         label_chunk.contiguous().view(-1))
                        loss.backward()

                        model_optimizer.step()

                        total_loss += loss.item()
                        total_images += image_chunk.size(0)

                        # Carry the state into the next window without its autograd history,
                        # so each backward pass only covers the current window.
                        hidden = hidden.detach()

                    # update the progress bar
                    avg_loss = (total_loss / total_images) if total_images else 0
                    progress_bar.set_description("Loss: " + str(avg_loss))

                    if train_mode is True:
                        train_loss_logger.write(str(epoch + 1) + "," + str(batch_no) + ","
                                                + str(avg_loss) + "\n")
                    progress_bar.refresh()
                    progress_bar.update(1)
                    batch_no += 1

            stats_dictioanry = test(test_file, batch_size, gpu_mode, transducer_model,
                                    num_workers, gru_layers, hidden_size,
                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
            stats['loss'] = stats_dictioanry['loss']
            stats['accuracy'] = stats_dictioanry['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictioanry['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictioanry['accuracy']))

            lr_scheduler.step(stats['loss'])

            # update the loggers
            if train_mode is True:
                # save the model after each epoch
                save_best_model(transducer_model, model_optimizer, hidden_size, gru_layers,
                                epoch,
                                model_dir + "_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

                test_loss_logger.write(str(epoch + 1) + "," + str(stats['loss']) + ","
                                       + str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(str(epoch + 1) + "\n"
                                              + str(stats_dictioanry['confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            elif epoch + 1 >= 10 and stats['accuracy'] < 98:
                # this setup is for hyperband
                sys.stderr.write(TextColor.PURPLE
                                 + 'EARLY STOPPING AS THE MODEL NOT DOING WELL\n'
                                 + TextColor.END)
                return transducer_model, model_optimizer, stats

        sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        return transducer_model, model_optimizer, stats
    finally:
        for logger in open_loggers:
            logger.close()
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, encoder_lr, encoder_decay, decoder_lr,
          decoder_decay, model_dir, stats_dir, train_mode):
    """
    Train the encoder/decoder model pair, evaluating it with test() after every epoch.

    :param train_file: Training data specification, passed to SequenceDataset
    :param test_file: Test data specification, passed to test() after each epoch
    :param batch_size: Batch size for the training DataLoader and for test()
    :param epoch_limit: Number of epochs to run; when retraining in train mode it is added to
                        the epoch counter stored in the checkpoint
    :param gpu_mode: If true the models, criterion and input tensors are moved to CUDA
    :param num_workers: Number of workers for data loading
    :param retrain_model: If True (identity check), models and optimizers are loaded from
                          retrain_model_path instead of being created from scratch
    :param retrain_model_path: Path to the checkpoint used when retrain_model is True
    :param gru_layers: GRU layer count for new models (replaced by the checkpoint value on retrain)
    :param hidden_size: Hidden size for new models (replaced by the checkpoint value on retrain)
    :param encoder_lr: Learning rate of the encoder's Adam optimizer
    :param encoder_decay: Weight decay of the encoder's Adam optimizer
    :param decoder_lr: Learning rate of the decoder's Adam optimizer
    :param decoder_decay: Weight decay of the decoder's Adam optimizer
    :param model_dir: Prefix of the checkpoint file names written in train mode
    :param stats_dir: Prefix of the log file names; it is concatenated directly, so it must
                      already end with a path separator if it is a directory
    :param train_mode: If True, losses/confusion matrices are logged and a checkpoint is saved
                       every epoch; otherwise (hyperband setup) nothing is written and training
                       stops early when accuracy is below 90 from epoch 2 onwards
    :return: (encoder_model, decoder_model, encoder_optimizer, decoder_optimizer, stats) where
             stats holds the last 'loss' and 'accuracy' plus the per-epoch 'loss_epoch' and
             'accuracy_epoch' lists
    """
    # Validate the checkpoint path before anything is opened, so that a bad path does not
    # truncate the log files of a previous run.
    if retrain_model is True and not os.path.isfile(retrain_model_path):
        sys.stderr.write(TextColor.RED
                         + "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n"
                         + TextColor.END)
        sys.exit(1)

    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None
    # Every file that was successfully opened is tracked here and closed in the finally
    # clause, whichever way the function is left (return, early stop or exception).
    open_loggers = []

    try:
        if train_mode is True:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            open_loggers.append(train_loss_logger)
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            open_loggers.append(test_loss_logger)
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')
            open_loggers.append(confusion_matrix_logger)

        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" + TextColor.END)
            # The checkpoint dictates the architecture, so hidden_size and gru_layers are
            # replaced by the stored values from here on.
            encoder_model, decoder_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_model_for_training(retrain_model_path,
                                                     input_channels=5,
                                                     seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                     num_classes=3)

            if train_mode is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" + TextColor.END)
        else:
            encoder_model, decoder_model = ModelHandler.get_new_model(
                input_channels=5,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                seq_len=ImageSizeOptions.SEQ_LENGTH,
                num_classes=3)
            prev_ite = 0

        encoder_optimizer = torch.optim.Adam(encoder_model.parameters(), lr=encoder_lr,
                                             weight_decay=encoder_decay)
        decoder_optimizer = torch.optim.Adam(decoder_model.parameters(), lr=decoder_lr,
                                             weight_decay=decoder_decay)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" + TextColor.END)
            encoder_optimizer, decoder_optimizer = ModelHandler.load_optimizer(
                encoder_optimizer, decoder_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" + TextColor.END)

        if gpu_mode:
            encoder_model = torch.nn.DataParallel(encoder_model).cuda()
            decoder_model = torch.nn.DataParallel(decoder_model).cuda()

        class_weights = torch.FloatTensor(CLASS_WEIGHTS)
        # Loss
        criterion = nn.CrossEntropyLoss(weight=class_weights)

        if gpu_mode is True:
            criterion = criterion.cuda()

        start_epoch = prev_ite

        # Train the Model
        sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        # Epochs are reported 1-based, so the last epoch that runs is number epoch_limit.
        sys.stderr.write(TextColor.PURPLE + 'Start: ' + str(start_epoch + 1)
                         + ' End: ' + str(epoch_limit) + "\n")

        for epoch in range(start_epoch, epoch_limit, 1):
            total_loss = 0
            total_images = 0
            sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) + "\n")
            # make sure the model is in train mode. BN is different in train and eval.
            encoder_model.train()
            decoder_model.train()
            batch_no = 1

            with tqdm(total=len(train_loader), desc='Loss', leave=True,
                      ncols=100) as progress_bar:
                for images, labels in train_loader:
                    if gpu_mode:
                        images = images.cuda()
                        labels = labels.cuda()

                    current_batch_size = images.size(0)
                    # The sequence axis is dimension 2 of the image batch.
                    total_seq_length = images.size(2)

                    encoder_hidden = torch.zeros(current_batch_size, gru_layers * 2, hidden_size)

                    if gpu_mode:
                        encoder_hidden = encoder_hidden.cuda()

                    encoder_optimizer.zero_grad()
                    decoder_optimizer.zero_grad()

                    # The encoder runs once per batch; the decoder is then queried once per
                    # sequence position and the per-position losses are summed.
                    loss = 0
                    context_vector, hidden_encoder = encoder_model(images, encoder_hidden)

                    for seq_index in range(total_seq_length):
                        y = labels[:, seq_index]

                        # One-hot selector of the current position for every batch element.
                        # Direct column assignment builds the same tensor as the previous
                        # numpy + scatter_ construction without depending on numpy's
                        # platform-specific default integer dtype.
                        attention_index_onehot = torch.zeros(current_batch_size,
                                                             total_seq_length)
                        attention_index_onehot[:, seq_index] = 1

                        output_dec, _, _ = decoder_model(attention_index_onehot,
                                                         context_vector=context_vector,
                                                         encoder_hidden=hidden_encoder)

                        # loss
                        loss += criterion(output_dec, y)

                    loss.backward()
                    encoder_optimizer.step()
                    decoder_optimizer.step()

                    total_loss += loss.item()
                    total_images += labels.size(0)

                    # update the progress bar
                    avg_loss = (total_loss / total_images) if total_images else 0
                    progress_bar.set_description("Loss: " + str(avg_loss))

                    if train_mode is True:
                        train_loss_logger.write(str(epoch + 1) + "," + str(batch_no) + ","
                                                + str(avg_loss) + "\n")
                    progress_bar.refresh()
                    progress_bar.update(1)
                    batch_no += 1

            stats_dictioanry = test(test_file, batch_size, gpu_mode, encoder_model,
                                    decoder_model, num_workers, gru_layers, hidden_size,
                                    num_classes=3)
            stats['loss'] = stats_dictioanry['loss']
            stats['accuracy'] = stats_dictioanry['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictioanry['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictioanry['accuracy']))

            # update the loggers
            if train_mode is True:
                # save the model after each epoch
                save_best_model(encoder_model, decoder_model, encoder_optimizer,
                                decoder_optimizer, hidden_size, gru_layers, epoch,
                                model_dir + "_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

                test_loss_logger.write(str(epoch + 1) + "," + str(stats['loss']) + ","
                                       + str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(str(epoch + 1) + "\n"
                                              + str(stats_dictioanry['confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            elif epoch + 1 >= 2 and stats['accuracy'] < 90:
                # this setup is for hyperband
                sys.stderr.write(TextColor.PURPLE
                                 + 'EARLY STOPPING AS THE MODEL NOT DOING WELL\n'
                                 + TextColor.END)
                return encoder_model, decoder_model, encoder_optimizer, decoder_optimizer, stats

        sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        return encoder_model, decoder_model, encoder_optimizer, decoder_optimizer, stats
    finally:
        for logger in open_loggers:
            logger.close()