def save_best_model(transducer_model, model_optimizer, hidden_size, layers, epoch, file_name):
    """
    Write a checkpoint for the given model and optimizer, replacing any file already at file_name.
    :param transducer_model: Model whose state_dict() is stored under 'model_state_dict'
    :param model_optimizer: Optimizer whose state_dict() is stored under 'model_optimizer'
    :param hidden_size: Hidden size value stored under 'hidden_size'
    :param layers: Number of GRU layers, stored under 'gru_layers'
    :param epoch: Epoch/iteration number, stored under 'epochs'
    :param file_name: Output file name
    :return: None
    """
    # remove a stale checkpoint first so the save below always writes to a fresh path
    stale_checkpoint_exists = os.path.isfile(file_name)
    if stale_checkpoint_exists:
        os.remove(file_name)

    checkpoint = {
        'model_state_dict': transducer_model.state_dict(),
        'model_optimizer': model_optimizer.state_dict(),
        'hidden_size': hidden_size,
        'gru_layers': layers,
        'epochs': epoch,
    }
    ModelHandler.save_checkpoint(checkpoint, file_name)

    timestamp = str(datetime.now().strftime('%m-%d-%Y %H:%M:%S'))
    sys.stderr.write("[" + timestamp + "] INFO: MODEL SAVED SUCCESSFULLY.\n")
def do_test(test_file, batch_size, gpu_mode, num_workers, model_path, print_details):
    """
    Load a saved model and evaluate it on a test set.
    :param test_file: A CSV file containing test image information
    :param batch_size: Batch size for testing
    :param gpu_mode: If true the model is wrapped in DataParallel and moved to the GPU
    :param num_workers: Number of workers for data loading
    :param model_path: Path to a saved model
    :param print_details: Print debug stuff
    :return: None
    :raises SystemExit: With status 1 when model_path is not an existing file
    """
    def log(message):
        # every status line carries the same "[timestamp] " prefix
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " + message)

    log("Loading data\n")

    if not os.path.isfile(model_path):
        log("ERROR: INVALID PATH TO MODEL\n")
        # sys.exit instead of the site-provided exit(), which is not guaranteed to exist
        sys.exit(1)

    log("INFO: MODEL LOADING\n")
    transducer_model, hidden_size, gru_layers, prev_ite = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
    log("INFO: MODEL LOADED\n")
    sys.stderr.flush()

    if gpu_mode:
        transducer_model = torch.nn.DataParallel(transducer_model).cuda()

    # the statistics returned by test() are not used here; the call is kept for its evaluation run
    test(test_file, batch_size, gpu_mode, transducer_model, num_workers, gru_layers, hidden_size,
         num_classes=ImageSizeOptions.TOTAL_LABELS, print_details=print_details)

    # terminate the line so later stderr output does not run into this message
    log("INFO: TEST COMPLETE\n")
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir, train_mode,
          world_size, rank, device_id):
    """
    Train a model (new or loaded from a checkpoint) on one distributed rank, testing after every epoch.
    :param train_file: Training data passed to SequenceDataset
    :param test_file: Test data passed to test() on rank 0 after every epoch
    :param batch_size: Batch size for the training loader and for test()
    :param epoch_limit: Number of epochs; when retraining in train_mode it is added to the loaded epoch count
    :param gpu_mode: If true the model, loss and tensors are moved to device_id and the model is wrapped in DDP
    :param num_workers: Forwarded to test(); the training loader itself uses num_workers=0
    :param retrain_model: If True the model and optimizer are loaded from retrain_model_path
    :param retrain_model_path: Path to the checkpoint used when retrain_model is True
    :param gru_layers: Layers for a new model (replaced by the checkpoint value when retraining)
    :param hidden_size: Hidden size for a new model (replaced by the checkpoint value when retraining)
    :param lr: Learning rate given to Adam
    :param decay: weight_decay given to Adam
    :param model_dir: Prefix of the per-epoch checkpoint file names
    :param stats_dir: Prefix of the log file names (train_loss.csv, test_loss.csv, confusion_matrix.txt)
    :param train_mode: True: rank 0 writes logs and checkpoints. False: early-stopping rule is applied instead
    :param world_size: Number of replicas given to the DistributedSampler
    :param rank: Rank of this process; rank 0 does the logging, testing and checkpointing
    :param device_id: CUDA device used by this process
    :return: Tuple of (transducer_model, model_optimizer, stats)
    :raises SystemExit: With status 1 when retrain_model is True and retrain_model_path is not a file
    """
    # function-scope import so this block does not rely on the file's import header
    from contextlib import ExitStack

    # ExitStack closes whichever log files were opened on every exit path (return, early stop, error)
    with ExitStack() as log_files:
        if train_mode is True and rank == 0:
            train_loss_logger = log_files.enter_context(open(stats_dir + "train_loss.csv", 'w'))
            test_loss_logger = log_files.enter_context(open(stats_dir + "test_loss.csv", 'w'))
            confusion_matrix_logger = log_files.enter_context(open(stats_dir + "confusion_matrix.txt", 'w'))
        else:
            train_loss_logger = None
            test_loss_logger = None
            confusion_matrix_logger = None

        return _run_training(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers,
                             retrain_model, retrain_model_path, gru_layers, hidden_size, lr, decay,
                             model_dir, train_mode, world_size, rank, device_id,
                             train_loss_logger, test_loss_logger, confusion_matrix_logger)


def _run_training(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers,
                  retrain_model, retrain_model_path, gru_layers, hidden_size, lr, decay,
                  model_dir, train_mode, world_size, rank, device_id,
                  train_loss_logger, test_loss_logger, confusion_matrix_logger):
    """
    Build the data loader, model, optimizer and loss, then run the epoch loop for train().
    The three logger arguments are open file objects on rank 0 in train_mode and None otherwise.
    :return: Tuple of (transducer_model, model_optimizer, stats)
    """
    def log(message):
        # every status line carries the same "[timestamp] " prefix
        sys.stderr.write("[" + datetime.now().strftime('%m-%d-%Y %H:%M:%S') + "] " + message)

    torch.cuda.set_device(device_id)

    if rank == 0:
        log("INFO: LOADING DATA\n")

    train_data_set = SequenceDataset(train_file)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data_set,
                                                                    num_replicas=world_size,
                                                                    rank=rank)
    # shuffling is left to the sampler, so the loader itself must not shuffle
    # NOTE(review): num_workers is hard-coded to 0 here although the function takes a num_workers argument
    train_loader = torch.utils.data.DataLoader(dataset=train_data_set,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    num_classes = ImageSizeOptions.TOTAL_LABELS

    if retrain_model is True:
        if not os.path.isfile(retrain_model_path):
            log("ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n")
            # sys.exit instead of the site-provided exit(), which is not guaranteed to exist
            sys.exit(1)
        log("INFO: RETRAIN MODEL LOADING\n")
        transducer_model, hidden_size, gru_layers, prev_ite = \
            ModelHandler.load_simple_model_for_training(retrain_model_path,
                                                        input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                        image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                        seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                        num_classes=num_classes)

        if train_mode is True:
            # continue counting from the epoch stored in the checkpoint
            epoch_limit = prev_ite + epoch_limit

        log("INFO: RETRAIN MODEL LOADED\n")
    else:
        transducer_model = ModelHandler.get_new_gru_model(input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                          image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                          gru_layers=gru_layers,
                                                          hidden_size=hidden_size,
                                                          num_classes=num_classes)
        prev_ite = 0

    param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
    if rank == 0:
        log("INFO: TOTAL TRAINABLE PARAMETERS:\t" + str(param_count) + "\n")

    model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr, weight_decay=decay)

    if retrain_model is True:
        log("INFO: OPTIMIZER LOADING\n")
        model_optimizer = ModelHandler.load_simple_optimizer(model_optimizer, retrain_model_path, gpu_mode)
        log("INFO: OPTIMIZER LOADED\n")

    if gpu_mode:
        transducer_model = transducer_model.to(device_id)
        transducer_model = nn.parallel.DistributedDataParallel(transducer_model, device_ids=[device_id])

    # Loss
    class_weights = torch.Tensor(CLASS_WEIGHTS)
    criterion = nn.CrossEntropyLoss(class_weights)
    if gpu_mode is True:
        criterion = criterion.to(device_id)

    start_epoch = prev_ite

    # Train the Model
    if rank == 0:
        log("INFO: Training starting\n")
        log("Start: " + str(start_epoch + 1) + " End: " + str(epoch_limit) + "\n")

    stats = dict()
    stats['loss_epoch'] = []
    stats['accuracy_epoch'] = []

    for epoch in range(start_epoch, epoch_limit, 1):
        total_loss = 0
        total_images = 0
        if rank == 0:
            log("Train epoch: " + str(epoch + 1) + "\n")

        # DistributedSampler seeds its shuffle from the epoch number; without this call
        # every epoch would iterate the data in exactly the same order
        train_sampler.set_epoch(epoch)

        batch_no = 1
        if rank == 0:
            progress_bar = tqdm(
                total=len(train_loader),
                ncols=100,
                leave=True,
                position=rank,
                desc="Loss: ",
            )
        else:
            progress_bar = None

        # make sure the model is in train mode. BN is different in train and eval.
        transducer_model.train()
        for images, labels in train_loader:
            labels = labels.type(torch.LongTensor)
            images = images.type(torch.FloatTensor)
            if gpu_mode:
                images = images.to(device_id)
                labels = labels.to(device_id)

            # fresh zero recurrent state for every batch
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.LSTM_LAYERS, TrainOptions.HIDDEN_SIZE)
            cell_state = torch.zeros(images.size(0), 2 * TrainOptions.LSTM_LAYERS, TrainOptions.HIDDEN_SIZE)

            if gpu_mode:
                hidden = hidden.to(device_id)
                cell_state = cell_state.to(device_id)

            # slide a TRAIN_WINDOW-wide window along the second axis, one optimizer step per window
            for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                model_optimizer.zero_grad()

                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break

                image_chunk = images[:, i:i + TrainOptions.TRAIN_WINDOW]
                label_chunk = labels[:, i:i + TrainOptions.TRAIN_WINDOW]

                output_, hidden, cell_state = transducer_model(image_chunk, hidden, cell_state)

                loss = criterion(output_.contiguous().view(-1, num_classes),
                                 label_chunk.contiguous().view(-1))
                loss.backward()
                model_optimizer.step()

                total_loss += loss.item()
                total_images += image_chunk.size(0)

                # carry the state values into the next window but cut them from this window's graph
                hidden = hidden.detach()
                cell_state = cell_state.detach()

            # update the progress bar
            avg_loss = (total_loss / total_images) if total_images else 0

            if train_mode is True and rank == 0:
                train_loss_logger.write(str(epoch + 1) + "," + str(batch_no) + "," + str(avg_loss) + "\n")

            if rank == 0:
                progress_bar.set_description("Loss: " + str(avg_loss))
                progress_bar.refresh()
                progress_bar.update(1)
            batch_no += 1

        if rank == 0:
            progress_bar.close()
        dist.barrier()

        # only rank 0 evaluates; the barriers keep the other ranks waiting meanwhile
        if rank == 0:
            stats_dictionary = test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
                                    gru_layers, hidden_size, num_classes=ImageSizeOptions.TOTAL_LABELS)
            stats['loss'] = stats_dictionary['loss']
            stats['accuracy'] = stats_dictionary['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictionary['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictionary['accuracy']))

        dist.barrier()

        # update the loggers
        if train_mode is True and rank == 0:
            # save the model after each epoch
            save_best_model(transducer_model, model_optimizer, hidden_size, gru_layers, epoch,
                            model_dir + "_epoch_" + str(epoch + 1) + '_checkpoint.pkl')

            test_loss_logger.write(str(epoch + 1) + "," + str(stats['loss']) + "," +
                                   str(stats['accuracy']) + "\n")
            confusion_matrix_logger.write(str(epoch + 1) + "\n" +
                                          str(stats_dictionary['confusion_matrix']) + "\n")
            train_loss_logger.flush()
            test_loss_logger.flush()
            confusion_matrix_logger.flush()
        elif train_mode is False:
            # this setup is for hyperband
            # NOTE(review): stats['accuracy'] is only filled in on rank 0 above, so with more than
            # one rank the other ranks would raise KeyError here - confirm hyperband runs use one process
            if epoch + 1 >= 10 and stats['accuracy'] < 98:
                log("INFO: EARLY STOPPING AS THE MODEL NOT DOING WELL\n")
                return transducer_model, model_optimizer, stats

    if rank == 0:
        log("INFO: Finished training\n")

    return transducer_model, model_optimizer, stats
def predict(test_file, output_filename, model_path, batch_size, num_workers, gpu_mode):
    """
    Run a trained model over a data set and write the predicted labels to an output file.
    NOTE(review): another function named predict is defined later in this file; if both live in
    the same module the later definition replaces this one - confirm which is intended.
    :param test_file: File to predict on
    :param output_filename: Name of output file
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for prediction
    :param num_workers: Number of workers to be used by the dataloader
    :param gpu_mode: If true, predictions will be done over GPU
    :return: None; predictions are written through DataStore.write_prediction
    """
    prediction_data_file = DataStore(output_filename, mode='w')

    # data loader
    test_data = SequenceDataset(test_file)
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)

    # only the model itself is needed from the checkpoint here
    transducer_model, _, _, _ = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
    transducer_model.eval()

    if gpu_mode:
        transducer_model = transducer_model.cuda()
    sys.stderr.write("[" + str(datetime.now().strftime('%m-%d-%Y %H:%M:%S')) + "] INFO: STARTING INFERENCE\n")

    with torch.no_grad():
        for contig, contig_start, contig_end, chunk_id, images, position, index in tqdm(test_loader, ncols=50):
            images = images.type(torch.FloatTensor)
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.LSTM_LAYERS, TrainOptions.HIDDEN_SIZE)
            cell_state = torch.zeros(images.size(0), 2 * TrainOptions.LSTM_LAYERS, TrainOptions.HIDDEN_SIZE)

            # running sum of the per-window class scores
            prediction_base_tensor = torch.zeros((images.size(0), images.size(1), ImageSizeOptions.TOTAL_LABELS))

            if gpu_mode:
                images = images.cuda()
                hidden = hidden.cuda()
                # the cell state must sit on the same device as the model and the hidden state;
                # it was previously left on the CPU for the first window
                cell_state = cell_state.cuda()
                prediction_base_tensor = prediction_base_tensor.cuda()

            for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break
                chunk_start = i
                chunk_end = i + TrainOptions.TRAIN_WINDOW

                # chunk all the data
                image_chunk = images[:, chunk_start:chunk_end]

                # run inference
                output_base, hidden, cell_state = transducer_model(image_chunk, hidden, cell_state)

                # now calculate how much padding is on the top and bottom of this chunk so we can do a simple
                # add operation
                top_zeros = chunk_start
                bottom_zeros = ImageSizeOptions.SEQ_LENGTH - chunk_end

                # softmax over the class axis, then zero-pad to the full sequence length so the
                # window can be added onto the running sum. Both layers are parameter-free and keep
                # their input's device, so no explicit device move is needed.
                inference_layers = nn.Sequential(
                    nn.Softmax(dim=2),
                    nn.ZeroPad2d((0, 0, top_zeros, bottom_zeros))
                )
                base_prediction = inference_layers(output_base)

                # now simply add the tensor to the global counter
                prediction_base_tensor = torch.add(prediction_base_tensor, base_prediction)

            # the label with the highest summed score wins at every position
            _, base_labels = torch.max(prediction_base_tensor, 2)
            predicted_base_labels = base_labels.cpu().numpy()

            for batch_index in range(images.size(0)):
                prediction_data_file.write_prediction(contig[batch_index],
                                                      contig_start[batch_index],
                                                      contig_end[batch_index],
                                                      chunk_id[batch_index],
                                                      position[batch_index],
                                                      index[batch_index],
                                                      predicted_base_labels[batch_index])
def predict_cpu(filepath, file_chunks, output_filepath, model_path, batch_size, total_callers,
                threads_per_caller, num_workers):
    """
    Export the trained model to ONNX when no export exists yet, then run predict() in a pool of processes.
    :param filepath: Path to image files to predict on
    :param file_chunks: Path to chunked files, indexed by caller id
    :param output_filepath: Path to output directory
    :param model_path: Path to a trained model; the ONNX export is written to model_path + ".onnx"
    :param batch_size: Batch size used for prediction
    :param total_callers: Number of callers to spawn
    :param threads_per_caller: Number of threads to use per caller
    :param num_workers: Number of workers to be used by the dataloader
    :return: None
    """
    def stamp():
        # "[timestamp]" prefix shared by the status lines below
        return "[" + str(datetime.now().strftime('%m-%d-%Y %H:%M:%S')) + "]"

    # load the model and create an ONNX session
    transducer_model, hidden_size, gru_layers, prev_ite = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
    transducer_model.eval()

    sys.stderr.write(stamp() + " INFO: MODEL LOADING TO ONNX\n")

    # example inputs handed to the exporter
    example_image = torch.zeros(1, TrainOptions.TRAIN_WINDOW, ImageSizeOptions.IMAGE_HEIGHT)
    example_hidden = torch.zeros(1, 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

    onnx_path = model_path + ".onnx"
    if not os.path.isfile(onnx_path):
        sys.stderr.write(stamp() + " INFO: SAVING MODEL TO ONNX\n")
        input_names = ['input_image', 'input_hidden']
        output_names = ['output_pred', 'output_hidden']
        # axis 0 of every named tensor is exported as a dynamic 'batch_size' axis
        dynamic_axes = {tensor_name: {0: 'batch_size'} for tensor_name in input_names + output_names}
        torch.onnx.export(transducer_model, (example_image, example_hidden),
                          onnx_path,
                          training=False,
                          opset_version=10,
                          do_constant_folding=True,
                          input_names=input_names,
                          output_names=output_names,
                          dynamic_axes=dynamic_axes)

    start_time = time.time()
    with concurrent.futures.ProcessPoolExecutor(max_workers=total_callers) as executor:
        # NOTE(review): this positional order does not match either predict signature visible in
        # this file - confirm which predict implementation is meant to be called here
        pending = []
        for caller_id in range(total_callers):
            pending.append(executor.submit(predict, filepath, file_chunks[caller_id], output_filepath,
                                           batch_size, num_workers, caller_id, threads_per_caller,
                                           model_path))

        for finished in concurrent.futures.as_completed(pending):
            failure = finished.exception()
            if failure is not None:
                sys.stderr.write("ERROR: " + str(failure) + "\n")
            else:
                # get the results
                done_id = finished.result()
                sys.stderr.write(stamp() + " INFO: THREAD " + str(done_id) + " FINISHED SUCCESSFULLY.\n")
            finished._result = None  # python issue 27144

    end_time = time.time()
    elapsed = end_time - start_time
    mins = int(elapsed / 60)
    secs = int(elapsed) % 60
    sys.stderr.write(stamp() + " INFO: FINISHED PREDICTION\n")
    sys.stderr.write(stamp() + " INFO: ELAPSED TIME: " + str(mins) + " Min " + str(secs) + " Sec\n")
def predict_distributed_cpu(filepath, file_chunks, output_filepath, model_path, batch_size,
                            total_callers, threads, num_workers):
    """
    Export the trained model to ONNX when no export exists yet, then spawn one setup() process per caller.
    :param filepath: Path to image files to predict on
    :param file_chunks: Path to chunked files
    :param output_filepath: Path to output directory
    :param model_path: Path to a trained model; the ONNX export is written to model_path + ".onnx"
    :param batch_size: Batch size used for prediction
    :param total_callers: Number of callers to spawn
    :param threads: Number of threads to use per caller
    :param num_workers: Number of workers to be used by the dataloader
    :return: None
    """
    # load the model and create an ONNX session
    transducer_model, hidden_size, gru_layers, prev_ite = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)
    transducer_model.eval()

    sys.stderr.write("INFO: MODEL LOADING TO ONNX\n")

    # example inputs handed to the exporter: image window, hidden state, cell state
    example_image = torch.zeros(1, TrainOptions.TRAIN_WINDOW, ImageSizeOptions.IMAGE_HEIGHT)
    example_hidden = torch.zeros(1, 2 * TrainOptions.LSTM_LAYERS, TrainOptions.HIDDEN_SIZE)
    example_cell = torch.zeros(1, 2 * TrainOptions.LSTM_LAYERS, TrainOptions.HIDDEN_SIZE)

    onnx_path = model_path + ".onnx"
    if not os.path.isfile(onnx_path):
        sys.stderr.write("INFO: SAVING MODEL TO ONNX\n")
        input_names = ['input_image', 'input_hidden']
        output_names = ['output_pred', 'output_hidden']
        # axis 0 of every named tensor is exported as a dynamic 'batch_size' axis
        dynamic_axes = {tensor_name: {0: 'batch_size'} for tensor_name in input_names + output_names}
        # NOTE(review): three example inputs are passed but only two input names are given, so the
        # cell-state input gets no explicit name or dynamic axis - confirm this is intended
        torch.onnx.export(transducer_model, (example_image, example_hidden, example_cell),
                          onnx_path,
                          training=False,
                          opset_version=10,
                          do_constant_folding=True,
                          input_names=input_names,
                          output_names=output_names,
                          dynamic_axes=dynamic_axes)

    # arguments shared by every spawned process; mp.spawn adds the process index in front
    caller_args = (filepath, output_filepath, model_path, batch_size, num_workers, threads)
    mp.spawn(setup,
             args=(total_callers, caller_args, file_chunks),
             nprocs=total_callers,
             join=True)
def predict(input_filepath, file_chunks, output_filepath, model_path, batch_size, num_workers, rank, device_id):
    """
    Run a trained model on one GPU over a set of file chunks and write labels and phred scores to an HDF file.
    :param input_filepath: Path to the input files, passed to SequenceDataset
    :param file_chunks: Chunks of the input this process handles, passed to SequenceDataset
    :param output_filepath: Prefix of the output file: output_filepath + "pepper_prediction_<device_id>.hdf"
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for prediction
    :param num_workers: Number of workers to be used by the dataloader
    :param rank: Rank of this process; only rank 0 shows a progress bar
    :param device_id: CUDA device used by this process
    :return: None; predictions are written through DataStore.write_prediction
    """
    # only the model itself is needed from the checkpoint here
    transducer_model, _, _, _ = \
        ModelHandler.load_simple_model_for_training(model_path,
                                                    input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                    image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                    seq_len=ImageSizeOptions.SEQ_LENGTH,
                                                    num_classes=ImageSizeOptions.TOTAL_LABELS)

    # create output file
    output_filename = output_filepath + "pepper_prediction_" + str(device_id) + ".hdf"
    prediction_data_file = DataStore(output_filename, mode='w')

    # data loader
    input_data = SequenceDataset(input_filepath, file_chunks)
    data_loader = DataLoader(input_data,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)

    torch.cuda.set_device(device_id)
    transducer_model.to(device_id)
    # one eval() call is enough; it is made before wrapping so the wrapped module is in eval mode
    transducer_model.eval()
    transducer_model = DistributedDataParallel(transducer_model, device_ids=[device_id])

    if rank == 0:
        progress_bar = tqdm(
            total=len(data_loader),
            ncols=100,
            leave=False,
            position=rank,
            desc="GPU #" + str(device_id),
        )

    with torch.no_grad():
        for contig, contig_start, contig_end, chunk_id, images, position, index in data_loader:
            sys.stderr.flush()
            images = images.type(torch.FloatTensor)
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

            # running sum of the per-window class scores
            prediction_base_tensor = torch.zeros((images.size(0), images.size(1), ImageSizeOptions.TOTAL_LABELS))

            images = images.to(device_id)
            hidden = hidden.to(device_id)
            prediction_base_tensor = prediction_base_tensor.to(device_id)

            for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break
                chunk_start = i
                chunk_end = i + TrainOptions.TRAIN_WINDOW

                # chunk all the data
                image_chunk = images[:, chunk_start:chunk_end]

                # run inference
                output_base, hidden = transducer_model(image_chunk, hidden)

                # now calculate how much padding is on the top and bottom of this chunk so we can do a simple
                # add operation
                top_zeros = chunk_start
                bottom_zeros = ImageSizeOptions.SEQ_LENGTH - chunk_end

                # softmax over the class axis, then zero-pad to the full sequence length so the
                # window can be added onto the running sum
                inference_layers = nn.Sequential(
                    nn.Softmax(dim=2),
                    nn.ZeroPad2d((0, 0, top_zeros, bottom_zeros))
                )
                inference_layers = inference_layers.to(device_id)

                # run the softmax and padding layers
                base_prediction = inference_layers(output_base).to(device_id)

                # now simply add the tensor to the global counter
                prediction_base_tensor = torch.add(prediction_base_tensor, base_prediction)

                del inference_layers
                torch.cuda.empty_cache()

            # best summed score and its label at every position
            base_values, base_labels = torch.max(prediction_base_tensor, 2)

            # this part is for the phred score calculation
            # counts is 2 everywhere except the first/last SEQ_OVERLAP positions, where it is 1
            counts = torch.ones((base_values.size(0), base_values.size(1) - 2 * ImageSizeOptions.SEQ_OVERLAP))
            top_ones = nn.ZeroPad2d((ImageSizeOptions.SEQ_OVERLAP, ImageSizeOptions.SEQ_OVERLAP))
            counts = top_ones(counts) + 1

            # The quality score must come from the summed score of the winning class, not from the
            # label index: the old code overwrote base_values with the labels before this step.
            # counts lives on the CPU, so the scores are brought there as well.
            base_values = base_values.cpu()
            phred_score = -10 * torch.log10(1.0 - (base_values / counts))
            # an averaged score of exactly 1 gives log10(0); cap that infinite quality at 100
            phred_score[phred_score == float('inf')] = 100

            predicted_base_labels = base_labels.cpu().numpy()
            phred_score = phred_score.cpu().numpy()

            for batch_index in range(images.size(0)):
                prediction_data_file.write_prediction(contig[batch_index],
                                                      contig_start[batch_index],
                                                      contig_end[batch_index],
                                                      chunk_id[batch_index],
                                                      position[batch_index],
                                                      index[batch_index],
                                                      predicted_base_labels[batch_index],
                                                      phred_score[batch_index])

            if rank == 0:
                progress_bar.update(1)

    if rank == 0:
        progress_bar.close()