def test_interface(test_file, batch_size, gpu_mode, num_workers, model_path, output_directory, print_details):
    """
    Evaluate a saved model on a test set and save the resulting confusion matrices.

    :param test_file: Path to directory containing test images
    :param batch_size: Batch size used while evaluating
    :param gpu_mode: If true the model is moved to the GPU (forced to False when print_details is set)
    :param num_workers: Number of workers for data loading
    :param model_path: Path to a saved model
    :param output_directory: Path to output_directory
    :param print_details: If true then logs will be printed
    :return: None
    """
    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    output_directory = FileManager.handle_output_directory(output_directory)

    if not os.path.isfile(model_path):
        # reset the colour before leaving so the terminal is not left red
        sys.stderr.write(TextColor.RED + "ERROR: INVALID PATH TO MODEL\n" + TextColor.END)
        # sys.exit is the supported way to terminate; the bare exit() helper is
        # installed by the site module for interactive use only
        sys.exit(1)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADING\n" + TextColor.END)

    transducer_model, hidden_size, gru_layers, _prev_ite = \
        ModelHandler.load_simple_model(model_path,
                                       input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                       image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                       seq_len=ImageSizeOptions.SEQ_LENGTH,
                                       num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                       num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)

    sys.stderr.write(TextColor.GREEN + "INFO: MODEL LOADED\n" + TextColor.END)

    # detailed printing and GPU mode are mutually exclusive; printing wins
    if print_details and gpu_mode:
        sys.stderr.write(TextColor.GREEN + "INFO: GPU MODE NOT AVAILABLE WHEN PRINTING DETAILS. "
                                           "SETTING GPU MODE TO FALSE.\n" + TextColor.END)
        gpu_mode = False

    if gpu_mode:
        transducer_model = transducer_model.cuda()

    stats_dictionary = test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
                            gru_layers, hidden_size,
                            num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                            num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS,
                            output_directory=output_directory,
                            print_details=print_details)

    save_rle_confusion_matrix(stats_dictionary, output_directory)
    save_base_confusion_matrix(stats_dictionary, output_directory)

    sys.stderr.write(TextColor.PURPLE + 'DONE\n' + TextColor.END)
def save_best_model(transducer_model, model_optimizer, hidden_size, layers, epoch, file_name):
    """
    Write a checkpoint of the model and its optimizer to file_name.

    Any file already present at file_name is removed before the checkpoint is written.

    :param transducer_model: A trained model
    :param model_optimizer: Model optimizer
    :param hidden_size: Hidden size stored in the checkpoint under 'hidden_size'
    :param layers: Number of GRU layers, stored under 'gru_layers'
    :param epoch: Epoch/iteration number, stored under 'epochs'
    :param file_name: Output file name
    :return: None
    """
    stale_checkpoint_present = os.path.isfile(file_name)
    if stale_checkpoint_present:
        os.remove(file_name)

    # gather everything that is needed to restore the model later
    checkpoint = dict()
    checkpoint['model_state_dict'] = transducer_model.state_dict()
    checkpoint['model_optimizer'] = model_optimizer.state_dict()
    checkpoint['hidden_size'] = hidden_size
    checkpoint['gru_layers'] = layers
    checkpoint['epochs'] = epoch

    ModelHandler.save_checkpoint(checkpoint, file_name)

    saved_message = TextColor.RED + "\nMODEL SAVED SUCCESSFULLY.\n" + TextColor.END
    sys.stderr.write(saved_message)
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir, train_mode,
          world_size, rank, device_id):
    """
    Run the training loop for one process of a distributed training job.

    The training set is split between processes with a DistributedSampler. After every epoch
    the rank 0 process evaluates the model on the test set and, when train_mode is True, saves
    a checkpoint and appends to the statistics files.

    :param train_file: Path to train image set.
    :param test_file: Path to test image set.
    :param batch_size: Batch size for minibatch operation
    :param epoch_limit: Number of epochs to train for (added to the saved epoch count when retraining
                        with train_mode set)
    :param gpu_mode: If True, model, losses and tensors are moved to device_id
    :param num_workers: Number of workers handed to the test routine (the train loader uses 0 workers)
    :param retrain_model: If True then a previously-trained model and optimizer are loaded
    :param retrain_model_path: Path to a previously trained model
    :param gru_layers: Number of GRU layers in a newly created model
    :param hidden_size: Hidden size of a newly created model
    :param lr: Learning rate for the optimizer
    :param decay: Weight decay for the optimizer
    :param model_dir: Directory (prefix) where checkpoints are saved
    :param stats_dir: Directory (prefix) where statistics of this run are saved
    :param train_mode: If True loggers and checkpoints are written; if False the hyperband
                       early-stopping rule is applied instead
    :param world_size: Total number of processes taking part in training
    :param rank: Rank of this process; only rank 0 reports, evaluates and saves
    :param device_id: CUDA device used by this process
    :return: Tuple of (model, optimizer, stats dictionary)
    """
    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None

    try:
        # only the rank 0 process of a real training run writes the statistics files
        if train_mode is True and rank == 0:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')

        torch.cuda.set_device(device_id)

        if rank == 0:
            sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

        train_data_set = SequenceDataset(train_file)

        # each process sees its own shard of the training set
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_data_set,
            num_replicas=world_size,
            rank=rank
        )

        train_loader = torch.utils.data.DataLoader(
            dataset=train_data_set,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            pin_memory=True,
            sampler=train_sampler)

        num_base_classes = ImageSizeOptions.TOTAL_BASE_LABELS
        num_rle_classes = ImageSizeOptions.TOTAL_RLE_LABELS

        if retrain_model is True:
            if os.path.isfile(retrain_model_path) is False:
                # reset the colour before leaving so the terminal is not left red
                sys.stderr.write(TextColor.RED +
                                 "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n" +
                                 TextColor.END)
                # exit() is an interactive helper from the site module; use sys.exit in programs
                sys.exit(1)
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" + TextColor.END)
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model(retrain_model_path,
                                               input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                               image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                               seq_len=ImageSizeOptions.SEQ_LENGTH,
                                               num_base_classes=num_base_classes,
                                               num_rle_classes=num_rle_classes)

            # continue counting epochs from where the loaded model stopped
            if train_mode is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" + TextColor.END)
        else:
            transducer_model = ModelHandler.get_new_gru_model(input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                                              image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                                              gru_layers=gru_layers,
                                                              hidden_size=hidden_size,
                                                              num_base_classes=num_base_classes,
                                                              num_rle_classes=num_rle_classes)
            prev_ite = 0

        param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
        if rank == 0:
            sys.stderr.write(TextColor.RED + "INFO: TOTAL TRAINABLE PARAMETERS:\t" + str(param_count) + "\n" +
                             TextColor.END)

        model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr, weight_decay=decay)

        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" + TextColor.END)
            model_optimizer = ModelHandler.load_simple_optimizer(model_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" + TextColor.END)

        if gpu_mode:
            transducer_model = transducer_model.to(device_id)
            transducer_model = nn.parallel.DistributedDataParallel(transducer_model, device_ids=[device_id])

        class_weights = torch.Tensor(TrainOptions.CLASS_WEIGHTS)
        # we perform a multi-task classification, so we need two loss functions, each performing a single task
        # criterion base is the loss function for base prediction
        criterion_base = nn.CrossEntropyLoss()
        # criterion rle is the loss function for RLE prediction
        criterion_rle = nn.CrossEntropyLoss(weight=class_weights)

        if gpu_mode is True:
            criterion_base = criterion_base.to(device_id)
            criterion_rle = criterion_rle.to(device_id)

        start_epoch = prev_ite

        # Train the Model
        if rank == 0:
            sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
            sys.stderr.write(TextColor.BLUE + 'Start: ' + str(start_epoch + 1) + ' End: ' + str(epoch_limit) + "\n")

        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []

        for epoch in range(start_epoch, epoch_limit, 1):
            total_loss_base = 0
            total_loss_rle = 0
            total_loss = 0
            total_images = 0
            if rank == 0:
                sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) + "\n")

            batch_no = 1
            # only rank 0 draws a progress bar
            if rank == 0:
                progress_bar = tqdm(
                    total=len(train_loader),
                    ncols=100,
                    leave=True,
                    position=rank,
                    desc="Loss: ",
                )
            else:
                progress_bar = None

            # make sure the model is in train mode. BN is different in train and eval.
            transducer_model.train()
            for images, label_base, label_rle in train_loader:
                # convert the tensors to the proper datatypes.
                images = images.type(torch.FloatTensor)
                label_base = label_base.type(torch.LongTensor)
                label_rle = label_rle.type(torch.LongTensor)
                # NOTE(review): the hidden state is sized from TrainOptions, not from the
                # gru_layers / hidden_size arguments - confirm these always agree
                hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

                if gpu_mode:
                    hidden = hidden.to(device_id)
                    images = images.to(device_id)
                    label_base = label_base.to(device_id)
                    label_rle = label_rle.to(device_id)

                # slide a window over the sequence; every window is its own optimization step
                for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                    model_optimizer.zero_grad()

                    # a window that would run past the end of the sequence ends the sweep
                    if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                        break

                    image_chunk = images[:, i:i+TrainOptions.TRAIN_WINDOW]
                    label_base_chunk = label_base[:, i:i+TrainOptions.TRAIN_WINDOW]
                    label_rle_chunk = label_rle[:, i:i+TrainOptions.TRAIN_WINDOW]

                    # get the inference from the model
                    output_base, output_rle, hidden = transducer_model(image_chunk, hidden)

                    # calculate loss for base prediction
                    loss_base = criterion_base(output_base.contiguous().view(-1, num_base_classes),
                                               label_base_chunk.contiguous().view(-1))
                    # calculate loss for RLE prediction
                    loss_rle = criterion_rle(output_rle.contiguous().view(-1, num_rle_classes),
                                             label_rle_chunk.contiguous().view(-1))

                    # sum the losses to have a single optimization over multiple tasks
                    loss = loss_base + loss_rle

                    # backpropagation and weight update
                    loss.backward()
                    model_optimizer.step()

                    # update the loss values
                    total_loss += loss.item()
                    total_loss_base += loss_base.item()
                    total_loss_rle += loss_rle.item()
                    total_images += image_chunk.size(0)

                    # detach the hidden from the graph as the next chunk will be a new optimization
                    hidden = hidden.detach()

                # running average loss per processed image chunk
                avg_loss = (total_loss / total_images) if total_images else 0

                if train_mode is True and rank == 0:
                    train_loss_logger.write(str(epoch + 1) + "," + str(batch_no) + "," + str(avg_loss) + "\n")

                # update the progress bar
                if rank == 0:
                    progress_bar.set_description("Base: " + str(round(total_loss_base, 4)) +
                                                 ", RLE: " + str(round(total_loss_rle, 4)) +
                                                 ", TOTAL: " + str(round(total_loss, 4)))
                    progress_bar.refresh()
                    progress_bar.update(1)
                batch_no += 1

            if rank == 0:
                progress_bar.close()
            # wait for every process to finish the epoch before evaluating
            dist.barrier()

            if rank == 0:
                stats_dictionary = test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
                                        gru_layers, hidden_size,
                                        num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                        num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)
                stats['loss'] = stats_dictionary['loss']
                stats['accuracy'] = stats_dictionary['accuracy']
                stats['loss_epoch'].append((epoch, stats_dictionary['loss']))
                stats['accuracy_epoch'].append((epoch, stats_dictionary['accuracy']))

            # the other processes wait here while rank 0 evaluates
            dist.barrier()

            # update the loggers
            if train_mode is True and rank == 0:
                ModelHandler.save_model(transducer_model, model_optimizer,
                                        hidden_size, gru_layers,
                                        epoch, model_dir + "HELEN_epoch_" + str(epoch + 1) + '_checkpoint.pkl')
                sys.stderr.write(TextColor.RED + "\nMODEL SAVED SUCCESSFULLY.\n" + TextColor.END)

                test_loss_logger.write(str(epoch + 1) + "," + str(stats['loss']) + "," +
                                       str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(str(epoch + 1) + "\n" +
                                              str(stats_dictionary['base_confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()
            elif train_mode is False:
                # this setup is for hyperband
                # NOTE(review): stats['accuracy'] is only filled in on rank 0, so this lookup
                # would raise KeyError on any other rank - confirm hyperband only ever runs
                # with a single process
                if epoch + 1 >= 10 and stats['accuracy'] < 98:
                    sys.stderr.write(TextColor.PURPLE + 'EARLY STOPPING AS THE MODEL NOT DOING WELL\n' +
                                     TextColor.END)
                    return transducer_model, model_optimizer, stats

        if rank == 0:
            sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

        return transducer_model, model_optimizer, stats
    finally:
        # release the statistics files on every exit path (normal return, early stop, error)
        for logger in (train_loss_logger, test_loss_logger, confusion_matrix_logger):
            if logger is not None:
                logger.close()
def predict_cpu(file_chunks, output_filepath, model_path, batch_size, total_callers, threads, num_workers):
    """
    Export the trained model to ONNX when no export exists yet, then spawn the prediction callers.

    :param file_chunks: Path to chunked files
    :param output_filepath: Path to output directory
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for prediction
    :param total_callers: Number of callers to spawn
    :param threads: Number of threads to use per caller
    :param num_workers: Number of workers to be used by the dataloader
    :return: None
    """
    # restore the trained model; only the model itself is needed here
    transducer_model, _hidden_size, _gru_layers, _prev_ite = ModelHandler.load_simple_model(
        model_path,
        input_channels=ImageSizeOptions.IMAGE_CHANNELS,
        image_features=ImageSizeOptions.IMAGE_HEIGHT,
        seq_len=ImageSizeOptions.SEQ_LENGTH,
        num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
        num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)
    transducer_model.eval()

    sys.stderr.write("INFO: MODEL LOADING TO ONNX\n")
    # single-sample dummy inputs used to trace the model during export
    dummy_image = torch.zeros(1, TrainOptions.TRAIN_WINDOW, ImageSizeOptions.IMAGE_HEIGHT)
    dummy_hidden = torch.zeros(1, 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

    onnx_path = model_path + ".onnx"
    onnx_missing = not os.path.isfile(onnx_path)
    if onnx_missing:
        sys.stderr.write("INFO: SAVING MODEL TO ONNX\n")

        input_names = ['input_image', 'input_hidden']
        output_names = ['output_pred', 'output_rle', 'output_hidden']
        # the first axis of every input and output is the (variable) batch dimension
        dynamic_axes = {tensor_name: {0: 'batch_size'} for tensor_name in input_names + output_names}

        torch.onnx.export(transducer_model,
                          (dummy_image, dummy_hidden),
                          onnx_path,
                          training=False,
                          opset_version=10,
                          do_constant_folding=True,
                          input_names=input_names,
                          output_names=output_names,
                          dynamic_axes=dynamic_axes)

    # arguments forwarded to every spawned caller
    caller_args = (output_filepath, model_path, batch_size, num_workers, threads)

    # start the caller processes and wait for all of them to finish
    mp.spawn(setup,
             args=(total_callers, caller_args, file_chunks),
             nprocs=total_callers,
             join=True)
def train(train_file, test_file, batch_size, epoch_limit, gpu_mode, num_workers, retrain_model,
          retrain_model_path, gru_layers, hidden_size, lr, decay, model_dir, stats_dir, not_hyperband):
    """
    This method implements the training scheme of HELEN. It takes a set of training images and a set of testing
    images and trains the model on the training images and after each epoch it evaluates the trained model on the
    test image set. It saves all model state after each epoch regardless of its performance. It also saves some
    statistics of the saved models and the training run.

    :param train_file: Path to train image set.
    :param test_file: Path to test image set.
    :param batch_size: Batch size for minibatch operation
    :param epoch_limit: Number of iterations the training will go for
    :param gpu_mode: If True, training and testing will be done on GPU
    :param num_workers: Number of workers for dataloader
    :param retrain_model: If True then it will load a previously-trained model for retraining
    :param retrain_model_path: Path to a previously trained model
    :param gru_layers: Number of GRU layers in the model
    :param hidden_size: Hidden size of the model
    :param lr: Learning Rate for the optimizer
    :param decay: Weight Decay for the optimizer
    :param model_dir: Directory where models will be saved
    :param stats_dir: Directory where statistics of this run will be saved
    :param not_hyperband: This is used by hyperband. If True then hyperband is not running.
    :return: None
    """
    train_loss_logger = None
    test_loss_logger = None
    confusion_matrix_logger = None

    try:
        # if hyperband is not running then create stat logger for train, test and confusion
        if not_hyperband is True:
            train_loss_logger = open(stats_dir + "train_loss.csv", 'w')
            test_loss_logger = open(stats_dir + "test_loss.csv", 'w')
            confusion_matrix_logger = open(stats_dir + "confusion_matrix.txt", 'w')

        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

        # initialize training dataset loader
        train_data_set = SequenceDataset(train_file)
        train_loader = DataLoader(train_data_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=gpu_mode)
        num_base_classes = ImageSizeOptions.TOTAL_BASE_LABELS
        num_rle_classes = ImageSizeOptions.TOTAL_RLE_LABELS

        # if retrain model is true then load the model from the model path
        if retrain_model is True:
            if os.path.isfile(retrain_model_path) is False:
                # reset the colour before leaving so the terminal is not left red
                sys.stderr.write(TextColor.RED +
                                 "ERROR: INVALID PATH TO RETRAIN PATH MODEL --retrain_model_path\n" +
                                 TextColor.END)
                # exit() is an interactive helper from the site module; use sys.exit in programs
                sys.exit(1)
            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADING\n" + TextColor.END)
            transducer_model, hidden_size, gru_layers, prev_ite = \
                ModelHandler.load_simple_model(retrain_model_path,
                                               input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                               image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                               seq_len=ImageSizeOptions.SEQ_LENGTH,
                                               num_base_classes=num_base_classes,
                                               num_rle_classes=num_rle_classes)

            # continue counting epochs from where the loaded model stopped
            if not_hyperband is True:
                epoch_limit = prev_ite + epoch_limit

            sys.stderr.write(TextColor.GREEN + "INFO: RETRAIN MODEL LOADED\n" + TextColor.END)
        else:
            # if training from scratch, then create a new model
            transducer_model = ModelHandler.get_new_gru_model(
                input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                image_features=ImageSizeOptions.IMAGE_HEIGHT,
                gru_layers=gru_layers,
                hidden_size=hidden_size,
                num_base_classes=num_base_classes,
                num_rle_classes=num_rle_classes)
            prev_ite = 0

        # count the number of trainable parameters for reporting
        param_count = sum(p.numel() for p in transducer_model.parameters() if p.requires_grad)
        sys.stderr.write(TextColor.RED + "INFO: TOTAL TRAINABLE PARAMETERS:\t" + str(param_count) + "\n" +
                         TextColor.END)

        # create a model optimizer
        model_optimizer = torch.optim.Adam(transducer_model.parameters(), lr=lr, weight_decay=decay)
        # this learning rate scheduler reduces learning rate when model reaches plateau
        # NOTE(review): the scheduler is bound to the optimizer object created above; this
        # assumes load_simple_optimizer below returns that same object - confirm
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, 'min')

        # if retrain model is true then load the optimizer
        if retrain_model is True:
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADING\n" + TextColor.END)
            model_optimizer = ModelHandler.load_simple_optimizer(model_optimizer, retrain_model_path, gpu_mode)
            sys.stderr.write(TextColor.GREEN + "INFO: OPTIMIZER LOADED\n" + TextColor.END)

        class_weights = torch.Tensor(TrainOptions.CLASS_WEIGHTS)
        # we perform a multi-task classification, so we need two loss functions, each performing a single task
        # criterion base is the loss function for base prediction
        criterion_base = nn.CrossEntropyLoss()
        # criterion rle is the loss function for RLE prediction
        criterion_rle = nn.CrossEntropyLoss(weight=class_weights)

        # if gpu mode is true then transfer the model and loss functions to cuda
        if gpu_mode is True:
            transducer_model = torch.nn.DataParallel(transducer_model).cuda()
            criterion_base = criterion_base.cuda()
            criterion_rle = criterion_rle.cuda()

        start_epoch = prev_ite

        # Train the Model
        sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
        # create stats dicts
        stats = dict()
        stats['loss_epoch'] = []
        stats['accuracy_epoch'] = []
        sys.stderr.write(TextColor.BLUE + 'Start: ' + str(start_epoch + 1) + ' End: ' + str(epoch_limit) + "\n")

        # for each epoch we iterate over the training dataset once
        for epoch in range(start_epoch, epoch_limit, 1):
            total_loss_base = 0
            total_loss_rle = 0
            total_loss = 0
            total_images = 0
            sys.stderr.write(TextColor.BLUE + 'Train epoch: ' + str(epoch + 1) + "\n")
            batch_no = 1

            # tqdm is the progress bar we use for logging; the with-block closes it
            with tqdm(total=len(train_loader), desc='Loss', leave=True, ncols=100) as progress_bar:
                # make sure the model is in train mode. BN is different in train and eval.
                transducer_model.train()
                for images, label_base, label_rle in train_loader:
                    # convert the tensors to the proper datatypes.
                    images = images.type(torch.FloatTensor)
                    label_base = label_base.type(torch.LongTensor)
                    label_rle = label_rle.type(torch.LongTensor)

                    # initialize the hidden input for the first chunk
                    hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

                    # if gpu_mode is true then transfer all tensors to cuda
                    if gpu_mode:
                        images = images.cuda()
                        label_base = label_base.cuda()
                        label_rle = label_rle.cuda()
                        hidden = hidden.cuda()

                    # perform a sliding window on the entire image sequence length
                    for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                        # we optimize over each chunk
                        model_optimizer.zero_grad()

                        # if current position + window size goes beyond the sequence length,
                        # that means we've reached the end
                        if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                            break

                        # get the chunks for this window
                        image_chunk = images[:, i:i + TrainOptions.TRAIN_WINDOW]
                        label_base_chunk = label_base[:, i:i + TrainOptions.TRAIN_WINDOW]
                        label_rle_chunk = label_rle[:, i:i + TrainOptions.TRAIN_WINDOW]

                        # get the inference from the model
                        output_base, output_rle, hidden = transducer_model(image_chunk, hidden)

                        # calculate loss for base prediction
                        loss_base = criterion_base(output_base.contiguous().view(-1, num_base_classes),
                                                   label_base_chunk.contiguous().view(-1))
                        # calculate loss for RLE prediction
                        loss_rle = criterion_rle(output_rle.contiguous().view(-1, num_rle_classes),
                                                 label_rle_chunk.contiguous().view(-1))

                        # sum the losses to have a single optimization over multiple tasks
                        loss = loss_base + loss_rle

                        # backpropagation and weight update
                        loss.backward()
                        model_optimizer.step()

                        # update the loss values
                        total_loss += loss.item()
                        total_loss_base += loss_base.item()
                        total_loss_rle += loss_rle.item()
                        total_images += image_chunk.size(0)

                        # detach the hidden from the graph as the next chunk will be a new optimization
                        hidden = hidden.detach()

                    # update the progress bar
                    avg_loss = (total_loss / total_images) if total_images else 0
                    progress_bar.set_description("Base: " + str(round(total_loss_base, 4)) +
                                                 ", RLE: " + str(round(total_loss_rle, 4)) +
                                                 ", TOTAL: " + str(round(total_loss, 4)))

                    if not_hyperband is True:
                        train_loss_logger.write(str(epoch + 1) + "," + str(batch_no) + "," + str(avg_loss) + "\n")

                    progress_bar.refresh()
                    progress_bar.update(1)
                    batch_no += 1

            # after each epoch, evaluate the current state of the model
            stats_dictionary = test(test_file, batch_size, gpu_mode, transducer_model, num_workers,
                                    gru_layers, hidden_size,
                                    num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                    num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)
            stats['loss'] = stats_dictionary['loss']
            stats['accuracy'] = stats_dictionary['accuracy']
            stats['loss_epoch'].append((epoch, stats_dictionary['loss']))
            stats['accuracy_epoch'].append((epoch, stats_dictionary['accuracy']))

            # let the scheduler react to the test loss of this epoch
            lr_scheduler.step(stats['loss'])

            # save the model after each epoch and update the loggers after each epoch
            if not_hyperband is True:
                ModelHandler.save_model(transducer_model, model_optimizer,
                                        hidden_size, gru_layers, epoch,
                                        model_dir + "HELEN_epoch_" + str(epoch + 1) + '_checkpoint.pkl')
                sys.stderr.write(TextColor.RED + "\nMODEL SAVED SUCCESSFULLY.\n" + TextColor.END)

                test_loss_logger.write(str(epoch + 1) + "," + str(stats['loss']) + "," +
                                       str(stats['accuracy']) + "\n")
                confusion_matrix_logger.write(str(epoch + 1) + "\n" +
                                              str(stats_dictionary['base_confusion_matrix']) + "\n")
                train_loss_logger.flush()
                test_loss_logger.flush()
                confusion_matrix_logger.flush()

        # notify that the model has finished training.
        sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)
    finally:
        # release the statistics files on every exit path
        for logger in (train_loss_logger, test_loss_logger, confusion_matrix_logger):
            if logger is not None:
                logger.close()
def predict(test_file, output_filename, model_path, batch_size, num_workers, rank, device_id):
    """
    The predict method loads images generated by MarginPolish and produces base predictions using a
    sequence transduction model based deep neural network. This method loads the model and iterates over
    minibatch images to generate the predictions and saves the predictions to a hdf5 file.

    Every caller writes to its own file named ``<output_filename>_<rank>.hdf``.

    :param test_file: File to predict on
    :param output_filename: Name and path to the output file (rank and extension are appended)
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for minibatch prediction
    :param num_workers: Number of workers to be used by the dataloader
    :param rank: Rank of this caller; only rank 0 prints progress
    :param device_id: Device id of the model to set to
    :return: None
    """
    # create the output hdf5 file where all the predictions will be saved
    # NOTE(review): the DataStore is never explicitly closed in this function - confirm
    # that it flushes its contents when it is garbage collected
    prediction_data_file = DataStore(output_filename + "_" + str(rank) + ".hdf", mode='w')

    # load the model using the model path
    transducer_model, _hidden_size, _gru_layers, _prev_ite = \
        ModelHandler.load_simple_model(model_path,
                                       input_channels=ImageSizeOptions.IMAGE_CHANNELS,
                                       image_features=ImageSizeOptions.IMAGE_HEIGHT,
                                       seq_len=ImageSizeOptions.SEQ_LENGTH,
                                       num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
                                       num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)

    torch.cuda.set_device(device_id)
    transducer_model.to(device_id)
    # set the model to evaluation mode before wrapping it
    transducer_model.eval()
    transducer_model = DistributedDataParallel(transducer_model, device_ids=[device_id])

    # only output for rank 0 caller
    if rank == 0:
        print(output_filename + "_" + str(rank) + ".hdf")
        sys.stderr.write(TextColor.GREEN + 'INFO: TORCH THREADS SET TO: ' + str(torch.get_num_threads()) + ".\n"
                         + TextColor.END)
        # notify that the process has started and loading data
        sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    # create a pytorch dataset and dataloader that loads the data in mini_batches
    test_data = SequenceDataset(image_directory=None, file_list=test_file)
    test_loader = DataLoader(test_data,
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)

    # iterate over the data in minibatches
    with torch.no_grad():
        # keep an eye for batch
        total_batches = len(test_loader)
        batch_iterator = 0

        # the dataloader loop, iterates in minibatches.
        for contig, contig_start, contig_end, chunk_id, images, position, filename in test_loader:
            start_time = time.time()
            # the images are usually in uint8, convert them to FloatTensor
            images = images.type(torch.FloatTensor)
            # initialize the first hidden input as all zeros
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

            # move the images and the hidden layers to CUDA once, before slicing any window
            images = images.to(device_id)
            hidden = hidden.to(device_id)

            # this is a multi-task neural network where we predict a base and a run-length. We use two
            # tensors to accumulate the predictions of each of the chunks and later take the best label
            # at every position to get a prediction for the whole sequence.
            prediction_base_tensor = torch.zeros(
                (images.size(0), images.size(1), ImageSizeOptions.TOTAL_BASE_LABELS))
            prediction_rle_tensor = torch.zeros(
                (images.size(0), images.size(1), ImageSizeOptions.TOTAL_RLE_LABELS))

            # move the tensors to CUDA
            prediction_base_tensor = prediction_base_tensor.to(device_id)
            prediction_rle_tensor = prediction_rle_tensor.to(device_id)

            # we iterate over the sequence on a sliding window basis where we process
            # the window size then jump to the next window
            for i in range(0, ImageSizeOptions.SEQ_LENGTH, TrainOptions.WINDOW_JUMP):
                # if current position + window size goes beyond the sequence length,
                # that means we've reached the end
                if i + TrainOptions.TRAIN_WINDOW > ImageSizeOptions.SEQ_LENGTH:
                    break
                chunk_start = i
                chunk_end = i + TrainOptions.TRAIN_WINDOW

                # get the image chunk
                image_chunk = images[:, chunk_start:chunk_end]

                # run inference
                output_base, output_rle, hidden = transducer_model(image_chunk, hidden)

                # now calculate how much padding is on the top and bottom of this chunk so we can do a simple
                # add operation
                top_zeros = chunk_start
                bottom_zeros = ImageSizeOptions.SEQ_LENGTH - chunk_end

                # we run a softmax and a padding to make the output tensor compatible for adding
                inference_layers = nn.Sequential(
                    nn.Softmax(dim=2),
                    nn.ZeroPad2d((0, 0, top_zeros, bottom_zeros)))
                inference_layers = inference_layers.to(device_id)

                # run the softmax and padding layers
                base_prediction = inference_layers(output_base)
                rle_prediction = inference_layers(output_rle)

                # now simply add the tensor to the global counter
                prediction_base_tensor = torch.add(prediction_base_tensor, base_prediction)
                prediction_rle_tensor = torch.add(prediction_rle_tensor, rle_prediction)

            # all done now create a SEQ_LENGTH long prediction list
            prediction_base_tensor = prediction_base_tensor.cpu()
            prediction_rle_tensor = prediction_rle_tensor.cpu()

            # the label with the highest accumulated score wins at every position
            _, base_labels = torch.max(prediction_base_tensor, 2)
            _, rle_labels = torch.max(prediction_rle_tensor, 2)

            predicted_base_labels = base_labels.cpu().numpy()
            predicted_rle_labels = rle_labels.cpu().numpy()

            batch_iterator += 1

            if rank == 0:
                # estimate the remaining time from the duration of the batch just processed
                eta = int((time.time() - start_time) * (total_batches - batch_iterator))
                # split the estimate into whole hours, the minutes left after them, and seconds
                hours, remainder = divmod(eta, 3600)
                mins, secs = divmod(remainder, 60)
                time_stamp = str(hours) + " HOURS " + str(mins) + " MINS " + str(secs) + " SECS."
                batch_string = "BATCHES DONE: " + str(batch_iterator) + "/" + str(total_batches) + ". "
                time_left = "ESTIMATED TIME LEFT: " + str(time_stamp)
                sys.stderr.write(TextColor.GREEN + "INFO: " + batch_string + time_left + "\n" + TextColor.END)

            # go to each of the images and save the predictions to the file
            for i in range(images.size(0)):
                prediction_data_file.write_prediction(contig[i], contig_start[i], contig_end[i], chunk_id[i],
                                                      position[i], predicted_base_labels[i],
                                                      predicted_rle_labels[i], filename[i])
def predict(test_file, output_filename, model_path, batch_size, num_workers, threads, gpu_mode):
    """
    Run the trained sequence transduction model over every image of test_file and store the predicted
    base and run-length labels in an hdf5 file.

    Each image is processed window by window; the softmax outputs of all windows are summed per position
    and the label with the largest sum is written out.

    :param test_file: File to predict on
    :param output_filename: Name and path to the output file
    :param model_path: Path to a trained model
    :param batch_size: Batch size used for minibatch prediction
    :param num_workers: Number of workers to be used by the dataloader
    :param threads: Number of threads to use with pytorch
    :param gpu_mode: If true, predictions will be done over GPU
    :return: None
    """
    # the hdf5 file that receives every prediction
    prediction_data_file = DataStore(output_filename, mode='w')

    torch.set_num_threads(threads)
    thread_count = str(torch.get_num_threads())
    sys.stderr.write(TextColor.GREEN + 'INFO: TORCH THREADS SET TO: ' + thread_count + ".\n" + TextColor.END)

    # announce that data loading is starting
    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)

    # dataset and loader that hand out the images in minibatches, in file order
    test_loader = DataLoader(SequenceDataset(test_file),
                             batch_size=batch_size,
                             shuffle=False,
                             num_workers=num_workers)

    # restore the trained model; only the model itself is used below
    transducer_model, _hidden_size, _gru_layers, _prev_ite = ModelHandler.load_simple_model(
        model_path,
        input_channels=ImageSizeOptions.IMAGE_CHANNELS,
        image_features=ImageSizeOptions.IMAGE_HEIGHT,
        seq_len=ImageSizeOptions.SEQ_LENGTH,
        num_base_classes=ImageSizeOptions.TOTAL_BASE_LABELS,
        num_rle_classes=ImageSizeOptions.TOTAL_RLE_LABELS)

    # inference only, so switch to evaluation mode
    transducer_model.eval()

    # in gpu mode the model is wrapped for data parallelism and moved to the GPUs
    if gpu_mode:
        transducer_model = torch.nn.DataParallel(transducer_model).cuda()

    def on_device(item):
        # move a tensor or module to the GPU in gpu mode, otherwise hand it back untouched
        if gpu_mode:
            return item.cuda()
        return item

    # the model is ready
    sys.stderr.write(TextColor.CYAN + 'MODEL LOADED\n')

    seq_length = ImageSizeOptions.SEQ_LENGTH
    window_size = TrainOptions.TRAIN_WINDOW

    with torch.no_grad():
        # one iteration per minibatch, tqdm reports the progress
        for contig, contig_start, contig_end, chunk_id, images, position, filename in tqdm(test_loader, ncols=50):
            # images usually arrive as uint8, the model wants floats
            images = images.type(torch.FloatTensor)
            # the hidden state of the first window is all zeros
            hidden = torch.zeros(images.size(0), 2 * TrainOptions.GRU_LAYERS, TrainOptions.HIDDEN_SIZE)

            images = on_device(images)
            hidden = on_device(hidden)

            batch_count = images.size(0)
            positions_per_image = images.size(1)

            # per-position score accumulators for the two tasks (base and run-length)
            base_scores = on_device(
                torch.zeros((batch_count, positions_per_image, ImageSizeOptions.TOTAL_BASE_LABELS)))
            rle_scores = on_device(
                torch.zeros((batch_count, positions_per_image, ImageSizeOptions.TOTAL_RLE_LABELS)))

            # slide the window over the sequence
            for window_start in range(0, seq_length, TrainOptions.WINDOW_JUMP):
                window_end = window_start + window_size
                # a window that would run past the end of the sequence ends the sweep
                if window_end > seq_length:
                    break

                # run inference on the current window
                output_base, output_rle, hidden = transducer_model(images[:, window_start:window_end], hidden)

                # softmax the window output, then zero-pad it before and after the window so that
                # it lines up with the full-length accumulators
                pad_before = window_start
                pad_after = seq_length - window_end
                inference_layers = on_device(nn.Sequential(
                    nn.Softmax(dim=2),
                    nn.ZeroPad2d((0, 0, pad_before, pad_after))))

                # accumulate the padded window scores
                base_scores = base_scores + inference_layers(output_base)
                rle_scores = rle_scores + inference_layers(output_rle)

            # pick the best scoring label at every position, on the CPU
            _, base_labels = torch.max(base_scores.cpu(), 2)
            _, rle_labels = torch.max(rle_scores.cpu(), 2)

            predicted_base_labels = base_labels.cpu().numpy()
            predicted_rle_labels = rle_labels.cpu().numpy()

            # store the prediction of every image in this minibatch
            for image_index in range(batch_count):
                prediction_data_file.write_prediction(contig[image_index],
                                                      contig_start[image_index],
                                                      contig_end[image_index],
                                                      chunk_id[image_index],
                                                      position[image_index],
                                                      predicted_base_labels[image_index],
                                                      predicted_rle_labels[image_index],
                                                      filename[image_index])