class UNetExperiment(PytorchExperiment):
    """
    The UnetExperiment is inherited from the PytorchExperiment. It implements the basic life cycle
    for a segmentation task with UNet (https://arxiv.org/abs/1505.04597).
    It is optimized to work with the provided NumpyDataLoader.

    The basic life cycle of a UnetExperiment is the same as PytorchExperiment:

        setup()
        (--> Automatically restore values if a previous checkpoint is given)
        prepare()

        for epoch in n_epochs:
            train()
            validate()
            (--> save current checkpoint)

        end()
    """

    def setup(self):
        """Build data loaders, model, losses, optimizer and scheduler from ``self.config``.

        Reads ``splits.pkl`` from ``config.split_dir`` and uses the entry selected by
        ``config.fold`` to obtain the train/val/test keys. Optionally restores model
        weights from ``config.checkpoint_dir`` and always writes a "checkpoint_start"
        checkpoint.
        """
        pkl_dir = self.config.split_dir
        # NOTE: pickle.load executes arbitrary code; only use split files you trust.
        with open(os.path.join(pkl_dir, "splits.pkl"), 'rb') as f:
            splits = pickle.load(f)

        fold_split = splits[self.config.fold]
        tr_keys = fold_split['train']
        val_keys = fold_split['val']
        test_keys = fold_split['test']

        self.device = torch.device(
            self.config.device if torch.cuda.is_available() else "cpu")

        self.train_data_loader = NumpyDataSet(
            self.config.data_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=tr_keys)
        self.val_data_loader = NumpyDataSet(
            self.config.data_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=val_keys,
            mode="val",
            do_reshuffle=False)
        self.test_data_loader = NumpyDataSet(
            self.config.data_test_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=test_keys,
            mode="test",
            do_reshuffle=False)

        self.model = UNet(num_classes=self.config.num_classes,
                          in_channels=self.config.in_channels)
        self.model.to(self.device)

        # We use a combination of DICE-loss and CE-Loss in this example.
        # This proved good in the medical segmentation decathlon.
        # The Dice loss is fed softmax probabilities ...
        self.dice_loss = SoftDiceLoss(batch_dice=True)
        # ... while the CE loss gets raw logits: torch applies the softmax internally.
        self.ce_loss = torch.nn.CrossEntropyLoss()

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.learning_rate)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min')

        # If directory for checkpoint is provided, we load it.
        if self.config.do_load_checkpoint:
            if self.config.checkpoint_dir == '':
                print(
                    'checkpoint_dir is empty, please provide directory to load checkpoint.'
                )
            else:
                # ("model",) is a real one-element tuple; ("model") is just the string.
                self.load_checkpoint(name=self.config.checkpoint_dir,
                                     save_types=("model",))

        self.save_checkpoint(name="checkpoint_start")
        self.elog.print('Experiment set up.')

    def train(self, epoch):
        """Run one training epoch over ``self.train_data_loader``.

        Args:
            epoch: Index of the current epoch; used as the counter for logged results
                and images.
        """
        self.elog.print('=====TRAIN=====')
        self.model.train()

        data = None
        batch_counter = 0
        for data_batch in self.train_data_loader:

            self.optimizer.zero_grad()

            # Shape of data_batch = [1, b, c, w, h]
            # Desired shape = [b, c, w, h]
            # Move data and target to the GPU
            data = data_batch['data'][0].float().to(self.device)
            target = data_batch['seg'][0].long().to(self.device)

            pred = self.model(data)
            # SoftDiceLoss expects probabilities; the CE loss applies softmax internally.
            pred_softmax = F.softmax(pred, dim=1)

            # NOTE(review): squeeze() drops every size-1 dim, including the batch dim
            # when a batch holds a single sample -- confirm the loader never yields one.
            squeezed_target = target.squeeze()
            loss = self.dice_loss(pred_softmax, squeezed_target) + self.ce_loss(
                pred, squeezed_target)

            loss.backward()
            self.optimizer.step()

            # Some logging and plotting
            if (batch_counter % self.config.plot_freq) == 0:
                loss_value = loss.item()
                self.elog.print('Epoch: %d Loss: %.4f' %
                                (self._epoch_idx, loss_value))

                self.add_result(
                    value=loss_value,
                    name='Train_Loss',
                    tag='Loss',
                    counter=epoch +
                    (batch_counter /
                     self.train_data_loader.data_loader.num_batches))

                # Hand detached CPU tensors to the image logger.
                pred_cpu = pred.detach().cpu()
                self.clog.show_image_grid(data.float().cpu(),
                                          name="data",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)
                self.clog.show_image_grid(target.float().cpu(),
                                          name="mask",
                                          title="Mask",
                                          n_iter=epoch)
                self.clog.show_image_grid(torch.argmax(pred_cpu,
                                                       dim=1,
                                                       keepdim=True),
                                          name="unt_argmax",
                                          title="Unet",
                                          n_iter=epoch)
                self.clog.show_image_grid(pred_cpu[:, 1:2],
                                          name="unt",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)

            batch_counter += 1

        assert data is not None, 'data is None. Please check if your dataloader works properly'

    def validate(self, epoch):
        """Evaluate on ``self.val_data_loader`` and step the LR scheduler on the mean loss.

        Args:
            epoch: Index of the current epoch; used as the counter for logged results
                and images.
        """
        self.elog.print('VALIDATE')
        self.model.eval()

        data = None
        loss_list = []

        with torch.no_grad():
            for data_batch in self.val_data_loader:
                data = data_batch['data'][0].float().to(self.device)
                target = data_batch['seg'][0].long().to(self.device)

                pred = self.model(data)
                # Explicit class dimension, matching train(); the implicit-dim form
                # of softmax is deprecated.
                pred_softmax = F.softmax(pred, dim=1)

                squeezed_target = target.squeeze()
                loss = self.dice_loss(pred_softmax,
                                      squeezed_target) + self.ce_loss(
                                          pred, squeezed_target)
                loss_list.append(loss.item())

        assert data is not None, 'data is None. Please check if your dataloader works properly'

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        self.elog.print('Epoch: %d Loss: %.4f' % (self._epoch_idx, mean_loss))

        self.add_result(value=mean_loss,
                        name='Val_Loss',
                        tag='Loss',
                        counter=epoch + 1)

        # Images of the last validation batch.
        pred_cpu = pred.detach().cpu()
        self.clog.show_image_grid(data.float().cpu(),
                                  name="data_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)
        self.clog.show_image_grid(target.float().cpu(),
                                  name="mask_val",
                                  title="Mask",
                                  n_iter=epoch)
        self.clog.show_image_grid(torch.argmax(pred_cpu, dim=1, keepdim=True),
                                  name="unt_argmax_val",
                                  title="Unet",
                                  n_iter=epoch)
        self.clog.show_image_grid(pred_cpu[:, 1:2],
                                  name="unt_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)

    def test(self):
        """Placeholder for the test cycle; currently only prints a reminder."""
        # TODO: implement the test cycle.
        print('TODO: Implement your test() method here')
class UNetExperiment:
    """
    This class implements the basic life cycle for a segmentation task with UNet
    (https://arxiv.org/abs/1505.04597).

    The basic life cycle of a UNetExperiment is:

        run():
            for epoch in n_epochs:
                train()
                validate()
        test()
    """

    def __init__(self, config, split, dataset):
        """Create output folder, data loaders, model, loss, optimizer and loggers.

        Args:
            config: Object providing ``n_epochs``, ``name``, ``test_results_dir``,
                ``batch_size`` and ``learning_rate``.
            split: Mapping with "train", "val" and "test" entries used to index
                ``dataset``.
            dataset: Indexable collection of volumes.
        """
        self.n_epochs = config.n_epochs
        self.split = split
        self._time_start = ""
        self._time_end = ""
        self.epoch = 0
        self.name = config.name

        # Create output folders
        dirname = f'{time.strftime("%Y-%m-%d_%H%M", time.gmtime())}_{self.name}'
        self.out_dir = os.path.join(config.test_results_dir, dirname)
        os.makedirs(self.out_dir, exist_ok=True)

        # Create data loaders.
        # Note that we are using a 2D version of UNet here, which means that it will
        # expect batches of 2D slices.
        self.train_loader = DataLoader(SlicesDataset(dataset[split["train"]]),
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       num_workers=0)
        self.val_loader = DataLoader(SlicesDataset(dataset[split["val"]]),
                                     batch_size=config.batch_size,
                                     shuffle=True,
                                     num_workers=0)

        # we will access volumes directly for testing
        self.test_data = dataset[split["test"]]

        # Do we have CUDA available?
        cuda_available = torch.cuda.is_available()
        if not cuda_available:
            print(
                "WARNING: No CUDA device is found. This may take significantly longer!"
            )
        self.device = torch.device("cuda" if cuda_available else "cpu")

        # Configure our model and other training implements.
        # We will use a recursive UNet model from German Cancer Research Center,
        # Division of Medical Image Computing.
        self.model = UNet(num_classes=3)
        self.model.to(self.device)

        # Standard cross-entropy loss. It is fed the raw model output (logits);
        # CrossEntropyLoss applies log-softmax internally.
        self.loss_function = torch.nn.CrossEntropyLoss()

        # Adam optimizer for the model weights
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)
        # Scheduler helps us update learning rate automatically
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min')

        # Set up Tensorboard. By default it saves data into runs folder.
        self.tensorboard_train_writer = SummaryWriter(comment="_train")
        self.tensorboard_val_writer = SummaryWriter(comment="_val")

    def train(self):
        """
        This method is executed once per epoch and takes care of model weight update cycle
        """
        print(f"Training epoch {self.epoch}...")
        self.model.train()

        n_batches = len(self.train_loader)

        # Loop over our minibatches
        for i, batch in enumerate(self.train_loader):
            self.optimizer.zero_grad()

            # Slices as 4D tensors of shape [BATCH_SIZE, 1, PATCH_SIZE, PATCH_SIZE]
            # (per the original task description).
            data = batch["image"].to(self.device, dtype=torch.float)
            target = batch["seg"].to(self.device)

            # Per the original author's note, prediction dims are
            # [batch_size, classes, coronal, axial].
            prediction = self.model(data)

            # Softmax'd version of the prediction is only used for logging a
            # probability map so we can see how the model converges.
            prediction_softmax = F.softmax(prediction, dim=1)

            # The loss gets raw logits and the target without its channel dim.
            loss = self.loss_function(prediction, target[:, 0, :, :])

            loss.backward()
            self.optimizer.step()

            if (i % 10) == 0:
                # Output to console on every 10th batch
                print(
                    f"\nEpoch: {self.epoch} Train loss: {loss}, {100*(i+1)/n_batches:.1f}% complete"
                )

                counter = 100 * self.epoch + 100 * (i / n_batches)

                # See log_to_tensorboard for how images are logged to Tensorboard.
                log_to_tensorboard(self.tensorboard_train_writer, loss, data,
                                   target, prediction_softmax, prediction,
                                   counter)

            print(".", end='')

        print("\nTraining complete")

    def validate(self):
        """
        This method runs validation cycle, using same metrics as Train method.
        The model is switched to eval mode, and the loop runs under torch.no_grad()
        so that no gradients are tracked.
        """
        print(f"Validating epoch {self.epoch}...")

        # eval() switches the model to evaluation mode; gradient tracking is
        # disabled separately by the no_grad() context below.
        self.model.eval()
        loss_list = []

        with torch.no_grad():
            for i, batch in enumerate(self.val_loader):
                data = batch["image"].to(self.device, dtype=torch.float)
                target = batch["seg"].to(self.device)

                prediction = self.model(data)
                prediction_softmax = F.softmax(prediction, dim=1)
                loss = self.loss_function(prediction, target[:, 0, :, :])

                print(f"Batch {i}. Data shape {data.shape} Loss {loss}")

                # We report loss that is accumulated across all of validation set
                loss_list.append(loss.item())

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        # Logs the mean loss together with images from the last validation batch.
        log_to_tensorboard(self.tensorboard_val_writer, mean_loss, data,
                           target, prediction_softmax, prediction,
                           (self.epoch + 1) * 100)
        print("Validation complete")

    def save_model_parameters(self):
        """
        Saves model parameters to a file in results directory
        """
        path = os.path.join(self.out_dir, "model.pth")
        torch.save(self.model.state_dict(), path)

    def load_model_parameters(self, path=''):
        """
        Loads model parameters from a supplied path or a results directory

        Args:
            path: Checkpoint file to load; when empty, ``model.pth`` in the
                experiment's output directory is used.

        Raises:
            FileNotFoundError: If the checkpoint file does not exist.
        """
        model_path = path if path else os.path.join(self.out_dir, "model.pth")

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Could not find path {model_path}")

        # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
        state_dict = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(state_dict)

    def run_test(self):
        """
        This runs test cycle on the test dataset.
        Inference is done on full volumes and a dictionary of per-volume and mean
        Dice/Jaccard scores is returned, which could later be persisted as JSON.

        Returns:
            dict with "volume_stats" (list of per-volume filename/dice/jaccard
            entries) and "overall" (mean_dice, mean_jaccard).
        """
        print("Testing...")
        self.model.eval()

        # Unlike train and validation, inference here runs on whole volumes
        # through the inference agent.
        inference_agent = UNetInferenceAgent(model=self.model,
                                             device=self.device)

        out_dict = {"volume_stats": []}
        dc_list = []
        jc_list = []
        n_volumes = len(self.test_data)

        # for every volume in the test set
        for i, x in enumerate(self.test_data):
            pred_label = inference_agent.single_volume_inference(x["image"])

            # Dice and Jaccard similarity coefficients assess how close the
            # predicted and reference volumes are to each other.
            dc = Dice3d(pred_label, x["seg"])
            jc = Jaccard3d(pred_label, x["seg"])
            dc_list.append(dc)
            jc_list.append(jc)

            out_dict["volume_stats"].append({
                "filename": x['filename'],
                "dice": dc,
                "jaccard": jc
            })
            print(
                f"{x['filename']} Dice {dc:.4f}. {100*(i+1)/n_volumes:.2f}% complete"
            )

        out_dict["overall"] = {
            "mean_dice": np.mean(dc_list),
            "mean_jaccard": np.mean(jc_list)
        }

        print("\nTesting complete.")
        return out_dict

    def run(self):
        """
        Kicks off train cycle and writes model parameter file at the end
        """
        self._time_start = time.time()

        print("Experiment started.")

        # Iterate over epochs; self.epoch is the loop variable so that train()
        # and validate() see the current epoch index.
        for self.epoch in range(self.n_epochs):
            self.train()
            self.validate()

        # save model for inferencing
        self.save_model_parameters()

        self._time_end = time.time()
        print(
            f"Run complete. Total time: {time.strftime('%H:%M:%S', time.gmtime(self._time_end - self._time_start))}"
        )
class UNetExperiment:
    """
    This class implements the basic life cycle for a segmentation task with UNet
    (https://arxiv.org/abs/1505.04597).

    The basic life cycle of a UNetExperiment is:

        run():
            for epoch in n_epochs:
                train()
                validate()
        test()
    """

    def __init__(self, config, split, dataset):
        """Create output folder, data loaders, model, loss, optimizer and scheduler.

        Args:
            config: Object providing ``n_epochs``, ``name``, ``test_results_dir``,
                ``batch_size`` and ``learning_rate``.
            split: Mapping with "train", "val" and "test" entries used to index
                ``dataset``.
            dataset: Indexable collection of volumes.
        """
        self.n_epochs = config.n_epochs
        self.split = split
        self._time_start = ""
        self._time_end = ""
        self.epoch = 0
        self.name = config.name

        # Create output folders
        dirname = f'{time.strftime("%Y-%m-%d_%H%M", time.gmtime())}_{self.name}'
        self.out_dir = os.path.join(config.test_results_dir, dirname)
        os.makedirs(self.out_dir, exist_ok=True)

        # Create data loaders
        self.train_loader = DataLoader(SlicesDataset(dataset[split["train"]]),
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       num_workers=0)
        self.val_loader = DataLoader(SlicesDataset(dataset[split["val"]]),
                                     batch_size=config.batch_size,
                                     shuffle=True,
                                     num_workers=0)

        # access volumes directly for testing
        self.test_data = dataset[split["test"]]

        if not torch.cuda.is_available():
            print("WARNING: No CUDA device is found. This may take significantly longer!")

        # NOTE(review): this variant pins the experiment to the CPU even when CUDA
        # is available (automatic device selection was disabled) -- confirm intent.
        self.device = torch.device('cpu')

        # use a recursive UNet model from German Cancer Research Center,
        # Division of Medical Image Computing
        self.model = UNet()
        self.model.to(self.device)

        # Standard cross-entropy loss. It must be fed the raw model output
        # (logits): CrossEntropyLoss applies log-softmax internally.
        self.loss_function = torch.nn.CrossEntropyLoss()

        # Adam optimizer for the model weights
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)
        # Scheduler helps to update learning rate automatically
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # Tensorboard logging is disabled in this variant (no SummaryWriter is created).

    def train(self):
        """
        This method is executed once per epoch and takes care of model weight update cycle
        """
        print(f"Training epoch {self.epoch}...")
        self.model.train()

        n_batches = len(self.train_loader)

        # Loop over the minibatches
        for i, batch in enumerate(self.train_loader):
            self.optimizer.zero_grad()

            data = batch['image'].to(self.device, dtype=torch.float)
            target = batch['seg'].to(self.device)

            # Per the original author's note, prediction dims are
            # [batch_size, 3 classes, coronal, axial].
            prediction = self.model(data)

            # Feed the raw logits to the loss. Passing softmax output here would
            # apply softmax twice, which bounds the loss away from zero and
            # flattens the gradients.
            loss = self.loss_function(prediction, target[:, 0, :, :])

            loss.backward()
            self.optimizer.step()

            if (i % 10) == 0:
                # Output to console on every 10th batch
                print(f"\nEpoch: {self.epoch} Train loss: {loss}, {100*(i+1)/n_batches:.1f}% complete")

            print(".", end='')

        print("\nTraining complete")

    def validate(self):
        """
        This method runs validation cycle, using same metrics as Train method.
        The model is switched to eval mode, and the loop runs under torch.no_grad()
        so that no gradients are tracked.
        """
        print(f"Validating epoch {self.epoch}...")

        # eval() switches the model to evaluation mode; gradient tracking is
        # disabled separately by the no_grad() context below.
        self.model.eval()
        loss_list = []

        with torch.no_grad():
            for i, batch in enumerate(self.val_loader):
                data = batch['image'].to(self.device, dtype=torch.float)
                target = batch['seg'].to(self.device)

                prediction = self.model(data)
                # Raw logits go into the loss, same as in train().
                loss = self.loss_function(prediction, target[:, 0, :, :])

                print(f"Batch {i}. Data shape {data.shape} Loss {loss}")

                # We report loss that is accumulated across all of validation set
                loss_list.append(loss.item())

        self.scheduler.step(np.mean(loss_list))

        print("Validation complete")

    def save_model_parameters(self):
        """
        Saves model parameters to a file in results directory
        """
        path = os.path.join(self.out_dir, "model.pth")
        torch.save(self.model.state_dict(), path)

    def load_model_parameters(self, path=''):
        """
        Loads model parameters from a supplied path or a results directory

        Args:
            path: Checkpoint file to load; when empty, ``model.pth`` in the
                experiment's output directory is used.

        Raises:
            FileNotFoundError: If the checkpoint file does not exist.
        """
        model_path = path if path else os.path.join(self.out_dir, "model.pth")

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Could not find path {model_path}")

        # map_location lets a checkpoint saved on a GPU load onto this
        # experiment's device.
        state_dict = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(state_dict)

    def run_test(self):
        """
        This runs test cycle on the test dataset.
        Inference is done on full volumes and a dictionary of per-volume and mean
        Dice/Jaccard scores is returned, which could later be persisted as JSON.

        Returns:
            dict with "volume_stats" (list of per-volume filename/dice/jaccard
            entries) and "overall" (mean_dice, mean_jaccard).
        """
        print("Testing...")
        self.model.eval()

        inference_agent = UNetInferenceAgent(model=self.model, device=self.device)

        out_dict = {"volume_stats": []}
        dc_list = []
        jc_list = []
        n_volumes = len(self.test_data)

        # for every volume in the test set
        for i, x in enumerate(self.test_data):
            pred_label = inference_agent.single_volume_inference(x["image"])

            # We compute and report Dice and Jaccard similarity coefficients which
            # assess how close our volumes are to each other
            dc = Dice3d(pred_label, x["seg"])
            jc = Jaccard3d(pred_label, x["seg"])
            dc_list.append(dc)
            jc_list.append(jc)

            out_dict["volume_stats"].append({
                "filename": x['filename'],
                "dice": dc,
                "jaccard": jc
            })
            print(f"{x['filename']} Dice {dc:.4f}. {100*(i+1)/n_volumes:.2f}% complete")

        out_dict["overall"] = {
            "mean_dice": np.mean(dc_list),
            "mean_jaccard": np.mean(jc_list)}

        print("\nTesting complete.")
        return out_dict

    def run(self):
        """
        Kicks off train cycle and writes model parameter file at the end
        """
        self._time_start = time.time()

        print("Experiment started.")

        # Iterate over epochs; self.epoch is the loop variable so that train()
        # and validate() see the current epoch index.
        for self.epoch in range(self.n_epochs):
            self.train()
            self.validate()

        # save model for inferencing
        self.save_model_parameters()

        self._time_end = time.time()
        print(f"Run complete. Total time: {time.strftime('%H:%M:%S', time.gmtime(self._time_end - self._time_start))}")
class UNetExperiment(PytorchExperiment):
    """
    The UnetExperiment is inherited from the PytorchExperiment. It implements the basic life cycle
    for a segmentation task with UNet (https://arxiv.org/abs/1505.04597).
    It is optimized to work with the provided NumpyDataLoader.

    The basic life cycle of a UnetExperiment is the same as PytorchExperiment:

        setup()
        (--> Automatically restore values if a previous checkpoint is given)
        prepare()

        for epoch in n_epochs:
            train()
            validate()
            (--> save current checkpoint)

        end()
    """

    def setup(self):
        """Build data loaders, model, losses, optimizer and scheduler from ``self.config``.

        Reads ``splits.pkl`` from ``config.split_dir`` and uses the entry selected by
        ``config.fold`` to obtain the train/val/test keys. Optionally restores model
        weights from ``config.checkpoint_dir`` and always writes a "checkpoint_start"
        checkpoint.
        """
        pkl_dir = self.config.split_dir
        # NOTE: pickle.load executes arbitrary code; only use split files you trust.
        with open(os.path.join(pkl_dir, "splits.pkl"), 'rb') as f:
            splits = pickle.load(f)

        fold_split = splits[self.config.fold]
        tr_keys = fold_split['train']
        val_keys = fold_split['val']
        test_keys = fold_split['test']

        self.device = torch.device(
            self.config.device if torch.cuda.is_available() else "cpu")

        self.train_data_loader = NumpyDataSet(
            self.config.data_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=tr_keys)
        self.val_data_loader = NumpyDataSet(
            self.config.data_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=val_keys,
            mode="val",
            do_reshuffle=False)
        self.test_data_loader = NumpyDataSet(
            self.config.data_test_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=test_keys,
            mode="test",
            do_reshuffle=False)

        self.model = UNet(num_classes=self.config.num_classes,
                          in_channels=self.config.in_channels)
        self.model.to(self.device)

        # Both losses are created here, but train() and validate() in this
        # variant only use the CE loss (the Dice term is disabled there).
        self.dice_loss = SoftDiceLoss(batch_dice=True)  # expects softmax probabilities
        self.ce_loss = torch.nn.CrossEntropyLoss()  # expects raw logits

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.learning_rate)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min')

        # If directory for checkpoint is provided, we load it.
        if self.config.do_load_checkpoint:
            if self.config.checkpoint_dir == '':
                print(
                    'checkpoint_dir is empty, please provide directory to load checkpoint.'
                )
            else:
                # ("model",) is a real one-element tuple; ("model") is just the string.
                self.load_checkpoint(name=self.config.checkpoint_dir,
                                     save_types=("model",))

        self.save_checkpoint(name="checkpoint_start")
        self.elog.print('Experiment set up.')

    def train(self, epoch):
        """Run one training epoch over ``self.train_data_loader`` using the CE loss.

        Args:
            epoch: Index of the current epoch; used as the counter for logged results
                and images.
        """
        self.elog.print('=====TRAIN=====')
        self.model.train()

        data = None
        batch_counter = 0
        for data_batch in self.train_data_loader:

            self.optimizer.zero_grad()

            # Shape of data_batch = [1, b, c, w, h]
            # Desired shape = [b, c, w, h]
            # Move data and target to the GPU
            data = data_batch['data'][0].float().to(self.device)
            target = data_batch['seg'][0].long().to(self.device)

            pred = self.model(data)

            # Only the CE loss is used here; it takes raw logits, so no softmax
            # is computed.
            # NOTE(review): squeeze() drops every size-1 dim, including the batch dim
            # when a batch holds a single sample -- confirm the loader never yields one.
            loss = self.ce_loss(pred, target.squeeze())

            loss.backward()
            self.optimizer.step()

            # Some logging and plotting
            if (batch_counter % self.config.plot_freq) == 0:
                loss_value = loss.item()
                self.elog.print('Epoch: {0} Loss: {1:.4f}'.format(
                    self._epoch_idx, loss_value))

                self.add_result(
                    value=loss_value,
                    name='Train_Loss',
                    tag='Loss',
                    counter=epoch +
                    (batch_counter /
                     self.train_data_loader.data_loader.num_batches))

                pred_cpu = pred.detach().cpu()
                self.clog.show_image_grid(data.float().cpu(),
                                          name="data",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)
                self.clog.show_image_grid(target.float().cpu(),
                                          name="mask",
                                          title="Mask",
                                          n_iter=epoch)
                self.clog.show_image_grid(torch.argmax(pred_cpu,
                                                       dim=1,
                                                       keepdim=True),
                                          name="unt_argmax",
                                          title="Unet",
                                          n_iter=epoch)
                self.clog.show_image_grid(pred_cpu[:, 1:2],
                                          name="unt",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)

            batch_counter += 1

        assert data is not None, 'data is None. Please check if your dataloader works properly'

    def validate(self, epoch):
        """Evaluate on ``self.val_data_loader`` and step the LR scheduler on the mean CE loss.

        Args:
            epoch: Index of the current epoch; used as the counter for logged results
                and images.
        """
        self.elog.print('VALIDATE')
        self.model.eval()

        data = None
        loss_list = []

        with torch.no_grad():
            for data_batch in self.val_data_loader:
                data = data_batch['data'][0].float().to(self.device)
                target = data_batch['seg'][0].long().to(self.device)

                pred = self.model(data)

                # Same CE-only loss as in train().
                loss = self.ce_loss(pred, target.squeeze())
                loss_list.append(loss.item())

        assert data is not None, 'data is None. Please check if your dataloader works properly'

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        self.elog.print('Epoch: %d Loss: %.4f' % (self._epoch_idx, mean_loss))

        self.add_result(value=mean_loss,
                        name='Val_Loss',
                        tag='Loss',
                        counter=epoch + 1)

        # Images of the last validation batch.
        pred_cpu = pred.detach().cpu()
        self.clog.show_image_grid(data.float().cpu(),
                                  name="data_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)
        self.clog.show_image_grid(target.float().cpu(),
                                  name="mask_val",
                                  title="Mask",
                                  n_iter=epoch)
        self.clog.show_image_grid(torch.argmax(pred_cpu, dim=1, keepdim=True),
                                  name="unt_argmax_val",
                                  title="Unet",
                                  n_iter=epoch)
        self.clog.show_image_grid(pred_cpu[:, 1:2],
                                  name="unt_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)

    def test(self):
        """Predict on ``self.test_data_loader`` and score the predictions.

        Predictions and targets are grouped by the first element of each entry in
        the batch's 'fnames', stacked per group, and passed to ``aggregate_scores``,
        which is also given a JSON output path inside ``self.elog.work_dir``.
        The resulting scores are printed.
        """
        from evaluation.evaluator import aggregate_scores, Evaluator
        from collections import defaultdict

        self.elog.print('=====TEST=====')
        self.model.eval()

        pred_dict = defaultdict(list)
        gt_dict = defaultdict(list)

        with torch.no_grad():
            for batch_counter, data_batch in enumerate(self.test_data_loader):
                print('testing...', batch_counter)

                # Get data_batches
                mr_data = data_batch['data'][0].float().to(self.device)
                mr_target = data_batch['seg'][0].float().to(self.device)

                pred = self.model(mr_data)
                pred_argmax = torch.argmax(pred.cpu(), dim=1, keepdim=True)
                target_cpu = mr_target.cpu()

                for i, fname in enumerate(data_batch['fnames']):
                    pred_dict[fname[0]].append(pred_argmax[i].numpy())
                    gt_dict[fname[0]].append(target_cpu[i].numpy())

        test_ref_list = [(np.stack(preds), np.stack(gt_dict[key]))
                         for key, preds in pred_dict.items()]

        scores = aggregate_scores(
            test_ref_list,
            evaluator=Evaluator,
            json_author=self.config.author,
            json_task=self.config.name,
            json_name=self.config.name,
            json_output_file=f"{self.elog.work_dir}/{self.config.author}_{self.config.name}.json")

        print("Scores:\n", scores)

    def segment_single_image(self, data):
        """Segment ``data`` with a freshly built model loaded from ``config.model_dir``.

        The input is processed in chunks of ``config.batch_size`` along its first
        dimension, so only one chunk is on the device at a time.

        Args:
            data: Tensor indexed as [b, c, w, h] (per the original comment).

        Returns:
            numpy array of int16 class labels, the argmax over dim 1 with that
            dimension kept.
        """
        self.model = UNet(num_classes=self.config.num_classes,
                          in_channels=self.config.in_channels)
        self.device = torch.device(
            self.config.device if torch.cuda.is_available() else "cpu")

        # a model must be present and loaded in here
        if self.config.model_dir == '':
            # NOTE(review): execution continues with the freshly constructed,
            # unloaded model in this case.
            print(
                'model_dir is empty, please provide directory to load checkpoint.'
            )
        else:
            self.load_checkpoint(name=self.config.model_dir,
                                 save_types=("model",))

        self.elog.print("=====SEGMENT_SINGLE_IMAGE=====")
        self.model.eval()
        self.model.to(self.device)

        blocksize = self.config.batch_size
        label_chunks = []

        with torch.no_grad():
            # Works on CUDA and on CPU: split into batches and reduce each batch
            # to labels right away instead of keeping all logits in memory.
            for start in range(0, data.shape[0], blocksize):
                mr_data = data[start:start + blocksize].float().to(self.device)
                logits = self.model(mr_data)
                labels = torch.argmax(logits, dim=1, keepdim=True)
                label_chunks.append(labels.short().cpu().numpy())

        # Join the per-chunk label arrays along the batch dimension.
        return np.concatenate(label_chunks)
class UNetExperiment(PytorchExperiment):
    """
    Segmentation experiment that trains a UNet (https://arxiv.org/abs/1505.04597).

    The experiment inherits from PytorchExperiment and follows its life cycle:

        setup()     (--> automatically restore values if a previous checkpoint is given)
        prepare()
        for epoch in n_epochs:
            train()
            validate()
            (--> save current checkpoint)
        end()

    Data is read through ``NucleusDataset`` wrapped in torch ``DataLoader`` objects.
    The network output is passed through a sigmoid and optimised with a weighted
    sum of binary cross-entropy and ``soft_dice``.
    """

    def setup(self):
        """Create data loaders, model, optimizer and scheduler, then save a start checkpoint.

        The train/val/test keys are read from ``splits.pkl`` inside
        ``config.split_dir`` for the fold ``config.fold``. When
        ``config.do_load_checkpoint`` is set and ``config.checkpoint_dir`` is
        non-empty, ``load_checkpoint`` is called with ``save_types`` restricted
        to ``"model"``.
        """
        pkl_dir = self.config.split_dir
        # NOTE: unpickling can execute arbitrary code; only load split files you trust.
        with open(os.path.join(pkl_dir, "splits.pkl"), 'rb') as f:
            splits = pickle.load(f)

        fold_splits = splits[self.config.fold]
        tr_keys = fold_splits['train']
        val_keys = fold_splits['val']
        test_keys = fold_splits['test']
        print("pkl_dir: ", pkl_dir)
        print("tr_keys: ", tr_keys)
        print("val_keys: ", val_keys)
        print("test_keys: ", test_keys)

        self.device = torch.device(
            self.config.device if torch.cuda.is_available() else "cpu")

        # Only the training loader is shuffled; evaluation loaders keep a fixed
        # order so that validation/test numbers are reproducible between runs.
        self.train_data_loader = self._make_data_loader("train", tr_keys, shuffle=True)
        self.val_data_loader = self._make_data_loader("val", val_keys, shuffle=False)
        self.test_data_loader = self._make_data_loader("test", test_keys, shuffle=False)

        self.model = UNet(num_classes=self.config.num_classes,
                          in_channels=self.config.in_channels)
        self.model.to(self.device)

        # Weight of the BCE term in the combined loss; soft_dice gets 1 - bce_weight.
        self.bce_weight = 0.5

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.learning_rate)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min')

        # If directory for checkpoint is provided, we load it.
        if self.config.do_load_checkpoint:
            if self.config.checkpoint_dir == '':
                print(
                    'checkpoint_dir is empty, please provide directory to load checkpoint.'
                )
            else:
                # ("model",) is a real one-element tuple; ("model") would be a plain string.
                self.load_checkpoint(name=self.config.checkpoint_dir,
                                     save_types=("model",))

        self.save_checkpoint(name="checkpoint_start")
        self.elog.print('Experiment set up.')

    def _make_data_loader(self, mode, keys, shuffle):
        """Return a DataLoader over the NucleusDataset subset selected by ``keys``.

        Args:
            mode (str): Value forwarded as ``mode`` to NucleusDataset
                ("train", "val" or "test").
            keys: Sample keys of the split, forwarded as ``keys``.
            shuffle (bool): Whether the DataLoader reshuffles every epoch.
        """
        def build_transform():
            # Images and masks go through the same pipeline, but each gets its
            # own Compose instance, as in the original code.
            return transforms.Compose([
                Normalize(),
                Rescale(self.config.patch_size),
                ToTensor(),
            ])

        dataset = NucleusDataset(self.config.data_root_dir,
                                 train=True,
                                 transform=build_transform(),
                                 target_transform=build_transform(),
                                 mode=mode,
                                 keys=keys,
                                 taskname=self.config.dataset_name)
        return torch.utils.data.DataLoader(dataset,
                                           batch_size=self.config.batch_size,
                                           shuffle=shuffle)

    def _evaluate(self, data_loader):
        """Run the model over ``data_loader`` in eval mode without gradients.

        Returns:
            tuple: ``(data, target, pred, loss_list, acc_list)``. The first three
            are the tensors of the last processed batch (``None`` when the loader
            yielded nothing); the lists hold one float per batch.
        """
        self.model.eval()

        data = target = pred = None
        loss_list = []
        acc_list = []

        with torch.no_grad():
            for images, masks in data_loader:
                data, target = images.to(self.device), masks.to(self.device)
                pred = torch.sigmoid(self.model(data))

                dice_term = soft_dice(pred, target)
                bce_term = F.binary_cross_entropy(pred, target)
                loss = (self.bce_weight * bce_term +
                        (1 - self.bce_weight) * dice_term)

                # The negated soft_dice value is what gets reported as the score.
                acc_list.append(((-1) * dice_term).item())
                loss_list.append(loss.item())

        return data, target, pred, loss_list, acc_list

    def _show_eval_images(self, data, target, pred, suffix, n_iter):
        """Log input, mask and prediction grids of one evaluation batch.

        The grid names are ``data_<suffix>``, ``mask_<suffix>``,
        ``unt_argmax_<suffix>`` and ``unt_<suffix>``.
        """
        self.clog.show_image_grid(data.float().cpu(),
                                  name="data_" + suffix,
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=n_iter)
        self.clog.show_image_grid(target.float().cpu(),
                                  name="mask_" + suffix,
                                  title="Mask",
                                  n_iter=n_iter)
        self.clog.show_image_grid(torch.argmax(pred.data.cpu(),
                                               dim=1,
                                               keepdim=True),
                                  name="unt_argmax_" + suffix,
                                  title="Unet",
                                  n_iter=n_iter)
        self.clog.show_image_grid(pred.data.cpu(),
                                  name="unt_" + suffix,
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=n_iter)

    def train(self, epoch):
        """Train the model for one pass over the training loader.

        Args:
            epoch (int): Current epoch; used as counter / ``n_iter`` for logging.
        """
        self.elog.print('=====TRAIN=====')
        self.model.train()

        data = None

        for batch_counter, (images, masks) in enumerate(self.train_data_loader):
            data, target = images.to(self.device), masks.to(self.device)

            self.optimizer.zero_grad()

            # binary_cross_entropy expects probabilities, hence the sigmoid.
            pred = torch.sigmoid(self.model(data))
            loss = (self.bce_weight * F.binary_cross_entropy(pred, target) +
                    (1 - self.bce_weight) * soft_dice(pred, target))

            loss.backward()
            self.optimizer.step()

            # Some logging and plotting
            if (batch_counter % self.config.plot_freq) == 0:
                self.elog.print('Epoch: {0} Loss: {1:.4f}'.format(
                    self._epoch_idx, loss.item()))

                self.add_result(value=loss.item(),
                                name='Train_Loss',
                                tag='Loss',
                                counter=epoch)

                self.clog.show_image_grid(data.float().cpu(),
                                          name="data",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)
                self.clog.show_image_grid(target.float().cpu(),
                                          name="mask",
                                          title="Mask",
                                          n_iter=epoch)
                self.clog.show_image_grid(pred.cpu(),
                                          name="unt",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)

        assert data is not None, 'data is None. Please check if your dataloader works properly'

    def validate(self, epoch):
        """Evaluate on the validation loader, step the LR scheduler and log results.

        Args:
            epoch (int): Current epoch; results are stored with counter ``epoch + 1``.
        """
        self.elog.print('-------------VALIDATE-------------')

        data, target, pred, loss_list, acc_list = self._evaluate(
            self.val_data_loader)

        assert data is not None, 'data is None. Please check if your dataloader works properly'

        mean_loss = np.mean(loss_list)
        mean_acc = np.mean(acc_list)

        # ReduceLROnPlateau in 'min' mode is driven by the mean validation loss.
        self.scheduler.step(mean_loss)

        self.elog.print(
            'Epoch: %d Mean Loss: %.4f Mean Dice :' %
            (self._epoch_idx, mean_loss), mean_acc)

        self.add_result(value=mean_loss,
                        name='Val_Loss',
                        tag='Loss',
                        counter=epoch + 1)
        self.add_result(value=mean_acc,
                        name='Val_Mean_Accuracy',
                        tag='Accuracy',
                        counter=epoch + 1)

        self._show_eval_images(data, target, pred, suffix="val", n_iter=epoch)

    def test(self):
        """Evaluate on the test loader and log the mean loss and mean score.

        The results are stored under ``Test_Loss`` and ``Test_Mean_Accuracy``
        as averages over all test batches.
        """
        print(' In test() method here')
        self.elog.print('----------Test-------------')

        # Fixed counter used for all test-time results and image grids.
        trial = 10

        data, target, pred, loss_list, acc_list = self._evaluate(
            self.test_data_loader)

        assert data is not None, 'data is None. Please check if your dataloader works properly'

        mean_loss = np.mean(loss_list)
        mean_acc = np.mean(acc_list)

        # Report test-set means; the last batch alone is not representative.
        self.add_result(value=mean_loss,
                        name='Test_Loss',
                        tag='Test_Loss',
                        counter=trial + 1)
        self.add_result(value=mean_acc,
                        name='Test_Mean_Accuracy',
                        tag='Test_Accuracy',
                        counter=trial + 1)

        self._show_eval_images(data, target, pred, suffix="test", n_iter=trial)

        self.elog.print(
            'Test Mean Loss: %.4f Test Mean Dice :' % (mean_loss), mean_acc)
class UNetExperiment:
    """
    This class implements the basic life cycle for a segmentation task with UNet
    (https://arxiv.org/abs/1505.04597).

    The basic life cycle of a UNetExperiment is:

        run():
            for epoch in n_epochs:
                train()
                validate()
            save_model_parameters()
        run_test()   (called separately; returns a dictionary of metrics)
    """

    def __init__(self, config, split, dataset):
        """Set up output folder, data loaders, model, loss, optimizer and loggers.

        Args:
            config: Object providing ``n_epochs``, ``name``, ``test_results_dir``,
                ``batch_size`` and ``learning_rate``.
            split: Mapping with ``"train"``, ``"val"`` and ``"test"`` entries that
                are used to index ``dataset``.
            dataset: Indexable collection of volumes; ``dataset[split[...]]`` must
                be supported.
        """
        self.n_epochs = config.n_epochs
        self.split = split
        self._time_start = ""
        self._time_end = ""
        self.epoch = 0
        self.name = config.name

        # Create output folders
        dirname = f'{time.strftime("%Y-%m-%d_%H%M", time.gmtime())}_{self.name}'
        self.out_dir = os.path.join(config.test_results_dir, dirname)
        os.makedirs(self.out_dir, exist_ok=True)

        # Create data loaders
        self.train_loader = DataLoader(SlicesDataset(dataset[split["train"]]),
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       num_workers=0)
        self.val_loader = DataLoader(SlicesDataset(dataset[split["val"]]),
                                     batch_size=config.batch_size,
                                     shuffle=True,
                                     num_workers=0)

        # we will access volumes directly for testing
        self.test_data = dataset[split["test"]]

        # Do we have CUDA available?
        if not torch.cuda.is_available():
            print(
                "WARNING: No CUDA device is found. This may take significantly longer!"
            )
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Configure our model and other training implements
        self.model = UNet(num_classes=3)
        self.model.to(self.device)

        # Cross entropy loss
        self.loss_function = torch.nn.CrossEntropyLoss()

        # Adam optimizer updates the weights
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=config.learning_rate)
        # Scheduler helps us update learning rate automatically
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min')

        # Set up Tensorboard. By default it saves data into runs folder.
        self.tensorboard_train_writer = SummaryWriter(comment="_train")
        self.tensorboard_val_writer = SummaryWriter(comment="_val")

    def train(self):
        """
        This method is executed once per epoch and takes care of model weight update cycle
        """
        print(f"Training epoch {self.epoch}...")
        self.model.train()

        # Loop over our minibatches
        for i, batch in enumerate(self.train_loader):
            self.optimizer.zero_grad()

            data = batch['image'].float().to(self.device)
            target = batch['seg'].to(self.device)

            prediction = self.model(data)

            # CrossEntropyLoss takes raw scores plus class indices; channel 0 of
            # the target is selected and cast to long for that purpose.
            loss = self.loss_function(prediction, target[:, 0, :, :].long())

            loss.backward()
            self.optimizer.step()

            if (i % 10) == 0:
                # Output to console on every 10th batch
                print(
                    f"\nEpoch: {self.epoch} Train loss: {loss}, {100*(i+1)/len(self.train_loader):.1f}% complete"
                )

                counter = 100 * self.epoch + 100 * (i / len(self.train_loader))

                # The softmax'd probability map is only needed for logging, so it
                # is computed only on the batches that are actually logged.
                prediction_softmax = F.softmax(prediction, dim=1)

                log_to_tensorboard(self.tensorboard_train_writer, loss, data,
                                   target, prediction_softmax, prediction,
                                   counter)

            print(".", end='')

        print("\nTraining complete")

    def validate(self):
        """
        This method runs validation cycle, using same metrics as Train method.
        The model is switched to eval mode and gradients are disabled.

        Raises:
            RuntimeError: If the validation loader yields no batches.
        """
        print(f"Validating epoch {self.epoch}...")

        # Switch layers such as dropout / batch norm to inference behaviour
        self.model.eval()
        loss_list = []

        with torch.no_grad():
            for i, batch in enumerate(self.val_loader):
                # Compute loss on a validation sample
                data = batch["image"].float().to(self.device)
                target = batch["seg"].to(self.device)

                prediction = self.model(data)
                prediction_softmax = F.softmax(prediction, dim=1)
                loss = self.loss_function(prediction,
                                          target[:, 0, :, :].long())

                print(f"Batch {i}. Data shape {data.shape} Loss {loss}")

                # We report loss that is accumulated across all of validation set
                loss_list.append(loss.item())

        if not loss_list:
            # Without this guard the scheduler would be stepped with NaN and the
            # logging call below would fail on unbound batch variables.
            raise RuntimeError("Validation loader produced no batches")

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        # Tensors of the last validation batch are logged together with the mean loss
        log_to_tensorboard(self.tensorboard_val_writer, mean_loss, data,
                           target, prediction_softmax, prediction,
                           (self.epoch + 1) * 100)
        print("Validation complete")

    def save_model_parameters(self):
        """
        Saves model parameters to a file in results directory
        """
        path = os.path.join(self.out_dir, "model.pth")

        torch.save(self.model.state_dict(), path)

    def load_model_parameters(self, path=''):
        """
        Loads model parameters from a supplied path or a results directory

        Args:
            path (str): File to load. When empty, ``model.pth`` inside the
                experiment's output directory is used.

        Raises:
            FileNotFoundError: If the parameter file does not exist.
        """
        model_path = path if path else os.path.join(self.out_dir, "model.pth")

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Could not find path {model_path}")

        # map_location lets parameters saved on a GPU be restored on a CPU-only host
        self.model.load_state_dict(
            torch.load(model_path, map_location=self.device))

    def run_test(self):
        """
        This runs test cycle on the test dataset.
        Inference is done on full volumes and Dice / Jaccard values are collected
        per volume.

        Returns:
            dict: ``"volume_stats"`` (list with filename, dice and jaccard per
            volume) and ``"overall"`` (mean dice and mean jaccard).
        """
        print("Testing...")
        self.model.eval()

        inference_agent = UNetInferenceAgent(model=self.model,
                                             device=self.device)

        out_dict = {"volume_stats": []}
        dc_list = []
        jc_list = []

        # for every volume in the test set
        for i, x in enumerate(self.test_data):
            pred_label = inference_agent.single_volume_inference(x["image"])

            dc = Dice3d(pred_label, x["seg"])
            jc = Jaccard3d(pred_label, x["seg"])
            dc_list.append(dc)
            jc_list.append(jc)

            out_dict["volume_stats"].append({
                "filename": x['filename'],
                "dice": dc,
                "jaccard": jc
            })
            print(
                f"{x['filename']} Dice {dc:.4f}. {100*(i+1)/len(self.test_data):.2f}% complete"
            )

        out_dict["overall"] = {
            "mean_dice": np.mean(dc_list),
            "mean_jaccard": np.mean(jc_list)
        }

        print("\nTesting complete.")
        return out_dict

    def run(self):
        """
        Kicks off train cycle and writes model parameter file at the end
        """
        self._time_start = time.time()

        print("Experiment started.")

        # Iterate over epochs
        for epoch in range(self.n_epochs):
            self.epoch = epoch
            self.train()
            self.validate()

        # save model for inferencing
        self.save_model_parameters()

        self._time_end = time.time()
        print(
            f"Run complete. Total time: {time.strftime('%H:%M:%S', time.gmtime(self._time_end - self._time_start))}"
        )
class UNetExperiment(PytorchExperiment):
    """
    The UnetExperiment is inherited from the PytorchExperiment. It implements the basic life cycle
    for a segmentation task with UNet (https://arxiv.org/abs/1505.04597).
    It is optimized to work with the provided NumpyDataLoader.

    The basic life cycle of a UnetExperiment is the same as PytorchExperiment:

        setup()     (--> Automatically restore values if a previous checkpoint is given)
        prepare()
        for epoch in n_epochs:
            train()
            validate()
            (--> save current checkpoint)
        end()
    """

    def setup(self):
        """Create data loaders, model, losses, optimizer and scheduler.

        Optionally loads a checkpoint and freezes part of the network for
        fine-tuning, then writes a ``checkpoint_start`` checkpoint.
        """
        pkl_dir = self.config.split_dir
        # NOTE: unpickling can execute arbitrary code; only load split files you trust.
        with open(os.path.join(pkl_dir, "splits.pkl"), 'rb') as f:
            splits = pickle.load(f)

        fold_splits = splits[self.config.fold]
        tr_keys = fold_splits['train']
        val_keys = fold_splits['val']
        test_keys = fold_splits['test']

        self.device = torch.device(
            self.config.device if torch.cuda.is_available() else "cpu")

        self.train_data_loader = NumpyDataSet(
            self.config.data_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=tr_keys)
        self.val_data_loader = NumpyDataSet(self.config.data_dir,
                                            target_size=self.config.patch_size,
                                            batch_size=self.config.batch_size,
                                            keys=val_keys,
                                            mode="val",
                                            do_reshuffle=False)
        self.test_data_loader = NumpyDataSet(
            self.config.data_test_dir,
            target_size=self.config.patch_size,
            batch_size=self.config.batch_size,
            keys=test_keys,
            mode="test",
            do_reshuffle=False)

        self.model = UNet(num_classes=self.config.num_classes,
                          in_channels=self.config.in_channels)
        self.model.to(self.device)

        # We use a combination of DICE-loss and CE-Loss in this example.
        # This proved good in the medical segmentation decathlon.
        self.dice_loss = SoftDiceLoss(batch_dice=True)  # expects softmax'd input
        self.ce_loss = torch.nn.CrossEntropyLoss()  # applies softmax internally

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.learning_rate)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min')

        # If directory for checkpoint is provided, we load it.
        if self.config.do_load_checkpoint:
            if self.config.checkpoint_dir == '':
                print('Checkpoint_dir is empty, training from scratch.')
            else:
                # ("model",) is a real one-element tuple; ("model") would be a plain string.
                self.load_checkpoint(name=self.config.checkpoint_filename,
                                     save_types=("model",),
                                     path=self.config.checkpoint_dir)

            # NOTE(review): the nesting of this branch could not be recovered
            # from the collapsed source; it is kept under do_load_checkpoint
            # because fine-tuning presupposes restored weights. Confirm.
            if self.config.fine_tune in ['expanding_all', 'expanding_plus1']:
                # freeze part of the network, fine-tune the other part
                unfreeze_block_parameters(
                    model=self.model, fine_tune_option=self.config.fine_tune)
            # else just train the whole network

        self.save_checkpoint(name="checkpoint_start")
        self.elog.print('Experiment set up.')

    # overloaded method from the base class PytorchExperiment
    def load_checkpoint(self,
                        name="checkpoint",
                        save_types=("model", "optimizer", "simple", "th_vars",
                                    "results"),
                        n_iter=None,
                        iter_format="{:05d}",
                        prefix=False,
                        path=None):
        """
        Loads a checkpoint and restores the experiment.

        Make sure you have your torch stuff already on the right devices beforehand,
        otherwise this could lead to errors e.g. when making a optimizer step
        (and for some reason the Adam states are not already on the GPU:
        https://discuss.pytorch.org/t/loading-a-saved-model-for-continue-training/17244/3 )

        Args:
            name (str): The name of the checkpoint file
            save_types (list or tuple): What kind of member variables should be loaded? Choices are:
                "model" <-- Pytorch models,
                "optimizer" <-- Optimizers,
                "simple" <-- Simple python variables (basic types and lists/tuples),
                "th_vars" <-- torch tensors,
                "results" <-- The result dict
            n_iter (int): Number of iterations. Together with the name, defined by the iter_format,
                a file name will be created and searched for.
            iter_format (str): Defines how the name and the n_iter will be combined.
            prefix (bool): If True, the formatted n_iter will be prepended, otherwise appended.
            path (str): If no path is given then it will take the current experiment dir and formatted
                name, otherwise it will simply use the path and the formatted name to define the
                checkpoint file.
        """
        if self.elog is None:
            return

        # Collect the member groups that were requested for restoring.
        checkpoint_dict = {}
        if "model" in save_types:
            checkpoint_dict.update(self.get_pytorch_modules())
        if "optimizer" in save_types:
            checkpoint_dict.update(self.get_pytorch_optimizers())
        if "simple" in save_types:
            checkpoint_dict.update(self.get_simple_variables())
        if "th_vars" in save_types:
            checkpoint_dict.update(self.get_pytorch_variables())
        if "results" in save_types:
            checkpoint_dict["results"] = self.results

        if n_iter is not None:
            name = name_and_iter_to_filename(name,
                                             n_iter,
                                             ".pth.tar",
                                             iter_format=iter_format,
                                             prefix=prefix)

        # Currently always empty. An earlier variant excluded the last layer
        # ('model.model.5.weight' / 'model.model.5.bias') when
        # config.dont_load_lastlayer was set.
        exclude_layer_dict = {}

        if path is None:
            restore_dict = self.elog.load_checkpoint(name=name,
                                                     **checkpoint_dict)
        else:
            checkpoint_path = os.path.join(path, name)
            if checkpoint_path.endswith("/"):
                checkpoint_path = checkpoint_path[:-1]
            restore_dict = self.elog.load_checkpoint_static(
                checkpoint_file=checkpoint_path,
                exclude_layer_dict=exclude_layer_dict,
                **checkpoint_dict)

        self.update_attributes(restore_dict)

    def train(self, epoch):
        """Train the model for one pass over the training loader.

        Args:
            epoch (int): Current epoch; used as counter / ``n_iter`` for logging.
        """
        self.elog.print('=====TRAIN=====')
        self.model.train()

        data = None

        for batch_counter, data_batch in enumerate(self.train_data_loader):
            self.optimizer.zero_grad()

            # Shape of data_batch = [1, b, c, w, h]
            # Desired shape = [b, c, w, h]
            # Move data and target to the GPU
            data = data_batch['data'][0].float().to(self.device)
            target = data_batch['seg'][0].long().to(self.device)

            pred = self.model(data)
            # SoftDiceLoss expects softmax'd input; the CE loss does the softmax internally.
            pred_softmax = F.softmax(pred, dim=1)

            loss = (self.dice_loss(pred_softmax, target.squeeze()) +
                    self.ce_loss(pred, target.squeeze()))

            loss.backward()
            self.optimizer.step()

            # Some logging and plotting
            if (batch_counter % self.config.plot_freq) == 0:
                self.elog.print('Epoch: {0} Loss: {1:.4f}'.format(
                    self._epoch_idx, loss.item()))

                self.add_result(
                    value=loss.item(),
                    name='Train_Loss',
                    tag='Loss',
                    counter=epoch +
                    (batch_counter /
                     self.train_data_loader.data_loader.num_batches))

                self.clog.show_image_grid(data.float().cpu(),
                                          name="data",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)
                self.clog.show_image_grid(target.float().cpu(),
                                          name="mask",
                                          title="Mask",
                                          n_iter=epoch)
                self.clog.show_image_grid(torch.argmax(pred.cpu(),
                                                       dim=1,
                                                       keepdim=True),
                                          name="unt_argmax",
                                          title="Unet",
                                          n_iter=epoch)
                self.clog.show_image_grid(pred.cpu()[:, 1:2, ],
                                          name="unt",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)

        assert data is not None, 'data is None. Please check if your dataloader works properly'

    def validate(self, epoch):
        """Compute the mean validation loss, step the LR scheduler and log results.

        Args:
            epoch (int): Current epoch; the result is stored with counter ``epoch + 1``.
        """
        self.elog.print('VALIDATE')
        self.model.eval()

        data = None
        loss_list = []

        with torch.no_grad():
            for data_batch in self.val_data_loader:
                data = data_batch['data'][0].float().to(self.device)
                target = data_batch['seg'][0].long().to(self.device)

                pred = self.model(data)
                # SoftDiceLoss expects softmax'd input; the CE loss does the softmax internally.
                pred_softmax = F.softmax(pred, dim=1)

                loss = (self.dice_loss(pred_softmax, target.squeeze()) +
                        self.ce_loss(pred, target.squeeze()))
                loss_list.append(loss.item())

        assert data is not None, 'data is None. Please check if your dataloader works properly'

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        self.elog.print('Epoch: %d Loss: %.4f' % (self._epoch_idx, mean_loss))

        self.add_result(value=mean_loss,
                        name='Val_Loss',
                        tag='Loss',
                        counter=epoch + 1)

        # The grids below show the last validation batch.
        self.clog.show_image_grid(data.float().cpu(),
                                  name="data_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)
        self.clog.show_image_grid(target.float().cpu(),
                                  name="mask_val",
                                  title="Mask",
                                  n_iter=epoch)
        self.clog.show_image_grid(torch.argmax(pred.data.cpu(),
                                               dim=1,
                                               keepdim=True),
                                  name="unt_argmax_val",
                                  title="Unet",
                                  n_iter=epoch)
        self.clog.show_image_grid(pred.data.cpu()[:, 1:2, ],
                                  name="unt_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)

    def test(self):
        """Predict the test set, group slices per file name and aggregate scores.

        The scores returned by ``aggregate_scores`` are stored in ``self.scores``
        and written to ``<work_dir>/<author>_<name>.json``.
        """
        from evaluation.evaluator import aggregate_scores, Evaluator
        from collections import defaultdict

        self.elog.print('=====TEST=====')
        self.model.eval()

        pred_dict = defaultdict(list)
        gt_dict = defaultdict(list)

        batch_counter = 0

        if self.config.visualize_segm:
            color_class_converter = LabelTensorToColor()

        with torch.no_grad():
            for data_batch in self.test_data_loader:
                print('testing...', batch_counter)
                batch_counter += 1

                # Get data_batches
                mr_data = data_batch['data'][0].float().to(self.device)
                mr_target = data_batch['seg'][0].float().to(self.device)

                pred = self.model(mr_data)
                pred_argmax = torch.argmax(pred.data.cpu(),
                                           dim=1,
                                           keepdim=True)

                # Collect prediction and ground truth per file name
                fnames = data_batch['fnames']
                for i, fname in enumerate(fnames):
                    pred_dict[fname[0]].append(
                        pred_argmax[i].detach().cpu().numpy())
                    gt_dict[fname[0]].append(
                        mr_target[i].detach().cpu().numpy())

                # Only the 35th batch is visualised.
                if batch_counter == 35 and self.config.visualize_segm:
                    segm_visualization(mr_data, mr_target, pred_argmax,
                                       color_class_converter, self.config)

        test_ref_list = [(np.stack(pred_dict[key]), np.stack(gt_dict[key]))
                         for key in pred_dict]

        json_output_file = os.path.join(
            self.elog.work_dir,
            "{}_".format(self.config.author) + self.config.name + '.json')

        scores = aggregate_scores(test_ref_list,
                                  evaluator=Evaluator,
                                  json_author=self.config.author,
                                  json_task=self.config.name,
                                  json_name=self.config.name,
                                  json_output_file=json_output_file)

        self.scores = scores

        print("Scores:\n", scores)
class UNetExperiment:
    """
    This class implements the basic life cycle for a segmentation task with UNet
    (https://arxiv.org/abs/1505.04597).

    The basic life cycle of a UNetExperiment is:

        run():
            for epoch in n_epochs:
                train()
                validate()
                save_predictions()
            save_model_parameters()
        run_test()   (called separately; returns a dictionary of metrics)
    """

    # Number of segmentation classes the model predicts; also used to map label
    # values onto the 0..255 range when preview images are written.
    _NUM_CLASSES = 3

    def __init__(self, config, split, dataset):
        """Set up output folders, data loaders, model, loss, optimizer and loggers.

        Args:
            config: Object providing ``n_epochs``, ``name``, ``test_results_dir``,
                ``batch_size`` and ``learning_rate``.
            split: Mapping with ``"train"``, ``"val"`` and ``"test"`` entries that
                are used to index ``dataset``.
            dataset: Indexable collection of volumes; ``dataset[split[...]]`` must
                be supported.
        """
        self.n_epochs = config.n_epochs
        self.split = split
        self._time_start = ""
        self._time_end = ""
        self.epoch = 0
        self.name = config.name

        # Create output folders
        dirname = f'{time.strftime("%Y-%m-%d_%H%M", time.gmtime())}_{self.name}'
        self.out_dir = os.path.join(config.test_results_dir, dirname)
        os.makedirs(self.out_dir, exist_ok=True)
        self.out_images_dir = os.path.join(self.out_dir, "images")
        # exist_ok matches the parent folder: the timestamp has minute
        # resolution, so a rerun within the same minute must not crash here.
        os.makedirs(self.out_images_dir, exist_ok=True)

        # Create data loaders
        # Note that we are using a 2D version of UNet here, which means that it will expect
        # batches of 2D slices.
        self.train_loader = DataLoader(SlicesDataset(dataset[split["train"]]),
                                       batch_size=config.batch_size,
                                       shuffle=True,
                                       num_workers=0)
        self.val_loader = DataLoader(SlicesDataset(dataset[split["val"]]),
                                     batch_size=config.batch_size,
                                     shuffle=True,
                                     num_workers=0)

        # we will access volumes directly for testing
        self.test_data = dataset[split["test"]]

        # Do we have CUDA available?
        if not torch.cuda.is_available():
            print("WARNING: No CUDA device is found. This may take significantly longer!")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Configure our model and other training implements
        # We will use a recursive UNet model from German Cancer Research Center,
        # Division of Medical Image Computing.
        self.model = UNet(num_classes=self._NUM_CLASSES)
        self.model.to(self.device)

        # Standard cross-entropy loss; it takes the raw model output and applies
        # the softmax internally.
        self.loss_function = torch.nn.CrossEntropyLoss()

        # Adam optimizer updates the weights
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.learning_rate)
        # Scheduler helps us update learning rate automatically
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, 'min')

        # Set up Tensorboard. By default it saves data into runs folder.
        self.tensorboard_train_writer = SummaryWriter(comment="_train")
        self.tensorboard_val_writer = SummaryWriter(comment="_val")

    def train(self):
        """
        This method is executed once per epoch and takes care of model weight update cycle
        """
        print(f"Training epoch {self.epoch}...")
        self.model.train()

        # Loop over our minibatches
        for i, batch in enumerate(self.train_loader):
            self.optimizer.zero_grad()

            data = batch['image'].to(self.device, dtype=torch.float)
            target = batch['seg'].to(self.device)

            prediction = self.model(data)

            # CrossEntropyLoss takes class indices as an integer (long) tensor;
            # channel 0 of the target is selected and cast for that purpose.
            loss = self.loss_function(prediction, target[:, 0, :, :].long())

            loss.backward()
            self.optimizer.step()

            if (i % 10) == 0:
                # Output to console on every 10th batch
                print(f"\nEpoch: {self.epoch} Train loss: {loss}, {100*(i+1)/len(self.train_loader):.1f}% complete")

                counter = 100*self.epoch + 100*(i/len(self.train_loader))

                # The softmax'd probability map is only needed for logging, so it
                # is computed only on the batches that are actually logged.
                prediction_softmax = F.softmax(prediction, dim=1)

                log_to_tensorboard(
                    self.tensorboard_train_writer,
                    loss,
                    data,
                    target,
                    prediction_softmax,
                    prediction,
                    counter)

            print(".", end='')

        print("\nTraining complete")

    def validate(self):
        """
        This method runs validation cycle, using same metrics as Train method.
        The model is switched to eval mode and gradients are disabled.

        Raises:
            RuntimeError: If the validation loader yields no batches.
        """
        print(f"Validating epoch {self.epoch}...")

        # Switch layers such as dropout / batch norm to inference behaviour
        self.model.eval()
        loss_list = []

        with torch.no_grad():
            for i, batch in enumerate(self.val_loader):
                # Compute loss on a validation sample
                data = batch['image'].to(self.device, dtype=torch.float)
                target = batch['seg'].to(self.device)

                prediction = self.model(data)

                # Softmax'd version of the prediction, used as probability map in the log
                prediction_softmax = F.softmax(prediction, dim=1)

                loss = self.loss_function(prediction, target[:, 0, :, :].long())

                print(f"Batch {i}. Data shape {data.shape} Loss {loss}")

                # We report loss that is accumulated across all of validation set
                loss_list.append(loss.item())

        if not loss_list:
            # Without this guard the scheduler would be stepped with NaN and the
            # logging call below would fail on unbound batch variables.
            raise RuntimeError("Validation loader produced no batches")

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        # Tensors of the last validation batch are logged together with the mean loss
        log_to_tensorboard(
            self.tensorboard_val_writer,
            mean_loss,
            data,
            target,
            prediction_softmax,
            prediction,
            (self.epoch+1) * 100)
        print("Validation complete")

    def save_predictions(self):
        """
        Saves the middle slice of the first test volume (image, label and model
        prediction) as PNG files in the images folder of the results directory.
        File names carry the current epoch.
        """
        print("Save image predictions")

        # Prepare model for inference
        self.model.eval()
        inference_agent = UNetInferenceAgent(model=self.model, device=self.device)

        # Get first test data volume
        first_test_data = self.test_data[0]

        # Get the model predictions
        pred_label = inference_agent.single_volume_inference(first_test_data["image"])

        # Index of the middle slice along the first axis
        middle_index = int(pred_label.shape[0] / 2)

        # Labels are class indices 0.._NUM_CLASSES-1. Multiplying them by 255
        # overflows uint8 for class 2, so they are spread over 0..255 instead.
        # NOTE(review): assumes pred_label uses the same class indices as "seg" - confirm.
        label_scale = 255 / (self._NUM_CLASSES - 1)

        # NOTE(review): the image is presumably normalised to [0, 1] - confirm.
        image = (first_test_data["image"][middle_index] * 255).astype(np.uint8)
        label = (first_test_data["seg"][middle_index] * label_scale).astype(np.uint8)
        prediction = (pred_label[middle_index] * label_scale).astype(np.uint8)

        # Convert from numpy arrays to image objects and save them. PIL's save()
        # has no colour-map option; 2D uint8 arrays are written as greyscale.
        prefix = f"Epoch{self.epoch}"
        Image.fromarray(image).save(
            os.path.join(self.out_images_dir, prefix + '-image.png'))
        Image.fromarray(label).save(
            os.path.join(self.out_images_dir, prefix + '-label.png'))
        Image.fromarray(prediction).save(
            os.path.join(self.out_images_dir, prefix + '-prediction.png'))

    def save_model_parameters(self):
        """
        Saves model parameters to a file in results directory
        """
        path = os.path.join(self.out_dir, "model.pth")

        torch.save(self.model.state_dict(), path)

    def load_model_parameters(self, path=''):
        """
        Loads model parameters from a supplied path or a results directory

        Args:
            path (str): File to load. When empty, ``model.pth`` inside the
                experiment's output directory is used.

        Raises:
            FileNotFoundError: If the parameter file does not exist.
        """
        model_path = path if path else os.path.join(self.out_dir, "model.pth")

        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Could not find path {model_path}")

        # map_location lets parameters saved on a GPU be restored on a CPU-only host
        self.model.load_state_dict(
            torch.load(model_path, map_location=self.device))

    def run_test(self):
        """
        This runs test cycle on the test dataset.
        Inference is done on full volumes and Dice / Jaccard values are collected
        per volume.

        Returns:
            dict: ``"volume_stats"`` (list with filename, dice and jaccard per
            volume) and ``"overall"`` (mean dice and mean jaccard).
        """
        print("Testing...")
        self.model.eval()

        # Instantiate inference agent
        inference_agent = UNetInferenceAgent(model=self.model, device=self.device)

        out_dict = {"volume_stats": []}
        dc_list = []
        jc_list = []

        # for every volume in the test set
        for i, x in enumerate(self.test_data):
            pred_label = inference_agent.single_volume_inference(x["image"])

            # Dice and Jaccard similarity coefficients between prediction and label
            dc = Dice3d(pred_label, x["seg"])
            jc = Jaccard3d(pred_label, x["seg"])
            dc_list.append(dc)
            jc_list.append(jc)

            out_dict["volume_stats"].append({
                "filename": x['filename'],
                "dice": dc,
                "jaccard": jc
                })
            print(f"{x['filename']} Dice {dc:.4f} Jaccard {jc:.4f} {100*(i+1)/len(self.test_data):.2f}% complete")

        mean_dice = np.mean(dc_list)
        mean_jaccard = np.mean(jc_list)
        print(f" Mean Dice {mean_dice:.4f} Mean Jaccard {mean_jaccard:.4f}")

        out_dict["overall"] = {
            "mean_dice": mean_dice,
            "mean_jaccard": mean_jaccard}

        print("\nTesting complete.")
        return out_dict

    def run(self):
        """
        Kicks off train cycle and writes model parameter file at the end
        """
        self._time_start = time.time()

        print("Experiment started.")

        # Iterate over epochs
        for epoch in range(self.n_epochs):
            self.epoch = epoch
            self.train()
            self.validate()
            self.save_predictions()

        # save model for inferencing
        self.save_model_parameters()

        self._time_end = time.time()
        print(f"Run complete. Total time: {time.strftime('%H:%M:%S', time.gmtime(self._time_end - self._time_start))}")
class FCNExperiment(PytorchExperiment):
    """
    Segmentation experiment built on PytorchExperiment.

    It trains a ``UNet`` constructed with ``num_downs=3`` on data read through
    ``NumpyDataSet`` from ``config.scaled_image_64_dir`` with ``target_size=64``.

    The basic life cycle is the same as for any PytorchExperiment:

        setup()
        (--> Automatically restore values if a previous checkpoint is given)
        prepare()

        for epoch in n_epochs:
            train()
            validate()
            (--> save current checkpoint)

        end()
    """

    def setup(self):
        """
        Build data loaders, model, losses, optimizer and LR scheduler.

        Reads ``splits.pkl`` from ``config.split_dir`` and selects the
        train/val/test keys of fold ``config.fold``. If
        ``config.do_load_checkpoint`` is set and ``config.checkpoint_dir`` is
        non-empty, the model is restored from that checkpoint. A
        ``checkpoint_start`` checkpoint is always written at the end.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only point config.split_dir at split files you trust.
        with open(os.path.join(self.config.split_dir, "splits.pkl"), 'rb') as f:
            splits = pickle.load(f)

        fold_splits = splits[self.config.fold]
        tr_keys = fold_splits['train']
        val_keys = fold_splits['val']
        test_keys = fold_splits['test']

        self.device = torch.device(
            self.config.device if torch.cuda.is_available() else 'cpu')

        self.train_data_loader = NumpyDataSet(
            self.config.scaled_image_64_dir,
            target_size=64,
            batch_size=self.config.batch_size,
            keys=tr_keys,
            do_reshuffle=True)
        self.val_data_loader = NumpyDataSet(
            self.config.scaled_image_64_dir,
            target_size=64,
            batch_size=self.config.batch_size,
            keys=val_keys,
            mode="val",
            do_reshuffle=True)
        self.test_data_loader = NumpyDataSet(
            self.config.scaled_image_64_dir,
            target_size=64,
            batch_size=self.config.batch_size,
            keys=test_keys,
            mode="test",
            do_reshuffle=False)

        self.model = UNet(num_classes=self.config.num_classes, num_downs=3)
        self.model.to(self.device)

        # We use a combination of DICE-loss and CE-Loss in this example.
        # This proved good in the medical segmentation decathlon.
        # SoftDiceLoss is fed softmax probabilities (see train/validate).
        self.dice_loss = SoftDiceLoss(batch_dice=True)
        # No softmax before the CE loss: torch's CrossEntropyLoss applies it
        # internally.
        self.ce_loss = torch.nn.CrossEntropyLoss()

        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.config.learning_rate)
        self.scheduler = ReduceLROnPlateau(self.optimizer, 'min')

        # If directory for checkpoint is provided, we load it.
        if self.config.do_load_checkpoint:
            if self.config.checkpoint_dir == '':
                print(
                    'checkpoint_dir is empty, please provide directory to load checkpoint.'
                )
            else:
                # A real one-element tuple: ("model") is just the string
                # "model", which only worked through substring matching.
                self.load_checkpoint(name=self.config.checkpoint_dir,
                                     save_types=("model",))

        self.save_checkpoint(name="checkpoint_start")
        self.elog.print('Experiment set up.')

    def train(self, epoch):
        """
        Run one training epoch over ``self.train_data_loader``.

        The loss is the sum of cross-entropy (on raw logits) and soft Dice (on
        softmax probabilities). Every ``config.plot_freq`` batches the loss is
        logged and image grids of input, mask and prediction are shown.

        Args:
            epoch: epoch index, used as the result counter base and as the
                ``n_iter`` of the image grids.

        Raises:
            AssertionError: if the data loader yielded no batch.
        """
        self.elog.print('=====TRAIN=====')
        self.model.train()

        data = None
        batch_counter = 0
        for data_batch in self.train_data_loader:

            self.optimizer.zero_grad()

            # Shape of data_batch = [1, b, c, w, h]
            # Desired shape = [b, c, w, h]
            # Move data and target to the GPU
            data = data_batch['data'][0].float().to(self.device)
            target = data_batch['seg'][0].long().to(self.device)
            labels = target.squeeze()

            pred = self.model(data)
            # SoftDiceLoss expects softmax probabilities as input; the CE loss
            # does the softmax internally.
            pred_softmax = F.softmax(pred, dim=1)

            loss = self.ce_loss(pred, labels) + self.dice_loss(
                pred_softmax, labels)

            loss.backward()
            self.optimizer.step()

            # Some logging and plotting
            if (batch_counter % self.config.plot_freq) == 0:
                loss_value = loss.item()
                self.elog.print('Epoch: %d Loss: %.4f' %
                                (self._epoch_idx, loss_value))

                self.add_result(
                    value=loss_value,
                    name='Train_Loss',
                    tag='Loss',
                    counter=epoch +
                    (batch_counter /
                     self.train_data_loader.data_loader.num_batches))

                self.clog.show_image_grid(data.float(),
                                          name="data",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)
                self.clog.show_image_grid(target.float(),
                                          name="mask",
                                          title="Mask",
                                          n_iter=epoch)
                self.clog.show_image_grid(torch.argmax(pred.cpu(),
                                                       dim=1,
                                                       keepdim=True),
                                          name="unt_argmax",
                                          title="Unet",
                                          n_iter=epoch)
                self.clog.show_image_grid(pred.cpu()[:, 1:2, ],
                                          name="unt",
                                          normalize=True,
                                          scale_each=True,
                                          n_iter=epoch)

            batch_counter += 1

        assert data is not None, 'data is None. Please check if your dataloader works properly'

    def validate(self, epoch):
        """
        Evaluate on ``self.val_data_loader`` and step the LR scheduler.

        Only the soft Dice loss is used as the validation loss. Its mean over
        all batches drives ``ReduceLROnPlateau`` and is logged as ``Val_Loss``.
        Image grids are shown for the last validation batch.

        Args:
            epoch: epoch index, used for the result counter (``epoch + 1``)
                and as the ``n_iter`` of the image grids.

        Raises:
            AssertionError: if the data loader yielded no batch.
        """
        self.elog.print('VALIDATE')
        self.model.eval()

        data = None
        loss_list = []

        with torch.no_grad():
            for data_batch in self.val_data_loader:
                data = data_batch['data'][0].float().to(self.device)
                target = data_batch['seg'][0].long().to(self.device)

                pred = self.model(data)
                # Explicit class dimension: the implicit-dim form is
                # deprecated and train()/test() already use dim=1.
                pred_softmax = F.softmax(pred, dim=1)

                loss = self.dice_loss(pred_softmax, target.squeeze())
                loss_list.append(loss.item())

        assert data is not None, 'data is None. Please check if your dataloader works properly'

        mean_loss = np.mean(loss_list)
        self.scheduler.step(mean_loss)

        self.elog.print('Epoch: %d Loss: %.4f' % (self._epoch_idx, mean_loss))

        self.add_result(value=mean_loss,
                        name='Val_Loss',
                        tag='Loss',
                        counter=epoch + 1)

        self.clog.show_image_grid(data.float(),
                                  name="data_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)
        self.clog.show_image_grid(target.float(),
                                  name="mask_val",
                                  title="Mask",
                                  n_iter=epoch)
        self.clog.show_image_grid(torch.argmax(pred.data.cpu(),
                                               dim=1,
                                               keepdim=True),
                                  name="unt_argmax_val",
                                  title="Unet",
                                  n_iter=epoch)
        self.clog.show_image_grid(pred.data.cpu()[:, 1:2, ],
                                  name="unt_val",
                                  normalize=True,
                                  scale_each=True,
                                  n_iter=epoch)

    def test(self):
        """
        Run the model over ``self.test_data_loader`` and store the results.

        For every sample, the input, the target and the softmax output are
        concatenated along the channel axis and appended to
        ``<config.cross_vali_result_all_dir>/pred_<dataset_name><id>.npy``,
        where ``<id>`` is ``fnames[k][0][-8:-4]``. An existing file is extended
        rather than overwritten. Per-batch CE loss and dice values are printed,
        followed by the average over all non-zero dice values.

        Raises:
            AssertionError: if the data loader yielded no batch.
        """
        self.model.eval()

        data = None
        dice_scores = []

        num_of_parameters = sum(p.numel() for p in self.model.parameters()
                                if p.requires_grad)
        print("number of parameters:", num_of_parameters)

        with torch.no_grad():
            for data_batch in self.test_data_loader:
                data = data_batch['data'][0].float().to(self.device)
                target = data_batch['seg'][0].long().to(self.device)
                file_dir = data_batch['fnames']  # one tuple (path,) per sample

                pred = self.model(data)
                pred_softmax = F.softmax(pred, dim=1)
                pred_image = torch.argmax(pred_softmax, dim=1)

                dice_result = dice_pytorch(outputs=pred_image,
                                           labels=target,
                                           N_class=self.config.num_classes)
                ce_loss = self.ce_loss(pred, target.squeeze())
                print('ce_loss:%.4f dice:%s' %
                      (ce_loss.data, dice_result.data))

                data_image = data.cpu().numpy()
                target_image = target.cpu().numpy()
                prob_image = pred_softmax.cpu().numpy()

                # Collect the per-entry dice values; the array is built once
                # after the loop instead of re-concatenating per element.
                dice_scores.extend(dice_result.cpu().numpy())

                # Use the real number of samples in this batch so a short
                # final batch does not index past the end.
                for k in range(prob_image.shape[0]):
                    filename = file_dir[k][0][-8:-4]
                    output_dir = os.path.join(
                        self.config.cross_vali_result_all_dir,
                        'pred_' + self.config.dataset_name + filename)

                    # Stack input, target and class probabilities along the
                    # channel axis and add a leading sample axis.
                    sample = np.concatenate(
                        (data_image[k], target_image[k], prob_image[k]),
                        axis=0)[np.newaxis]

                    if os.path.exists(output_dir + '.npy'):
                        all_image = np.concatenate(
                            (np.load(output_dir + '.npy'), sample), axis=0)
                    else:
                        all_image = sample
                    np.save(output_dir, all_image)

        # Zero entries are excluded from the average, as before.
        # NOTE(review): presumably zeros mark classes absent from a batch;
        # confirm against dice_pytorch.
        dice_array = np.asarray(dice_scores, dtype=np.float64)
        dice_array = dice_array[dice_array != 0]
        print("average dice:", np.average(dice_array))
        print('test_data loading finished')

        assert data is not None, 'data is None. Please check if your dataloader works properly'