import math
import os

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from tqdm import tqdm

# Project-specific modules (import paths assumed; adjust to this repository's layout):
# UNet2D, AlexNet, ResNet50, NLM, Dataset1D, Transform1D,
# UNetDataset, ClassificationDataset, collate_segmentation


def __process_evaluate(model: UNet2D, data_loader: DataLoader, criterion: BCEWithLogitsLoss, logger: NLM):
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # Perform an evaluation using the defined network
    model.eval()
    # Wrap the iterable DataLoader with tqdm
    bar = tqdm(enumerate(data_loader))
    sample_length = 0
    pixel_length = 0
    total_loss = 0.0
    total_acc = 0
    # Disable gradient tracking during evaluation
    with torch.no_grad():
        for i, (_input, _target) in bar:
            # Move data and label to device
            _input = _input.type(torch.FloatTensor).to(device)
            _target = _target.type(torch.FloatTensor).to(device)
            # Pass the input data through the defined network architecture
            output = model(_input)
            # Compute the loss function
            loss = criterion(output, _target)
            total_loss += loss.item() * _target.size(0)  # mean loss per batch * batch size
            sample_length += _target.size(0)
            # Compute network accuracy per pixel: the model emits logits,
            # so a logit above 0 corresponds to a sigmoid probability above 0.5
            acc = torch.sum(torch.eq(output > 0.0, _target > 0.5)).item()
            pixel_length += _target.numel()
            total_acc += acc
            # Trace the log
            message = logger.write(i, len(data_loader),
                                   BCELoss=total_loss / sample_length,
                                   ACC=(total_acc / pixel_length) * 100.0)
            bar.set_description(message)
    return total_loss / sample_length
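
# A minimal sanity sketch (hypothetical helper, not part of the original pipeline)
# for the threshold used above: with BCEWithLogitsLoss the model emits raw logits,
# and sigmoid(0.0) == 0.5, so thresholding logits at 0.0 selects exactly the pixels
# whose predicted probability exceeds 0.5. The tensor values are illustrative only.
def _check_logit_threshold():
    logits = torch.tensor([-2.0, -0.1, 0.0, 0.1, 2.0])
    assert torch.equal(logits > 0.0, torch.sigmoid(logits) > 0.5)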

def __process_evaluate(model: AlexNet, data_loader: DataLoader, criterion, logger: NLM):
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # Perform an evaluation using the defined network
    model.eval()
    # Wrap the iterable DataLoader with tqdm
    bar = tqdm(enumerate(data_loader))
    total_loss = 0.0
    total_correct = 0
    sample_length = 0
    # Disable gradient tracking during evaluation
    with torch.no_grad():
        for i, (_input, _target) in bar:
            # Move data and label to device
            _input = _input.type(torch.FloatTensor).to(device)
            _target = _target.type(torch.LongTensor).to(device)
            # Forward pass
            output = model(_input)
            # Compute the loss function
            loss = criterion(output, _target)
            total_loss += loss.item() * _target.size(0)  # mean loss per batch * batch size
            # Top-1 prediction: index of the largest class logit
            _, target_hat = output.max(1)
            sample_length += _target.size(0)
            total_correct += target_hat.eq(_target).sum().item()
            # Trace the log
            message = logger.write(i, len(data_loader),
                                   Loss=total_loss / sample_length,
                                   Acc=100 * total_correct / sample_length)
            bar.set_description(message)
    return total_loss / sample_length

def __process_train(model: ResNet50, data_loader: DataLoader, optimizer, criterion, logger: NLM):
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    # Put the network into training mode
    model.train()
    # Wrap the iterable DataLoader with tqdm
    bar = tqdm(enumerate(data_loader))
    total_loss = 0.0
    total_correct = 0
    sample_length = 0
    for i, (_input, _target) in bar:
        # Move data and label to device
        _input = _input.type(torch.FloatTensor).to(device)
        _target = _target.type(torch.LongTensor).to(device)
        # Clear gradients accumulated from the previous step
        optimizer.zero_grad()
        # Forward pass
        output = model(_input)
        # Compute the loss, back-propagate, and update the weights
        loss = criterion(output, _target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * _target.size(0)  # mean loss per batch * batch size
        # Top-1 prediction: index of the largest class logit
        _, target_hat = output.max(1)
        sample_length += _target.size(0)
        total_correct += target_hat.eq(_target).sum().item()
        # Trace the log
        message = logger.write(i, len(data_loader),
                               Loss=total_loss / sample_length,
                               Acc=100 * total_correct / sample_length)
        bar.set_description(message)
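
# A minimal smoke-test sketch for the classification train/eval pair above, using a
# stand-in linear model in place of ResNet50 (the type hints are not enforced at
# runtime). The tensor shapes and the "./NLM/SmokeTest" log root are illustrative only.
def _smoke_test_classification_loop():
    import torch.nn as nn
    from torch.utils.data import TensorDataset

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    data = torch.randn(64, 16)           # 64 samples, 16 features
    labels = torch.randint(0, 4, (64,))  # 4 classes
    loader = DataLoader(TensorDataset(data, labels), batch_size=8)
    model = nn.Linear(16, 4).to(device)  # stand-in classifier
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    criterion = torch.nn.CrossEntropyLoss()
    train_logger = NLM(0, root="./NLM/SmokeTest", mode="Train")
    eval_logger = NLM(0, root="./NLM/SmokeTest", mode="Eval")
    __process_train(model=model, data_loader=loader, optimizer=optimizer,
                    criterion=criterion, logger=train_logger)
    return __process_evaluate(model=model, data_loader=loader,
                              criterion=criterion, logger=eval_logger)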

def train(epoch: int, model_path: str,
          train_data: Dataset1D, train_label: Dataset1D,
          eval_data: Dataset1D, eval_label: Dataset1D,
          channel=8, depth=4,
          batch_size=1,
          num_workers=0,  # number of DataLoader worker processes (0 = load in the main process)
          dropout=0.3, decay=0.5,
          is_init_epoch=False):
    # Linearly decay the learning rate over the last `decay` fraction of training
    def learning_func(step):
        return 1.0 - max(0, step - epoch * (1 - decay)) / (decay * epoch + 1)

    # Check if we can use a GPU device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print("{} device activation".format(device))
    # Define the training and validation data sets
    train_set = UNetDataset(train_data, train_label)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              num_workers=num_workers, pin_memory=True)
    valid_set = UNetDataset(eval_data, eval_label)
    valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False,
                              num_workers=num_workers, pin_memory=True)
    # Define a network model
    model = UNet2D(input_dim=train_set.input_dim, output_dim=train_set.output_dim,
                   channel=channel, depth=depth, dropout=dropout).to(device)
    # Set the optimizer to Adam
    optimizer = torch.optim.Adam(model.parameters())
    # Set the training criterion
    criterion = torch.nn.BCEWithLogitsLoss(reduction='mean')
    # Set the scheduler to control the learning rate
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=learning_func)
    # Load a pre-trained model if one exists
    start = 0
    print("Directory of the pre-trained model: {}".format(model_path))
    if model_path is not None and os.path.exists(model_path) and is_init_epoch is False:
        model_data = torch.load(model_path, map_location=device)
        start = model_data['epoch']
        model.load_state_dict(model_data['model'])
        optimizer.load_state_dict(model_data['optimizer'])
        print("## Successfully loaded the model at {} epochs!".format(start))
    # Define the log managers
    train_logger = NLM(start, root="./NLM/UNet2D", mode="Train")
    eval_logger = NLM(start, root="./NLM/UNet2D", mode="Eval")
    # Train-and-evaluate loop
    min_loss = 10000.0
    for i in range(start, epoch + 1):
        # Train the network
        __process_train(model=model, data_loader=train_loader, criterion=criterion,
                        optimizer=optimizer, logger=train_logger)
        # Evaluate the network
        loss = __process_evaluate(model=model, data_loader=valid_loader,
                                  criterion=criterion, logger=eval_logger)
        # Update the learning rate
        scheduler.step()
        # Roll the model back when the loss is NaN
        if math.isnan(loss):
            if model_path is not None and os.path.exists(model_path):
                # Reload the best model and halve the learning rate
                model_data = torch.load(model_path, map_location=device)
                model.load_state_dict(model_data['model'])
                optimizer_data = model_data['optimizer']
                optimizer_data['param_groups'][0]['lr'] /= 2
                optimizer.load_state_dict(optimizer_data)
                print("## Rolled the model back with half the learning rate!")
        # Save the best model so far
        elif loss < min_loss:
            min_loss = loss
            torch.save({'epoch': i, 'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()}, model_path)
        # Otherwise keep a periodic checkpoint every 100 epochs
        elif i % 100 == 0:
            torch.save({'epoch': i, 'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()},
                       'unet_{}epoch.pth'.format(i))
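
# A minimal usage sketch for the UNet2D entry point above. Dataset1D is this
# project's own container, so the data/label pairs are taken as inputs rather than
# constructed here; the checkpoint path and hyper-parameters are illustrative only.
def _example_train_unet(train_pair, eval_pair):
    train_data, train_label = train_pair
    eval_data, eval_label = eval_pair
    train(epoch=300, model_path='unet_best.pth',
          train_data=train_data, train_label=train_label,
          eval_data=eval_data, eval_label=eval_label,
          channel=8, depth=4, batch_size=4, dropout=0.3)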

def train(epoch: int, model_path: str, num_class: int,
          train_data: Dataset1D, eval_data: Dataset1D,
          transform: Transform1D,
          batch_size=32,
          num_workers=0,  # number of DataLoader worker processes (0 = load in the main process)
          learning_rate=0.1,
          is_init_epoch=False):
    # Check if we can use a GPU device
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print("{} device activation".format(device))
    # Define the training and validation data sets
    train_set = ClassificationDataset(train_data, num_class, transform)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_segmentation,
                              num_workers=num_workers, pin_memory=True)
    valid_set = ClassificationDataset(eval_data, num_class, transform)
    valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False,
                              collate_fn=collate_segmentation,
                              num_workers=num_workers, pin_memory=True)
    # Define a network model
    model = AlexNet(input_dim=train_set.input_dim, num_class=train_set.output_dim).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                momentum=0.9, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)
    # Load a pre-trained model if one exists
    start = 0
    print("Directory of the pre-trained model: {}".format(model_path))
    if model_path is not None and os.path.exists(model_path) and is_init_epoch is False:
        model_data = torch.load(model_path, map_location=device)
        start = model_data['epoch']
        model.load_state_dict(model_data['model'])
        optimizer.load_state_dict(model_data['optimizer'])
        print("## Successfully loaded the model at {} epochs!".format(start))
    # Define the log managers
    train_logger = NLM(start, root="./NLM/AlexNet", mode="Train")
    eval_logger = NLM(start, root="./NLM/AlexNet", mode="Eval")
    # Train-and-evaluate loop
    min_loss = 10000.0
    for i in range(start, epoch + 1):
        __process_train(model=model, data_loader=train_loader, criterion=criterion,
                        optimizer=optimizer, logger=train_logger)
        loss = __process_evaluate(model=model, data_loader=valid_loader,
                                  criterion=criterion, logger=eval_logger)
        # Update the learning rate
        scheduler.step()
        # Roll the model back when the loss is NaN
        if math.isnan(loss):
            if model_path is not None and os.path.exists(model_path):
                # Reload the best model and halve the learning rate
                model_data = torch.load(model_path, map_location=device)
                model.load_state_dict(model_data['model'])
                optimizer_data = model_data['optimizer']
                optimizer_data['param_groups'][0]['lr'] /= 2
                optimizer.load_state_dict(optimizer_data)
                print("## Rolled the model back with half the learning rate!")
        # Save the best model so far
        elif loss < min_loss:
            min_loss = loss
            torch.save({'epoch': i, 'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()}, model_path)
        # Otherwise keep a periodic checkpoint every 100 epochs
        elif i % 100 == 0:
            torch.save({'epoch': i, 'model': model.state_dict(),
                        'optimizer': optimizer.state_dict()},
                       'alexnet_{}epoch.pth'.format(i))
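
# A minimal usage sketch for the classification entry point above. Dataset1D and
# Transform1D are this project's own types, so they are taken as inputs rather than
# constructed here; the checkpoint path and hyper-parameters are illustrative only.
def _example_train_alexnet(train_data, eval_data, transform, num_class=10):
    train(epoch=200, model_path='alexnet_best.pth', num_class=num_class,
          train_data=train_data, eval_data=eval_data, transform=transform,
          batch_size=32, learning_rate=0.1)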