def train(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Generate the train and validation sets for the model:
    split_train_val(args, per_val=args.per_val)

    current_time = datetime.now().strftime('%b%d_%H%M%S')
    log_dir = os.path.join('runs', current_time + "_{}".format(args.arch))
    writer = SummaryWriter(log_dir=log_dir)

    # Setup augmentations
    if args.aug:
        data_aug = Compose([RandomRotate(10), RandomHorizontallyFlip(), AddNoise()])
    else:
        data_aug = None

    train_set = PatchLoader(is_transform=True,
                            split='train',
                            stride=args.stride,
                            patch_size=args.patch_size,
                            augmentations=data_aug)
    # Without augmentation:
    val_set = PatchLoader(is_transform=True,
                          split='val',
                          stride=args.stride,
                          patch_size=args.patch_size)
    n_classes = train_set.n_classes

    trainloader = data.DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  num_workers=1,
                                  shuffle=True)
    valloader = data.DataLoader(val_set,
                                batch_size=args.batch_size,
                                num_workers=1)

    # Setup metrics
    running_metrics = runningScore(n_classes)
    running_metrics_val = runningScore(n_classes)

    # Setup model (edited by Tannistha)
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("Loading model and optimizer from checkpoint '{}'".format(args.resume))
            model = torch.load(args.resume)
        else:
            print("No checkpoint found at '{}'".format(args.resume))
    else:
        # model = getattr(deeplab, 'resnet101')(
        #     pretrained=(not args.scratch),
        #     num_classes=n_classes,
        #     num_groups=args.groups,
        #     weight_std=args.weight_std,
        #     beta=args.beta)
        # edited by Tannistha
        model = getattr(ResNet9, 'resnet9')(
            pretrained=(args.scratch),
            num_classes=n_classes,
            num_groups=args.groups,
            weight_std=args.weight_std,
            beta=args.beta)

    # Use as many GPUs as we can
    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    model = model.to(device)  # send to GPU

    # PYTORCH NOTE: always construct optimizers after the model is pushed to the GPU/CPU.
    # Check if the model has a custom optimizer / loss
    if hasattr(model.module, 'optimizer'):
        print('Using custom optimizer')
        optimizer = model.module.optimizer
    else:
        # optimizer = torch.optim.Adadelta(model.parameters())
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.base_lr,
                                    weight_decay=0.0001,
                                    momentum=0.9)
        # optimizer = torch.optim.Adam(model.parameters(), lr=args.base_lr,
        #                              weight_decay=0.0001, amsgrad=True)

    # Edited by Tannistha to work with the new optimizer
    if args.train:
        criterion = nn.CrossEntropyLoss(ignore_index=255)
        model.train()
        if args.freeze_bn:
            for m in model.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    m.weight.requires_grad = False
                    m.bias.requires_grad = False
        # optimizer = torch.optim.SGD(model.parameters(), lr=args.base_lr,
        #                             weight_decay=0.0001, momentum=0.9)
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.base_lr,
                                     weight_decay=0.0001,
                                     amsgrad=True)
        start_epoch = 0

    loss_fn = core.loss.cross_entropy

    if args.class_weights:
        # Weights are inversely proportional to the frequency of the classes in the training set
        class_weights = torch.tensor(
            [0.7151, 0.8811, 0.5156, 0.9346, 0.9683, 0.9852],
            device=device, requires_grad=False)
    else:
        class_weights = None

    best_iou = -100.0
    class_names = ['upper_ns', 'middle_ns', 'lower_ns',
                   'rijnland_chalk', 'scruff', 'zechstein']

    for arg in vars(args):
        text = arg + ': ' + str(getattr(args, arg))
        writer.add_text('Parameters/', text)

    model_fname = ('data/deeplab_' + str(args.base_lr) + '_batch_size_' +
                   str(args.batch_size) + '_' + args.exp + '_epoch_%d.pth')
    val_fname = ('val_lr_' + str(args.base_lr) + '_batch_size_' +
                 str(args.batch_size) + '_' + args.exp)

    for epoch in range(args.n_epoch):
        # Training mode:
        model.train()
        loss_train, total_iteration = 0, 0

        for i, (images, labels) in enumerate(trainloader):
            image_original, labels_original = images, labels
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)

            pred = outputs.detach().max(1)[1].cpu().numpy()
            gt = labels.detach().cpu().numpy()
            running_metrics.update(gt, pred)

            loss = loss_fn(input=outputs, target=labels, weight=class_weights)
            loss_train += loss.item()
            optimizer.zero_grad()
            loss.backward()

            if args.clip != 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

            optimizer.step()
            total_iteration = total_iteration + 1

            if (i) % 20 == 0:
                print('epoch: {0}/{1}\t\t'
                      'iter: {2}/{3}\t\t'
                      'training Loss:{4:.4f}'.format(epoch + 1, args.n_epoch,
                                                     i + 1, len(trainloader),
                                                     loss.item()))

            numbers = [0]
            if i in numbers:
                # number 0 image in the batch
                tb_original_image = vutils.make_grid(image_original[0][0],
                                                     normalize=True, scale_each=True)
                writer.add_image('train/original_image', tb_original_image, epoch + 1)

                labels_original = labels_original.numpy()[0]
                correct_label_decoded = train_set.decode_segmap(np.squeeze(labels_original))
                writer.add_image('train/original_label',
                                 np_to_tb(correct_label_decoded), epoch + 1)

                out = F.softmax(outputs, dim=1)

                # this returns the max. channel number:
                prediction = out.max(1)[1].cpu().numpy()[0]
                # this returns the confidence:
                confidence = out.max(1)[0].cpu().detach()[0]
                tb_confidence = vutils.make_grid(confidence,
                                                 normalize=True, scale_each=True)

                decoded = train_set.decode_segmap(np.squeeze(prediction))
                writer.add_image('train/predicted', np_to_tb(decoded), epoch + 1)
                writer.add_image('train/confidence', tb_confidence, epoch + 1)

                unary = outputs.cpu().detach()
                unary_max = torch.max(unary)
                unary_min = torch.min(unary)
                unary = unary.add((-1 * unary_min))
                unary = unary / (unary_max - unary_min)

                for channel in range(0, len(class_names)):
                    decoded_channel = unary[0][channel]
                    tb_channel = vutils.make_grid(decoded_channel,
                                                  normalize=True, scale_each=True)
                    writer.add_image(f'train_classes/_{class_names[channel]}',
                                     tb_channel, epoch + 1)

        # Average metrics, and save them in the writer
        loss_train /= total_iteration
        score, class_iou = running_metrics.get_scores()
        writer.add_scalar('train/Pixel Acc', score['Pixel Acc: '], epoch + 1)
        writer.add_scalar('train/Mean Class Acc', score['Mean Class Acc: '], epoch + 1)
        writer.add_scalar('train/Freq Weighted IoU', score['Freq Weighted IoU: '], epoch + 1)
        writer.add_scalar('train/Mean_IoU', score['Mean IoU: '], epoch + 1)
        running_metrics.reset()
        writer.add_scalar('train/loss', loss_train, epoch + 1)

        if args.per_val != 0:
            with torch.no_grad():  # operations inside don't track history
                # Validation mode:
                model.eval()
                loss_val, total_iteration_val = 0, 0

                for i_val, (images_val, labels_val) in enumerate(valloader):
                    image_original, labels_original = images_val, labels_val
                    images_val, labels_val = images_val.to(device), labels_val.to(device)

                    # image_val = to_3_channels(images_val)
                    outputs_val = model(images_val)
                    # outputs_val = model(image_val)

                    pred = outputs_val.detach().max(1)[1].cpu().numpy()
                    gt = labels_val.detach().cpu().numpy()
                    running_metrics_val.update(gt, pred)

                    loss = loss_fn(input=outputs_val, target=labels_val)
                    loss_val += loss.item()
                    total_iteration_val = total_iteration_val + 1

                    if (i_val) % 20 == 0:
                        print("Epoch [%d/%d] validation Loss: %.4f"
                              % (epoch + 1, args.n_epoch, loss.item()))

                    numbers = [0]
                    if i_val in numbers:
                        # number 0 image in the batch
                        tb_original_image = vutils.make_grid(image_original[0][0],
                                                             normalize=True, scale_each=True)
                        writer.add_image('val/original_image', tb_original_image, epoch)

                        labels_original = labels_original.numpy()[0]
                        correct_label_decoded = train_set.decode_segmap(np.squeeze(labels_original))
                        writer.add_image('val/original_label',
                                         np_to_tb(correct_label_decoded), epoch + 1)

                        out = F.softmax(outputs_val, dim=1)

                        # this returns the max. channel number:
                        prediction = out.max(1)[1].cpu().detach().numpy()[0]
                        # this returns the confidence:
                        confidence = out.max(1)[0].cpu().detach()[0]
                        tb_confidence = vutils.make_grid(confidence,
                                                         normalize=True, scale_each=True)

                        decoded = train_set.decode_segmap(np.squeeze(prediction))
                        writer.add_image('val/predicted', np_to_tb(decoded), epoch + 1)
                        writer.add_image('val/confidence', tb_confidence, epoch + 1)

                        unary = outputs_val.cpu().detach()
                        unary_max, unary_min = torch.max(unary), torch.min(unary)
                        unary = unary.add((-1 * unary_min))
                        unary = unary / (unary_max - unary_min)

                        for channel in range(0, len(class_names)):
                            tb_channel = vutils.make_grid(unary[0][channel],
                                                          normalize=True, scale_each=True)
                            writer.add_image(f'val_classes/_{class_names[channel]}',
                                             tb_channel, epoch + 1)

                loss_val /= total_iteration_val
                score, class_iou = running_metrics_val.get_scores()

                # Append the epoch's metrics to CSV files under <val_fname>/metrics/
                # (the header is written only on the first epoch):
                metrics_dir = os.path.join(val_fname, "metrics")
                pd.DataFrame([score["Pixel Acc: "]]).to_csv(
                    os.path.join(metrics_dir, "pixel_acc.csv"),
                    index=False, mode='a', header=(epoch == 0))
                pd.DataFrame([score["Mean Class Acc: "]]).to_csv(
                    os.path.join(metrics_dir, "mean_class_acc.csv"),
                    index=False, mode='a', header=(epoch == 0))
                pd.DataFrame([score["Freq Weighted IoU: "]]).to_csv(
                    os.path.join(metrics_dir, "freq_weighted_iou.csv"),
                    index=False, mode='a', header=(epoch == 0))
                pd.DataFrame([score["Mean IoU: "]]).to_csv(
                    os.path.join(metrics_dir, "mean_iou.csv"),
                    index=False, mode='a', header=(epoch == 0))

                cname = os.path.join(metrics_dir, "confusion_matrix",
                                     "confusion_matrix_" + str(epoch + 1) + ".csv")
                pd.DataFrame(score["confusion_matrix"]).to_csv(cname, index=False)

                pd.DataFrame(score["Class Accuracy: "].reshape((1, 6)),
                             columns=[0, 1, 2, 3, 4, 5]).to_csv(
                    os.path.join(metrics_dir, "class_acc.csv"),
                    index=False, mode="a", header=(epoch == 0))
                pd.DataFrame(class_iou, columns=[0, 1, 2, 3, 4, 5], index=[0]).to_csv(
                    os.path.join(metrics_dir, "cls_iu.csv"),
                    mode="a", header=(epoch == 0))

                writer.add_scalar('val/Pixel Acc', score['Pixel Acc: '], epoch + 1)
                writer.add_scalar('val/Mean IoU', score['Mean IoU: '], epoch + 1)
                writer.add_scalar('val/Mean Class Acc', score['Mean Class Acc: '], epoch + 1)
                writer.add_scalar('val/Freq Weighted IoU', score['Freq Weighted IoU: '], epoch + 1)
                writer.add_scalar('val/loss', loss_val, epoch + 1)
                running_metrics_val.reset()

                if score['Mean IoU: '] >= best_iou:
                    best_iou = score['Mean IoU: ']
                    model_dir = os.path.join(log_dir, f"{args.arch}_model.pkl")
                    # torch.save(model, model_dir)
                    torch.save({'epoch': epoch + 1,
                                'state_dict': model.state_dict(),
                                'optimizer': optimizer.state_dict()},
                               model_fname % (epoch + 1))

        else:  # validation is turned off:
            # just save the latest model every 5 epochs:
            if (epoch + 1) % 5 == 0:
                model_dir = os.path.join(log_dir, f"{args.arch}_ep{epoch+1}_model.pkl")
                # torch.save(model, model_dir)
                torch.save({'epoch': epoch + 1,
                            'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict()},
                           model_fname % (epoch + 1))

        writer.add_scalar('train/epoch_lr', optimizer.param_groups[0]["lr"], epoch + 1)

    writer.close()
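
# Usage sketch (assumption, not part of the original script): a minimal argparse
# driver for the train() function above. The flag names mirror the attributes the
# function actually reads from `args`; the default values are illustrative guesses
# and may differ from the project's real configuration.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Patch-based seismic facies training')
    parser.add_argument('--arch', type=str, default='resnet9')       # architecture tag used in log paths
    parser.add_argument('--exp', type=str, default='baseline')       # experiment tag used in file names
    parser.add_argument('--n_epoch', type=int, default=60)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--base_lr', type=float, default=0.01)
    parser.add_argument('--clip', type=float, default=0.1)           # 0 disables gradient clipping
    parser.add_argument('--per_val', type=float, default=0.1)        # fraction of data held out for validation
    parser.add_argument('--stride', type=int, default=50)            # patch stride for PatchLoader
    parser.add_argument('--patch_size', type=int, default=99)        # patch size for PatchLoader
    parser.add_argument('--aug', action='store_true')                # enable augmentations
    parser.add_argument('--class_weights', action='store_true')      # use inverse-frequency class weights
    parser.add_argument('--train', action='store_true')              # enable the Adam / freeze-BN branch
    parser.add_argument('--freeze_bn', action='store_true')          # freeze BatchNorm layers
    parser.add_argument('--scratch', action='store_true')            # passed through to the ResNet9 constructor
    parser.add_argument('--groups', type=int, default=32)            # passed through to the ResNet9 constructor
    parser.add_argument('--weight_std', action='store_true')         # passed through to the ResNet9 constructor
    parser.add_argument('--beta', action='store_true')               # passed through to the ResNet9 constructor
    parser.add_argument('--resume', type=str, default=None)          # path to a checkpoint, or None
    train(parser.parse_args())
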
def train(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Generate the train and validation sets for the model:
    split_train_val(args, per_val=args.per_val)

    current_time = datetime.now().strftime('%b%d_%H%M%S')
    log_dir = os.path.join('runs', current_time + "_{}_{}".format(args.arch, args.loss))
    writer = SummaryWriter(log_dir=log_dir)

    # Setup augmentations
    if args.aug:
        data_aug = Compose([RandomRotate(30), RandomHorizontallyFlip(), AddNoise()])
    else:
        data_aug = None

    train_set = patch_loader(is_transform=True,
                             split='train',
                             stride=args.stride,
                             patch_size=args.patch_size,
                             augmentations=data_aug)
    # Without augmentation:
    val_set = patch_loader(is_transform=True,
                           split='val',
                           stride=args.stride,
                           patch_size=args.patch_size)
    n_classes = train_set.n_classes

    trainloader = data.DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  num_workers=4,
                                  shuffle=True)
    valloader = data.DataLoader(val_set,
                                batch_size=args.batch_size,
                                num_workers=4)

    # Setup metrics
    running_metrics = runningScore(n_classes)
    running_metrics_val = runningScore(n_classes)

    # Setup model
    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("Loading model and optimizer from checkpoint '{}'".format(args.resume))
            model = torch.load(args.resume)
        else:
            print("No checkpoint found at '{}'".format(args.resume))
    else:
        model = get_model(args.arch, args.pretrained, n_classes)

    # Use as many GPUs as we can
    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    model = model.to(device)  # send to GPU

    # PYTORCH NOTE: always construct optimizers after the model is pushed to the GPU/CPU.
    # Check if the model has a custom optimizer / loss
    if hasattr(model.module, 'optimizer'):
        print('Using custom optimizer')
        optimizer = model.module.optimizer
    else:
        # optimizer = torch.optim.Adadelta(model.parameters())
        optimizer = torch.optim.Adam(model.parameters(), amsgrad=True)

    if args.loss == 'FL':
        loss_fn = core.loss.focal_loss2d
    else:
        loss_fn = core.loss.cross_entropy

    if args.class_weights:
        # Weights are inversely proportional to the frequency of the classes in the training set
        class_weights = torch.tensor(
            [0.7151, 0.8811, 0.5156, 0.9346, 0.9683, 0.9852],
            device=device, requires_grad=False)
    else:
        class_weights = None

    best_iou = -100.0
    class_names = ['upper_ns', 'middle_ns', 'lower_ns',
                   'rijnland_chalk', 'scruff', 'zechstein']

    for arg in vars(args):
        text = arg + ': ' + str(getattr(args, arg))
        writer.add_text('Parameters/', text)

    # training
    for epoch in range(args.n_epoch):
        # Training mode:
        model.train()
        loss_train, total_iteration = 0, 0

        for i, (images, labels) in enumerate(trainloader):
            image_original, labels_original = images, labels
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(images)

            pred = outputs.detach().max(1)[1].cpu().numpy()
            gt = labels.detach().cpu().numpy()
            running_metrics.update(gt, pred)

            loss = loss_fn(input=outputs, target=labels, weight=class_weights)
            loss_train += loss.item()
            loss.backward()

            # gradient clipping
            if args.clip != 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

            optimizer.step()
            total_iteration = total_iteration + 1

            if (i) % 20 == 0:
                print("Epoch [%d/%d] training Loss: %.4f"
                      % (epoch + 1, args.n_epoch, loss.item()))

            numbers = [0]
            if i in numbers:
                # number 0 image in the batch
                tb_original_image = vutils.make_grid(image_original[0][0],
                                                     normalize=True, scale_each=True)
                writer.add_image('train/original_image', tb_original_image, epoch + 1)

                labels_original = labels_original.numpy()[0]
                correct_label_decoded = train_set.decode_segmap(np.squeeze(labels_original))
                writer.add_image('train/original_label', correct_label_decoded, epoch + 1)

                out = F.softmax(outputs, dim=1)

                # this returns the max. channel number:
                prediction = out.max(1)[1].cpu().numpy()[0]
                # this returns the confidence:
                confidence = out.max(1)[0].cpu().detach()[0]
                tb_confidence = vutils.make_grid(confidence,
                                                 normalize=True, scale_each=True)

                decoded = train_set.decode_segmap(np.squeeze(prediction))
                writer.add_image('train/predicted', decoded, epoch + 1)
                writer.add_image('train/confidence', tb_confidence, epoch + 1)

                unary = outputs.cpu().detach()
                unary_max = torch.max(unary)
                unary_min = torch.min(unary)
                unary = unary.add((-1 * unary_min))
                unary = unary / (unary_max - unary_min)

                for channel in range(0, len(class_names)):
                    decoded_channel = unary[0][channel]
                    tb_channel = vutils.make_grid(decoded_channel,
                                                  normalize=True, scale_each=True)
                    writer.add_image(f'train_classes/_{class_names[channel]}',
                                     tb_channel, epoch + 1)

        # Average metrics, and save them in the writer
        loss_train /= total_iteration
        score, class_iou = running_metrics.get_scores()
        writer.add_scalar('train/Pixel Acc', score['Pixel Acc: '], epoch + 1)
        writer.add_scalar('train/Mean Class Acc', score['Mean Class Acc: '], epoch + 1)
        writer.add_scalar('train/Freq Weighted IoU', score['Freq Weighted IoU: '], epoch + 1)
        writer.add_scalar('train/Mean_IoU', score['Mean IoU: '], epoch + 1)
        running_metrics.reset()
        writer.add_scalar('train/loss', loss_train, epoch + 1)

        if args.per_val != 0:
            with torch.no_grad():  # operations inside don't track history
                # Validation mode:
                model.eval()
                loss_val, total_iteration_val = 0, 0

                for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
                    image_original, labels_original = images_val, labels_val
                    images_val, labels_val = images_val.to(device), labels_val.to(device)

                    outputs_val = model(images_val)

                    pred = outputs_val.detach().max(1)[1].cpu().numpy()
                    gt = labels_val.detach().cpu().numpy()
                    running_metrics_val.update(gt, pred)

                    loss = loss_fn(input=outputs_val, target=labels_val)
                    total_iteration_val = total_iteration_val + 1

                    if (i_val) % 20 == 0:
                        print("Epoch [%d/%d] validation Loss: %.4f"
                              % (epoch, args.n_epoch, loss.item()))

                    numbers = [0]
                    if i_val in numbers:
                        # number 0 image in the batch
                        tb_original_image = vutils.make_grid(image_original[0][0],
                                                             normalize=True, scale_each=True)
                        writer.add_image('val/original_image', tb_original_image, epoch)

                        labels_original = labels_original.numpy()[0]
                        correct_label_decoded = train_set.decode_segmap(np.squeeze(labels_original))
                        writer.add_image('val/original_label', correct_label_decoded, epoch + 1)

                        out = F.softmax(outputs_val, dim=1)

                        # this returns the max. channel number:
                        prediction = out.max(1)[1].cpu().detach().numpy()[0]
                        # this returns the confidence:
                        confidence = out.max(1)[0].cpu().detach()[0]
                        tb_confidence = vutils.make_grid(confidence,
                                                         normalize=True, scale_each=True)

                        decoded = train_set.decode_segmap(np.squeeze(prediction))
                        writer.add_image('val/predicted', decoded, epoch + 1)
                        writer.add_image('val/confidence', tb_confidence, epoch + 1)

                        unary = outputs_val.cpu().detach()
                        unary_max, unary_min = torch.max(unary), torch.min(unary)
                        unary = unary.add((-1 * unary_min))
                        unary = unary / (unary_max - unary_min)

                        for channel in range(0, len(class_names)):
                            tb_channel = vutils.make_grid(unary[0][channel],
                                                          normalize=True, scale_each=True)
                            writer.add_image(f'val_classes/_{class_names[channel]}',
                                             tb_channel, epoch + 1)

                score, class_iou = running_metrics_val.get_scores()
                for k, v in score.items():
                    print(k, v)

                writer.add_scalar('val/Pixel Acc', score['Pixel Acc: '], epoch + 1)
                writer.add_scalar('val/Mean IoU', score['Mean IoU: '], epoch + 1)
                writer.add_scalar('val/Mean Class Acc', score['Mean Class Acc: '], epoch + 1)
                writer.add_scalar('val/Freq Weighted IoU', score['Freq Weighted IoU: '], epoch + 1)
                writer.add_scalar('val/loss', loss.item(), epoch + 1)
                running_metrics_val.reset()

                if score['Mean IoU: '] >= best_iou:
                    best_iou = score['Mean IoU: ']
                    model_dir = os.path.join(log_dir, f"{args.arch}_{args.loss}_model.pkl")
                    torch.save(model, model_dir)

        else:  # validation is turned off:
            # just save the latest model every 5 epochs:
            if (epoch + 1) % 5 == 0:
                model_dir = os.path.join(log_dir, f"{args.arch}_{args.loss}_ep{epoch+1}_model.pkl")
                torch.save(model, model_dir)

    writer.close()
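
# Reference sketch (assumption): core.loss.focal_loss2d, selected above when
# args.loss == 'FL', is not shown in this listing. The function below is one
# common way to implement a 2D focal loss over dense segmentation logits,
# matching the call pattern used above (loss_fn(input=outputs, target=labels,
# weight=class_weights)); it is not necessarily identical to the project's own
# implementation.
import torch
import torch.nn.functional as F


def focal_loss2d_sketch(input, target, weight=None, gamma=2.0, ignore_index=255):
    """Focal loss over per-pixel class logits.

    input:  (N, C, H, W) raw logits
    target: (N, H, W) integer class labels
    """
    # Per-pixel cross-entropy without reduction; this already equals -log(p_t).
    ce = F.cross_entropy(input, target, weight=weight,
                         ignore_index=ignore_index, reduction='none')
    pt = torch.exp(-ce)                  # probability assigned to the true class
    focal = ((1.0 - pt) ** gamma) * ce   # down-weight well-classified pixels
    return focal.mean()
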
def train(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # select the torch device

    # Generate the train and validation sets for the model as text files:
    split_train_val(args, per_val=args.per_val)

    current_time = datetime.now().strftime('%b%d_%H%M%S')  # current date and time
    log_dir = os.path.join('runs', current_time + f"_{args.arch}_{args.model_name}")  # create the log directory
    writer = SummaryWriter(log_dir=log_dir)  # initialize the TensorBoard summary writer

    # Setup augmentations
    if args.aug:  # if augmentation is requested
        data_aug = Compose([RandomRotate(10), RandomHorizontallyFlip(), AddNoise()])  # compose the augmentation functions
    else:
        data_aug = None

    loader = section_loader  # name the loader
    # Use the custom data loader to build the training set (an instance of the loader class):
    train_set = loader(is_transform=True, split='train', augmentations=data_aug)
    # Use the custom data loader to build the validation set:
    val_set = loader(is_transform=True, split='val')
    n_classes = train_set.n_classes  # the number of classes is hard-coded in the data loader

    # Create sampler:
    shuffle = False  # must be False if using a custom sampler

    # Load the section lists previously stored as text files by split_train_val():
    with open(pjoin('data', 'splits', 'section_train.txt'), 'r') as f:
        train_list = f.read().splitlines()
    with open(pjoin('data', 'splits', 'section_val.txt'), 'r') as f:
        val_list = f.read().splitlines()

    class CustomSamplerTrain(torch.utils.data.Sampler):  # custom sampler for training sections
        def __iter__(self):
            char = ['i' if np.random.randint(2) == 1 else 'x']  # choose randomly between 'i' and 'x'
            # Index either all inlines or all crosslines from the training list:
            self.indices = [idx for (idx, name) in enumerate(train_list) if char[0] in name]
            return (self.indices[i] for i in torch.randperm(len(self.indices)))  # shuffle the indices and return them

    class CustomSamplerVal(torch.utils.data.Sampler):  # custom sampler for validation sections
        def __iter__(self):
            char = ['i' if np.random.randint(2) == 1 else 'x']  # choose randomly between 'i' and 'x'
            # Index either all inlines or all crosslines from the validation list:
            self.indices = [idx for (idx, name) in enumerate(val_list) if char[0] in name]
            return (self.indices[i] for i in torch.randperm(len(self.indices)))  # shuffle the indices and return them

    # Use the PyTorch data loader to batch the training and validation sets:
    trainloader = data.DataLoader(train_set,
                                  batch_size=args.batch_size,
                                  num_workers=12,
                                  shuffle=True)
    valloader = data.DataLoader(val_set,
                                batch_size=args.batch_size,
                                num_workers=12)

    # Setup metrics
    running_metrics = runningScore(n_classes)       # evaluation metrics for training
    running_metrics_val = runningScore(n_classes)   # evaluation metrics for validation

    # Setup model
    if args.resume is not None:  # check whether a stored model was requested
        if os.path.isfile(args.resume):  # if yes, load the stored model
            print("Loading model and optimizer from checkpoint '{}'".format(args.resume))
            model = torch.load(args.resume)
        else:
            print("No checkpoint found at '{}'".format(args.resume))  # stored model requested with an invalid path
    else:  # no stored model, so build the requested architecture
        # n_classes = 64
        model = get_model(name=args.arch,
                          pretrained=args.pretrained,
                          batch_size=args.batch_size,
                          growth_rate=32,
                          drop_rate=0,
                          n_classes=n_classes)

    # Use as many GPUs as we can
    model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    model = model.to(device)  # send to GPU

    # Check if the model has a custom optimizer / loss
    if hasattr(model.module, 'optimizer'):
        print('Using custom optimizer')
        optimizer = model.module.optimizer
    else:
        # No model-specific optimizer, so fall back to the default one:
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.lr,
                                     amsgrad=True,
                                     weight_decay=args.weight_decay,
                                     eps=args.eps)

    loss_fn = core.loss.focal_loss2d  # loss function

    if args.class_weights:  # initialize class weights if they are to be used
        # Weights are inversely proportional to the frequency of the classes in the training set
        class_weights = torch.tensor(
            [0.7151, 0.8811, 0.5156, 0.9346, 0.9683, 0.9852],
            device=device, requires_grad=False)
    else:
        class_weights = None

    best_iou = -100.0
    class_names = ['null', 'upper_ns', 'middle_ns', 'lower_ns',
                   'rijnland_chalk', 'scruff', 'zechstein']  # names of the different classes

    # Before training starts, write a summary of the run parameters:
    for arg in vars(args):
        text = arg + ': ' + str(getattr(args, arg))  # attribute name and value as a string
        writer.add_text('Parameters/', text)

    # training
    for epoch in range(args.n_epoch):
        # Training mode:
        model.train()
        loss_train, total_iteration = 0, 0  # training loss and total number of iterations

        for i, (images, labels) in enumerate(trainloader):  # i is the batch number
            image_original, labels_original = images, labels  # keep the original image and label batch
            images, labels = images.to(device), labels.to(device)  # move images and labels to the GPU

            optimizer.zero_grad()  # reset the gradients
            outputs = model(images)  # forward pass (outputs is a 7-channel map)

            pred = outputs.detach().max(1)[1].cpu().numpy()  # index of the maximum channel, back on the CPU
            gt = labels.detach().cpu().numpy()  # true labels back on the CPU
            running_metrics.update(gt, pred)  # update the metrics with ground truth and predictions

            loss = loss_fn(input=outputs, target=labels,
                           gamma=args.gamma, loss_type=args.loss_parameters)  # compute the loss
            loss_train += loss.item()  # scalar value held in the loss
            loss.backward()  # backward pass: gradients of the loss w.r.t. all tensors with requires_grad=True

            # Gradient clipping: the norm is computed over all gradients together, as if they
            # were concatenated into a single vector; gradients are modified in-place.
            if args.clip != 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

            optimizer.step()  # update the model weights with the new gradients
            total_iteration = total_iteration + 1  # increment the total number of iterations

            if (i) % 20 == 0:  # every 20 iterations, print the training loss
                print("Epoch [%d/%d] training Loss: %.4f"
                      % (epoch + 1, args.n_epoch, loss.item()))

            numbers = [0, 14, 29, 49, 99]  # batches at which images are logged
            if i in numbers:
                # number 0 image in the batch: build a TensorBoard grid from the image tensor
                tb_original_image = vutils.make_grid(image_original[0][0],
                                                     normalize=True, scale_each=True)
                writer.add_image('train/original_image', tb_original_image, epoch + 1)

                labels_original = labels_original.numpy()[0]  # ground-truth labels of the first image as a numpy array
                correct_label_decoded = train_set.decode_segmap(np.squeeze(labels_original))  # decode class labels into a colour image
                writer.add_image('train/original_label',
                                 np_to_tb(correct_label_decoded), epoch + 1)

                out = F.softmax(outputs, dim=1)  # softmax of the network output
                prediction = out.max(1)[1].cpu().numpy()[0]  # index of the maximum value after softmax
                confidence = out.max(1)[0].cpu().detach()[0]  # confidence in the chosen class
                tb_confidence = vutils.make_grid(confidence, normalize=True, scale_each=True)

                decoded = train_set.decode_segmap(np.squeeze(prediction))  # decode predicted classes to colours
                writer.add_image('train/predicted', np_to_tb(decoded), epoch + 1)
                writer.add_image('train/confidence', tb_confidence, epoch + 1)

                # Network output for the whole batch, normalized w.r.t. the whole batch:
                unary = outputs.cpu().detach()
                unary_max = torch.max(unary)
                unary_min = torch.min(unary)
                unary = unary.add((-1 * unary_min))
                unary = unary / (unary_max - unary_min)

                for channel in range(0, len(class_names)):
                    decoded_channel = unary[0][channel]  # normalized output of the first image in the batch
                    tb_channel = vutils.make_grid(decoded_channel,
                                                  normalize=True, scale_each=True)
                    writer.add_image(f'train_classes/_{class_names[channel]}',
                                     tb_channel, epoch + 1)

        # Average metrics after finishing all batches of the epoch, and save them in the writer
        loss_train /= total_iteration  # total loss over all iterations / number of iterations
        score, class_iou = running_metrics.get_scores()  # dictionary of accuracy metrics and per-class IoU
        writer.add_scalar('train/Pixel Acc', score['Pixel Acc: '], epoch + 1)
        writer.add_scalar('train/Mean Class Acc', score['Mean Class Acc: '], epoch + 1)
        writer.add_scalar('train/Freq Weighted IoU', score['Freq Weighted IoU: '], epoch + 1)
        writer.add_scalar('train/Mean_IoU', score['Mean IoU: '], epoch + 1)

        confusion = score['confusion_matrix']
        writer.add_image('train/confusion matrix', np_to_tb(confusion), epoch + 1)
        running_metrics.reset()  # reset the confusion matrix
        writer.add_scalar('train/loss', loss_train, epoch + 1)  # store the training loss

        # Finished one epoch of training; start one epoch of validation
        if args.per_val != 0:  # if validation is required
            with torch.no_grad():  # operations inside don't track history
                # Validation mode:
                model.eval()
                loss_val, total_iteration_val = 0, 0  # validation loss and total number of iterations

                for i_val, (images_val, labels_val) in tqdm(enumerate(valloader)):
                    image_original, labels_original = images_val, labels_val  # keep the original validation batch
                    images_val, labels_val = images_val.to(device), labels_val.to(device)  # send images and labels to the GPU

                    outputs_val = model(images_val)  # forward pass

                    pred = outputs_val.detach().max(1)[1].cpu().numpy()  # predicted classes
                    gt = labels_val.detach().cpu().numpy()  # ground truth back on the CPU
                    running_metrics_val.update(gt, pred)  # update the validation metrics

                    loss = loss_fn(input=outputs_val, target=labels_val,
                                   gamma=args.gamma, loss_type=args.loss_parameters)
                    total_iteration_val = total_iteration_val + 1

                    if (i_val) % 20 == 0:  # every 20 validation batches, print the validation loss
                        print("Epoch [%d/%d] validation Loss: %.4f"
                              % (epoch, args.n_epoch, loss.item()))

                    numbers = [0]
                    if i_val in numbers:  # batch number 0
                        # number 0 image in the batch
                        tb_original_image = vutils.make_grid(image_original[0][0],
                                                             normalize=True, scale_each=True)
                        writer.add_image('val/original_image', tb_original_image, epoch)

                        labels_original = labels_original.numpy()[0]  # original labels of image 0
                        correct_label_decoded = train_set.decode_segmap(np.squeeze(labels_original))  # labels to colour map
                        writer.add_image('val/original_label',
                                         np_to_tb(correct_label_decoded), epoch + 1)

                        out = F.softmax(outputs_val, dim=1)  # softmax of the 7-channel network output
                        # this returns the max. channel number:
                        prediction = out.max(1)[1].cpu().detach().numpy()[0]
                        # this returns the confidence:
                        confidence = out.max(1)[0].cpu().detach()[0]
                        tb_confidence = vutils.make_grid(confidence, normalize=True, scale_each=True)

                        decoded = train_set.decode_segmap(np.squeeze(prediction))  # predicted classes to colour map
                        writer.add_image('val/predicted', np_to_tb(decoded), epoch + 1)
                        writer.add_image('val/confidence', tb_confidence, epoch + 1)

                        # Network output for the current batch, normalized across the whole output:
                        unary = outputs_val.cpu().detach()
                        unary_max, unary_min = torch.max(unary), torch.min(unary)
                        unary = unary.add((-1 * unary_min))
                        unary = unary / (unary_max - unary_min)

                        for channel in range(0, len(class_names)):  # all channels of the network output
                            tb_channel = vutils.make_grid(unary[0][channel],
                                                          normalize=True, scale_each=True)
                            writer.add_image(f'val_classes/_{class_names[channel]}',
                                             tb_channel, epoch + 1)

                # Finished one validation cycle over all validation batches
                score, class_iou = running_metrics_val.get_scores()  # dictionary of accuracy metrics and per-class IoU
                for k, v in score.items():
                    print(k, v)

                writer.add_scalar('val/Pixel Acc', score['Pixel Acc: '], epoch + 1)
                writer.add_scalar('val/Mean IoU', score['Mean IoU: '], epoch + 1)
                writer.add_scalar('val/Mean Class Acc', score['Mean Class Acc: '], epoch + 1)
                writer.add_scalar('val/Freq Weighted IoU', score['Freq Weighted IoU: '], epoch + 1)

                confusion = score['confusion_matrix']
                writer.add_image('val/confusion matrix', np_to_tb(confusion), epoch + 1)
                writer.add_scalar('val/loss', loss.item(), epoch + 1)
                running_metrics_val.reset()  # reset the confusion matrix

                # Compare the current epoch's validation mean IoU with the best stored value;
                # if it is better, keep it and save the current model as the best model:
                if score['Mean IoU: '] >= best_iou:
                    best_iou = score['Mean IoU: ']
                    model_dir = os.path.join(log_dir, f"{args.arch}_model_best.pkl")
                    torch.save(model, model_dir)

                if epoch % 10 == 0:  # every 10 epochs, store the current model
                    model_dir = os.path.join(log_dir, f"{args.arch}_ep{epoch}_model.pkl")
                    torch.save(model, model_dir)

        else:  # validation is turned off:
            # just save the latest model every 10 epochs:
            if (epoch + 1) % 10 == 0:
                model_dir = os.path.join(log_dir, f"{args.arch}_ep{epoch + 1}_model.pkl")
                torch.save(model, model_dir)

    writer.close()  # close the writer
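
# Sketch (assumption): the train() above defines CustomSamplerTrain / CustomSamplerVal
# and notes that shuffle must be False with a custom sampler, but its DataLoaders are
# built with shuffle=True and no sampler, so the samplers are effectively unused.
# The standalone example below shows how such an inline/crossline sampler could be
# wired into a DataLoader instead of shuffle=True; the toy dataset and name list are
# illustrative stand-ins for section_loader and data/splits/section_train.txt.
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, Sampler


class InlineCrosslineSampler(Sampler):
    """Each epoch, randomly pick inlines ('i') or crosslines ('x') and yield only those indices."""

    def __init__(self, name_list):
        self.name_list = name_list

    def __iter__(self):
        char = 'i' if np.random.randint(2) == 1 else 'x'
        indices = [idx for idx, name in enumerate(self.name_list) if char in name]
        return (indices[i] for i in torch.randperm(len(indices)))

    def __len__(self):
        # Upper bound: the iterator yields only the inline or crossline subset.
        return len(self.name_list)


class ToySections(Dataset):
    """Minimal dataset that just returns the section name for each index."""

    def __init__(self, names):
        self.names = names

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        return self.names[idx]


if __name__ == '__main__':
    names = ['i_100', 'i_101', 'x_200', 'x_201']
    loader = DataLoader(ToySections(names),
                        batch_size=2,
                        sampler=InlineCrosslineSampler(names),  # a sampler replaces shuffle=True
                        num_workers=0)
    for batch in loader:
        print(batch)  # only inline or only crossline names per epoch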