def main():
    """Train a two-input (image pair) ERFNet change-detection model on idd_lite.

    Writes, per tracked validation pair, the input images and label once, then a
    thresholded prediction after every epoch (qualitative progress tracking),
    saves the final weights, and dumps thresholded predictions for the test set.
    Relies on module-level config: ENCODER_ONLY, IMAGE_HEIGHT, DATA_ROOT,
    NUM_WORKERS, BATCH_SIZE, OUTPUT_DIR, NUM_CLASSES, NUM_EPOCHS, IOUTRAIN,
    IOUVAL, device.
    """
    best_acc = 0  # NOTE(review): never updated in this function — confirm whether tracking was intended

    co_transform = MyCoTransform(ENCODER_ONLY, augment=True, height=IMAGE_HEIGHT)
    co_transform_val = MyCoTransform(ENCODER_ONLY, augment=False, height=IMAGE_HEIGHT)

    # train data
    dataset_train = idd_lite(DATA_ROOT, co_transform, 'train')
    print("length of training set: ", len(dataset_train))
    # test data
    dataset_val = idd_lite(DATA_ROOT, co_transform_val, 'val')
    print("length of validation set: ", len(dataset_val))

    # NOTE: PLEASE DON'T CHANGE batch_size and num_workers here. We have limited resources.
    loader_train = DataLoader(dataset_train, num_workers=NUM_WORKERS,
                              batch_size=BATCH_SIZE, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=NUM_WORKERS,
                            batch_size=BATCH_SIZE, shuffle=True)

    # Grab 7 fixed validation pairs up front so the SAME images can be
    # re-predicted after every epoch. Their inputs/labels are saved once here.
    dataiter = iter(loader_val)
    seven_val_images = []
    for i in range(7):
        # FIX: `iterator.next()` does not exist on Python 3 / modern PyTorch
        # DataLoader iterators — the builtin next() is the supported call.
        (val_image_A, val_image_B, val_image_labels) = next(dataiter)
        seven_val_images.append(
            (val_image_A.to(device), val_image_B.to(device)))
        # FIX: ensure the per-image output directory exists; cv2.imwrite fails
        # silently (returns False) when the target directory is missing.
        os.makedirs(os.path.join(OUTPUT_DIR, str(i)), exist_ok=True)
        # rollaxis: CHW float tensor -> HWC uint8 image for OpenCV.
        cv2.imwrite(
            os.path.join(OUTPUT_DIR, str(i), 'A.tiff'),
            np.rollaxis((val_image_A[0, :, :, :].squeeze().cpu().numpy()
                         * 255).astype('uint8'), 0, 3))
        cv2.imwrite(
            os.path.join(OUTPUT_DIR, str(i), 'B.tiff'),
            np.rollaxis((val_image_B[0, :, :, :].squeeze().cpu().numpy()
                         * 255).astype('uint8'), 0, 3))
        cv2.imwrite(
            os.path.join(OUTPUT_DIR, str(i), 'label.tiff'),
            (val_image_labels[0, :, :, :].squeeze().cpu().numpy()
             ).astype('uint8'))

    # ## Cross Entropy Loss ##
    # The negative log-likelihood is low when the network assigns high
    # confidence to the correct class and high when it assigns low confidence.
    criterion = torch.nn.CrossEntropyLoss()

    # get some random training images (sanity check of the loader output)
    print("length of training couples: ", len(loader_train))
    print(len(loader_val))
    dataiter = iter(loader_train)
    # FIX: next(dataiter), as above. NOTE(review): the train loader yields a
    # 4-tuple while the val loader above yields a 3-tuple — confirm the two
    # dataset splits intentionally return different structures.
    (images, images1, labels, filename) = next(dataiter)  # ChangedByUs

    # ## Model ##
    model_file = importlib.import_module('erfnet')
    model = model_file.Net(NUM_CLASSES).to(device)

    # ### Optimizer ###
    # Adam; can be replaced with SGD and other optimizers.
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),
                     eps=1e-08, weight_decay=1e-4)
    start_epoch = 1
    print("device used: ", device)

    # ### Training Procedure ###
    softmax = torch.nn.Softmax(dim=1)
    steps_loss = 50
    my_start_time = time.time()
    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        epoch_loss = []
        time_train = []
        doIouTrain = IOUTRAIN
        doIouVal = IOUVAL
        if (doIouTrain):
            iouEvalTrain = iouEval(NUM_CLASSES)

        model.train()
        for step, (images, images1, labels, filename) in enumerate(loader_train):  # ChangedByUs
            start_time = time.time()
            inputs = images.to(device)
            inputs1 = images1.to(device)  # ChangedByUs
            targets = labels.to(device)
            # Binarize labels: >= 128 -> class 1 (change), < 128 -> class 0.
            targets_orig = targets.clone()
            targets[targets_orig >= 128] = 1  # ChangedByUs
            targets[targets_orig < 128] = 0  # ChangedByUs

            outputs = model([inputs, inputs1], only_encode=ENCODER_ONLY)

            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            # print statistics
            if steps_loss > 0 and step % steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    'loss: {average:', average, '} (epoch: {', epoch,
                    '}, step: {', step, '})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / BATCH_SIZE))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Save one thresholded prediction per tracked validation pair.
        # FIX: run without autograd bookkeeping — these forwards are
        # inference-only and previously kept whole graphs alive each epoch.
        with torch.no_grad():
            for i in range(len(seven_val_images)):
                outputs_val = model(
                    [seven_val_images[i][0].cuda(), seven_val_images[i][1].cuda()],
                    only_encode=ENCODER_ONLY)
                outputs_val = softmax(outputs_val)
                cv2.imwrite(
                    os.path.join(OUTPUT_DIR, str(i),
                                 'epoch' + str(epoch) + '_output.tiff'),
                    (((outputs_val[0, 1, :, :] > 0.5) * 255)
                     .squeeze().cpu().numpy()).astype('uint8'))

    my_end_time = time.time()
    print(my_end_time - my_start_time)
    print(
        'loss: {average:', average, '} (epoch: {', epoch, '}, step: {', step,
        '})',
        "// Avg time/img: %.4f s" %
        (sum(time_train) / len(time_train) / BATCH_SIZE))

    # ### Visualizing the Output ###
    # NOTE(review): hard-coded absolute save path — consider making this configurable.
    torch.save(model.state_dict(), r'C:\Users\inbal.tlgip\modelsave.pt')

    # Qualitative Analysis
    ##################### calc iou on test data #####################
    dataset_test = idd_lite(DATA_ROOT, co_transform_val, 'test')
    loader_test = DataLoader(dataset_test, num_workers=NUM_WORKERS,
                             batch_size=BATCH_SIZE, shuffle=True)
    # FIX: inference-only pass — wrap in no_grad (see above).
    with torch.no_grad():
        for step, (images, images1, labels, filename) in enumerate(loader_test):
            outputs_val = model([images.cuda(), images1.cuda()],
                                only_encode=ENCODER_ONLY)
            outputs_val = softmax(outputs_val)
            cv2.imwrite(
                r'D:\Users Data\inbal.tlgip\Project\output_images\test_output/'
                + str(step) + '.tiff',
                (((outputs_val[0, 1, :, :] > 0.5) * 255)
                 .squeeze().cpu().numpy()).astype('uint8'))
def train(args, model, enc=False):
    """Train `model` on Cityscapes (19 classes + void) and return it.

    enc=True trains the encoder only (with encoder class weights). Writes
    checkpoints, best-model snapshots, a model description and a tab-separated
    per-epoch log under ../save/<args.savedir>. Resumes from a checkpoint when
    args.resume is set. Relies on module-level NUM_CLASSES, MyCoTransform,
    cityscapes, CrossEntropyLoss2d, iouEval, getColorEntry, Dashboard,
    color_transform and save_checkpoint.
    """
    best_acc = 0

    # TODO: calculate weights by processing dataset histogram (now its being set
    # by hand from the torch values). Create a loader to run all images and
    # calculate histogram of labels, then build the weight array by class balancing.
    # NOTE(review): indices 0..19 below assume NUM_CLASSES == 20 — confirm.
    weight = torch.ones(NUM_CLASSES)
    if (enc):
        weight[0] = 2.3653597831726
        weight[1] = 4.4237880706787
        weight[2] = 2.9691488742828
        weight[3] = 5.3442072868347
        weight[4] = 5.2983593940735
        weight[5] = 5.2275490760803
        weight[6] = 5.4394111633301
        weight[7] = 5.3659925460815
        weight[8] = 3.4170460700989
        weight[9] = 5.2414722442627
        weight[10] = 4.7376127243042
        weight[11] = 5.2286224365234
        weight[12] = 5.455126285553
        weight[13] = 4.3019247055054
        weight[14] = 5.4264230728149
        weight[15] = 5.4331531524658
        weight[16] = 5.433765411377
        weight[17] = 5.4631009101868
        weight[18] = 5.3947434425354
    else:
        weight[0] = 2.8149201869965
        weight[1] = 6.9850029945374
        weight[2] = 3.7890393733978
        weight[3] = 9.9428062438965
        weight[4] = 9.7702074050903
        weight[5] = 9.5110931396484
        weight[6] = 10.311357498169
        weight[7] = 10.026463508606
        weight[8] = 4.6323022842407
        weight[9] = 9.5608062744141
        weight[10] = 7.8698215484619
        weight[11] = 9.5168733596802
        weight[12] = 10.373730659485
        weight[13] = 6.6616044044495
        weight[14] = 10.260489463806
        weight[15] = 10.287888526917
        weight[16] = 10.289801597595
        weight[17] = 10.405355453491
        weight[18] = 10.138095855713
        weight[19] = 0  # void class contributes nothing to the loss

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)  # 1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)  # 1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train', 50)
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val', 100)
    print(len(dataset_train))

    loader = DataLoader(dataset_train, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers,
                            batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()
    criterion = CrossEntropyLoss2d(weight)

    savedir = f'../save/{args.savedir}'
    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)):  # dont add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # TODO: reduce memory in first gpu:
    # https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    # https://github.com/pytorch/pytorch/issues/1893
    # optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)  ## scheduler 1
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),
                     eps=1e-08, weight_decay=1e-4)  ## scheduler 2

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        # FIX: removed stray ')' that was inside the format string.
        print("=> Loaded checkpoint at epoch {}".format(checkpoint['epoch']))

    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)  # set up scheduler  ## scheduler 1
    # Poly learning-rate decay over args.num_epochs.
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)  ## scheduler 2

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        # NOTE(review): scheduler.step(epoch) before optimizer.step() is the
        # pre-1.1 PyTorch ordering; kept to preserve the exact LR schedule.
        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal
        if (doIouTrain):
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # FIX: loss.item() yields the same Python scalar as the former
            # loss.cpu().detach().numpy().item() chain, without the numpy hop.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                # start_time_iou = time.time()
                iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)
                # print("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(NUM_CLASSES)

        for step, (images, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            # FIX: Variable(..., volatile=True) is a no-op on modern PyTorch;
            # torch.no_grad() is the supported way to skip autograd in eval.
            with torch.no_grad():
                inputs = Variable(images)
                targets = Variable(labels)
                outputs = model(inputs, only_encode=enc)
                loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.item())
            time_val.append(time.time() - start_time)

            # Add batch to calculate TP, FP and FN for iou estimation
            if (doIouVal):
                # start_time_iou = time.time()
                iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)
                # print("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'VAL target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        # scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1  # update lr if needed

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # remember best valIoU and save checkpoint; when IoU is disabled, fall
        # back to negated val loss so "higher is better" still holds.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # NOTE(review): this gates on `step` (the last val-batch index leaking
        # out of the loop), not on `epoch` — looks unintended but kept as-is;
        # confirm whether `epoch % args.epochs_save` was meant.
        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: (unknown) (epoch: {epoch})')
        if (is_best):
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        # SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        # Epoch     Train-loss      Test-loss       Train-IoU       Test-IoU        learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (
                epoch, average_epoch_loss_train, average_epoch_loss_val,
                iouTrain, iouVal, usedLr))

    return (model)  # return model (convenience for encoder-decoder training)
def train(args, model, enc=False):
    """Train `model` on a 2-class Cityscapes-style dataset and return it.

    Differences from the 19-class trainer: binary class weights, optional
    unweighted loss (args.weighted), an Upsample module that blows predictions
    back up to original-label resolution for IoU (x16 for encoder-only, x2
    otherwise), and loaders that also yield the original-resolution images and
    labels. Checkpoints/logs are written under args.savedir.
    """
    best_acc = 0

    # TODO: calculate weights by processing dataset histogram (now its being set
    # by hand from the torch values). Create a loader to run all images and
    # calculate histogram of labels, then build the weight array by class balancing.
    weight = torch.ones(NUM_CLASSES)
    if (enc):
        weight[0] = 4.38133159
        weight[1] = 1.29574148
    else:
        weight[0] = 4.40513628
        weight[1] = 1.293674

    # Upsample predictions to the original-label resolution for IoU: the
    # encoder output is 1/16 scale, the full network output is 1/2 scale.
    if (enc):
        up = torch.nn.Upsample(scale_factor=16, mode='bilinear')
    else:
        up = torch.nn.Upsample(scale_factor=2, mode='bilinear')
    if args.cuda:
        up = up.cuda()

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)  # 1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)  # 1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train, num_workers=args.num_workers,
                        batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers,
                            batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()
    if args.weighted:
        criterion = CrossEntropyLoss2d(weight)
    else:
        criterion = CrossEntropyLoss2d()
    print(type(criterion))

    savedir = args.savedir
    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)):  # dont add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # TODO: reduce memory in first gpu:
    # https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    # https://github.com/pytorch/pytorch/issues/1893
    # optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)  ## scheduler 1
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),
                     eps=1e-08, weight_decay=1e-4)  ## scheduler 2

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        # FIX: removed stray ')' that was inside the format string.
        print("=> Loaded checkpoint at epoch {}".format(checkpoint['epoch']))

    # scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)  # set up scheduler  ## scheduler 1
    # Poly learning-rate decay over args.num_epochs.
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)  ## scheduler 2

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        # NOTE(review): pre-1.1 PyTorch call order; kept to preserve the schedule.
        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal
        if (doIouTrain):
            iouEvalTrain = iouEval(NUM_CLASSES, args.ignoreindex)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels, images_orig, labels_orig) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # FIX: loss.data[0] raises on PyTorch >= 0.5 (indexing a 0-dim
            # tensor); loss.item() is the supported scalar extraction.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                # start_time_iou = time.time()
                # IoU is computed against the original-resolution labels.
                upsampledOutputs = up(outputs)
                iouEvalTrain.addBatch(upsampledOutputs.max(1)[1].unsqueeze(1).data, labels_orig)
                # print("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%", iou_classes)

        # Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(NUM_CLASSES, args.ignoreindex)

        for step, (images, labels, images_orig, labels_orig) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            # FIX: Variable(..., volatile=True) is a no-op on modern PyTorch;
            # torch.no_grad() is the supported way to skip autograd in eval.
            with torch.no_grad():
                inputs = Variable(images)
                targets = Variable(labels)
                outputs = model(inputs, only_encode=enc)
                loss = criterion(outputs, targets[:, 0])
                # FIX: loss.item() instead of loss.data[0] (see train loop).
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                # Add batch to calculate TP, FP and FN for iou estimation
                if (doIouVal):
                    # start_time_iou = time.time()
                    upsampledOutputs = up(outputs)
                    iouEvalVal.addBatch(upsampledOutputs.max(1)[1].unsqueeze(1).data, labels_orig)
                    # print("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'VAL target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        # scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1  # update lr if needed

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%", iou_classes)

        # remember best valIoU and save checkpoint; when IoU is disabled, fall
        # back to negated val loss so "higher is better" still holds.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # NOTE(review): gates on `step` (leaked val-batch index), not `epoch` —
        # looks unintended but kept as-is; confirm whether epoch % args.epochs_save was meant.
        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: (unknown) (epoch: {epoch})')
        if (is_best):
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        # SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        # Epoch     Train-loss      Test-loss       Train-IoU       Test-IoU        learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (
                epoch, average_epoch_loss_train, average_epoch_loss_val,
                iouTrain, iouVal, usedLr))

    return (model)  # return model (convenience for encoder-decoder training)
def main(args):
    """Evaluate a saved ERFNet on Cityscapes and print per-class and mean IoU.

    Loads the model architecture and weights from args.loadDir, runs it over
    the requested subset, accumulates the IoU confusion matrix, and prints a
    color-coded per-class table followed by the mean IoU. Relies on
    module-level ERFNet, NUM_CLASSES, cityscapes, the two cityscapes
    transforms, iouEval and getColorEntry.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights
    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = ERFNet(NUM_CLASSES)
    model = torch.nn.DataParallel(model)
    if (not args.cpu):
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        # custom function to load model when not all dict elements are present;
        # missing keys are reported and skipped rather than raising.
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                print(name, " not loaded")
                continue
            own_state[name].copy_(param)
        return model

    model = load_my_state_dict(model, torch.load(weightspath))
    print("Model and weights LOADED successfully")

    model.eval()

    # NOTE(review): execution continues after this message — confirm whether a
    # hard failure (raise/exit) was intended when the dataset dir is missing.
    if (not os.path.exists(args.datadir)):
        print("Error: datadir could not be loaded")

    loader = DataLoader(
        cityscapes(args.datadir, input_transform_cityscapes,
                   target_transform_cityscapes, subset=args.subset),
        num_workers=args.num_workers, batch_size=args.batch_size,
        shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()
    for step, (images, labels, filename, filenameGt) in enumerate(loader):
        if (not args.cpu):
            images = images.cuda()
            labels = labels.cuda()

        # FIX: Variable(..., volatile=True) is a no-op on modern PyTorch;
        # torch.no_grad() is the supported way to skip autograd at inference.
        with torch.no_grad():
            outputs = model(images)

        iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

        filenameSave = filename[0].split("leftImg8bit/")[1]
        print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = []
    for i in range(iou_classes.size(0)):
        iouStr = getColorEntry(iou_classes[i]) + '{:0.2f}'.format(
            iou_classes[i] * 100) + '\033[0m'
        iou_classes_str.append(iouStr)

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    # print("TOTAL IOU: ", iou * 100, "%")
    print("Per-Class IoU:")
    # Cityscapes train-class order; label capitalization kept exactly as the
    # original output produced it.
    class_names = ("Road", "sidewalk", "building", "wall", "fence", "pole",
                   "traffic light", "traffic sign", "vegetation", "terrain",
                   "sky", "person", "rider", "car", "truck", "bus", "train",
                   "motorcycle", "bicycle")
    for iou_str, cls_name in zip(iou_classes_str, class_names):
        print(iou_str, cls_name)
    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train(args, model_student, model_teacher, enc=False):
    """Mean-teacher training loop: the student is optimized directly, the teacher
    tracks the student via an exponential weight average, and a consistency loss
    couples their predictions on two augmented views of each sample.

    Returns ``(model_student, model_teacher)`` for convenience in
    encoder/decoder two-stage training.  Updates the global ``best_acc`` and
    writes checkpoints / tensorboard logs under ``../save/{args.savedir}``.
    """
    global best_acc

    weight = torch.ones(1)

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    # Set data loading variables
    co_transform = MyCoTransform(enc, augment=True, height=480)#1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=480)#1024)
    dataset_train = self_supervised_power(args.datadir, co_transform, 'train', file_format="csv", label_name="class", subsample=args.subsample)
    # dataset_train = self_supervised_power(args.datadir, None, 'train')
    dataset_val = self_supervised_power(args.datadir, None, 'val', file_format="csv", label_name="class", subsample=args.subsample)

    if args.force_n_classes > 0:
        color_transform_classes_prob = ColorizeClassesProb(args.force_n_classes)  # Automatic color based on max class probability
        color_transform_classes = ColorizeClasses(args.force_n_classes)  # Automatic color based on max class probability

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    # Set Loss functions.
    # NOTE(review): `criterion` is immediately overwritten by CrossEntropyLoss2d
    # below, so the L1 branch is currently dead code — kept as in the original.
    if args.force_n_classes > 0:
        criterion = L1LossClassProbMasked()  # L1 loss weighted with class prob with averaging over mini-batch
    else:
        criterion = L1LossMasked()
    criterion = CrossEntropyLoss2d()
    criterion_trav = L1LossTraversability()
    criterion_consistency = MSELossWeighted()
    criterion_val = CrossEntropyLoss2d()
    criterion_acc = ClassificationAccuracy()
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)):  #dont add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model_student))

    #TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4
    #https://github.com/pytorch/pytorch/issues/1893
    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4) ## scheduler 1
    optimizer = Adam(model_student.parameters(), LEARNING_RATE, BETAS, eps=OPT_EPS, weight_decay=WEIGHT_DECAY)
    if args.alternate_optimization:
        # FIX: the original referenced undefined `model` here (NameError); the
        # parameters being split into prob/power groups belong to the student,
        # which is the model this optimizer trains.
        params_prob = [param for name, param in model_student.named_parameters() if name != "module.class_power"]
        params_power = [param for name, param in model_student.named_parameters() if name == "module.class_power"]
        optimizer_prob = Adam(params_prob, LEARNING_RATE, BETAS, eps=OPT_EPS, weight_decay=WEIGHT_DECAY)
        optimizer_power = Adam(params_power, LEARNING_RATE, BETAS, eps=OPT_EPS, weight_decay=WEIGHT_DECAY)

    start_epoch = 1
    if args.resume:
        #Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        # Both nets restart from the same weights; the teacher diverges only
        # through the discounted averaging below.
        model_student.load_state_dict(checkpoint['state_dict'])
        model_teacher.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # Initialize teacher with same weights as student.
    copyWeightsToModelNoGrad(model_student, model_teacher)

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler     ## scheduler 1
    lambda1 = lambda epoch: pow((1-((epoch-1)/args.num_epochs)),0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)  ## scheduler 2
    if args.alternate_optimization:
        scheduler_prob = lr_scheduler.LambdaLR(optimizer_prob, lr_lambda=lambda1)  ## scheduler 2
        scheduler_power = lr_scheduler.LambdaLR(optimizer_power, lr_lambda=lambda1)  ## scheduler 2

    if args.visualize:
        board = Dashboard(args.port)
        writer = SummaryWriter()
        log_base_dir = writer.file_writer.get_logdir() + "/"
        print("Saving tensorboard log to: " + log_base_dir)

    total_steps_train = 0
    total_steps_val = 0

    # Figure out histogram plot indices.
    steps_hist = int(len(loader_val)/NUM_HISTOGRAMS)
    steps_img_train = int(len(loader)/(NUM_IMG_PER_EPOCH-1))
    if steps_img_train == 0:
        steps_img_train = 1
    steps_img_val = int(len(loader_val)/(NUM_IMG_PER_EPOCH-1))
    if steps_img_val == 0:
        steps_img_val = 1

    hist_bins = np.arange(-0.5, args.force_n_classes+0.5, 1.0)

    for epoch in range(start_epoch, args.num_epochs+1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        # Consistency weight ramps up linearly over the first MAX_CONSISTENCY_EPOCH epochs.
        if epoch < MAX_CONSISTENCY_EPOCH:
            cur_consistency_weight = epoch / MAX_CONSISTENCY_EPOCH
        else:
            cur_consistency_weight = 1.0
        if args.no_mean_teacher:
            cur_consistency_weight = 0.0

        # Alternate which parameter group is stepped on even/odd epochs.
        if args.alternate_optimization:
            if epoch % 2 == 0:
                scheduler_power.step(epoch)
            else:
                scheduler_prob.step(epoch)
        else:
            scheduler.step(epoch)  ## scheduler 2

        average_loss_student_val = 0
        average_loss_teacher_val = 0

        epoch_loss_student = []
        epoch_loss_teacher = []
        epoch_acc_student = []
        epoch_acc_teacher = []
        epoch_loss_trav_student = []
        epoch_loss_trav_teacher = []
        epoch_loss_consistency = []
        time_train = []
        time_load = []
        time_iter = [0.0]

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model_student.train()
        model_teacher.train()
        start_time = time.time()
        for step, (images1, images2, labels) in enumerate(loader):
            time_load.append(time.time() - start_time)
            start_time = time.time()
            #print (labels.size())
            #print (np.unique(labels.numpy()))
            #print("labels: ", np.unique(labels[0].numpy()))
            #labels = torch.ones(4, 1, 512, 1024).long()
            if args.cuda:
                images1 = images1.cuda()
                images2 = images2.cuda()
                labels = labels.cuda()

            inputs1 = Variable(images1)
            inputs2 = Variable(images2)
            targets = Variable(labels)

            if (args.force_n_classes) > 0:
                # Forced into discrete classes: nets also predict per-class power.
                output_student_prob, output_student_trav, output_student_power = model_student(inputs1, only_encode=enc)
                output_teacher_prob, output_teacher_trav, output_teacher_power = model_teacher(inputs2, only_encode=enc)
                if args.alternate_optimization:
                    if epoch % 2 == 0:
                        optimizer_power.zero_grad()
                    else:
                        optimizer_prob.zero_grad()
                else:
                    optimizer.zero_grad()
                loss_student_pred = criterion(output_student_prob, targets)
                loss_teacher_pred = criterion(output_teacher_prob, targets)
                loss_consistency = criterion_consistency(output_student_prob, output_teacher_prob, cur_consistency_weight)
                acc_student = criterion_acc(output_student_prob, targets)
                acc_teacher = criterion_acc(output_teacher_prob, targets)
            else:
                # Straight regression
                output_student, output_student_trav = model_student(inputs1, only_encode=enc)
                output_teacher, output_teacher_trav = model_teacher(inputs2, only_encode=enc)
                optimizer.zero_grad()
                loss_student_pred = criterion(output_student, targets)
                loss_teacher_pred = criterion(output_teacher, targets)
                loss_consistency = criterion_consistency(output_student, output_teacher, cur_consistency_weight)

            # Loss independent of how scalar value is determined
            loss_student_trav = criterion_trav(output_student_trav, targets)
            loss_teacher_trav = criterion_trav(output_teacher_trav, targets)
            #print("targets", np.unique(targets[:, 0].cpu().data.numpy()))

            # Do backward pass.  Only the student receives gradients; the
            # consistency term is skipped entirely when mean-teacher is disabled.
            loss_student_pred.backward(retain_graph=True)
            if epoch > 0 and not args.no_mean_teacher:
                loss_student_trav.backward(retain_graph=True)
                loss_consistency.backward()
            else:
                loss_student_trav.backward()
            if args.alternate_optimization:
                if epoch % 2 == 0:
                    optimizer_power.step()
                else:
                    optimizer_prob.step()
            else:
                optimizer.step()

            # Teacher <- discounted average of student weights (higher discount
            # early on, see DISCOUNT_RATE_START_EPOCH).
            if epoch < DISCOUNT_RATE_START_EPOCH:
                cur_discount_rate = DISCOUNT_RATE_START
            else:
                cur_discount_rate = DISCOUNT_RATE
            copyWeightsToModelWithDiscount(model_student, model_teacher, cur_discount_rate)
            # copyWeightsToModelWithDiscount(model_student, model_teacher, DISCOUNT_RATE)

            epoch_loss_student.append(loss_student_pred.data.item())
            epoch_loss_teacher.append(loss_teacher_pred.data.item())
            epoch_loss_trav_student.append(loss_student_trav.data.item())
            epoch_loss_trav_teacher.append(loss_teacher_trav.data.item())
            epoch_loss_consistency.append(loss_consistency.data.item())
            if (args.force_n_classes) > 0:
                epoch_acc_student.append(acc_student.data.item())
                epoch_acc_teacher.append(acc_teacher.data.item())
            time_train.append(time.time() - start_time)

            # if (doIouTrain):
            #     #start_time_iou = time.time()
            #     iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)
            #     #print ("Time to add confusion matrix: ", time.time() - start_time_iou)

            #print(outputs.size())
            if args.visualize and step % steps_img_train == 0:
                step_vis_no = total_steps_train + len(epoch_loss_student)

                # Figure out and compute tensor to visualize.
                if args.force_n_classes > 0:
                    # Compute weighted power consumption
                    sum_dim = output_student_prob.dim()-3
                    # weighted_sum_output = (output_student_prob * output_student_power).sum(dim=sum_dim, keepdim=True)
                    if (isinstance(output_student_prob, list)):
                        max_prob, vis_output = getMaxProbValue(output_student_prob[0][0].cpu().data, output_student_power[0][0].cpu().data)
                        max_prob_teacher, vis_output_teacher = getMaxProbValue(output_teacher_prob[0][0].cpu().data, output_teacher_power[0][0].cpu().data)
                        writer.add_image("train/2_classes", color_transform_classes_prob(output_student_prob[0][0].cpu().data), step_vis_no)
                        writer.add_image("train/3_max_class_probability", max_prob[0][0], step_vis_no)
                        # writer.add_image("train/4_weighted_output", color_transform_output(weighted_sum_output[0][0].cpu().data), step_vis_no)
                    else:
                        max_prob, vis_output = getMaxProbValue(output_student_prob[0].cpu().data, output_student_power[0].cpu().data)
                        max_prob_teacher, vis_output_teacher = getMaxProbValue(output_teacher_prob[0].cpu().data, output_teacher_power[0].cpu().data)
                        writer.add_image("train/2_classes", color_transform_classes_prob(output_student_prob[0].cpu().data), step_vis_no)
                        writer.add_image("train/3_max_class_probability", max_prob[0], step_vis_no)
                        # writer.add_image("train/4_weighted_output", color_transform_output(weighted_sum_output[0].cpu().data), step_vis_no)
                else:
                    if (isinstance(output_teacher, list)):
                        vis_output = output_student[0][0].cpu().data
                        vis_output_teacher = output_teacher[0][0].cpu().data
                    else:
                        vis_output = output_student[0].cpu().data
                        vis_output_teacher = output_teacher[0].cpu().data

                if (isinstance(output_teacher_trav, list)):
                    trav_output = output_student_trav[0][0].cpu().data
                    trav_output_teacher = output_teacher_trav[0][0].cpu().data
                else:
                    trav_output = output_student_trav[0].cpu().data
                    trav_output_teacher = output_teacher_trav[0].cpu().data

                start_time_plot = time.time()
                image1 = inputs1[0].cpu().data
                image2 = inputs2[0].cpu().data
                # board.image(image, f'input (epoch: {epoch}, step: {step})')
                writer.add_image("train/1_input_student", image1, step_vis_no)
                writer.add_image("train/1_input_teacher", image2, step_vis_no)
                # writer.add_image("train/5_output_student", color_transform_output(vis_output), step_vis_no)
                # writer.add_image("train/5_output_teacher", color_transform_output(vis_output_teacher), step_vis_no)
                writer.add_image("train/7_output_trav_student", trav_output, step_vis_no)
                writer.add_image("train/7_output_trav_teacher", trav_output_teacher, step_vis_no)
                # board.image(color_transform_target(targets[0].cpu().data),
                #    f'target (epoch: {epoch}, step: {step})')
                # NOTE(review): color_transform_classes is only defined when
                # force_n_classes > 0; with visualize on and force_n_classes == 0
                # this line would raise — preserved from the original, confirm intent.
                writer.add_image("train/6_target", color_transform_classes(targets.cpu().data), step_vis_no)

                # Visualize graph.
                writer.add_graph(model_teacher, inputs2)
                print ("Time for visualization: ", time.time() - start_time_plot)

                len_epoch_loss = len(epoch_loss_student)
                for ind, val in enumerate(epoch_loss_student):
                    writer.add_scalar("train/instant_loss_student", val, total_steps_train + ind)
                for ind, val in enumerate(epoch_loss_teacher):
                    writer.add_scalar("train/instant_loss_teacher", val, total_steps_train + ind)
                for ind, val in enumerate(epoch_loss_trav_student):
                    writer.add_scalar("train/instant_loss_trav_student", val, total_steps_train + ind)
                for ind, val in enumerate(epoch_loss_trav_teacher):
                    writer.add_scalar("train/instant_loss_trav_teacher", val, total_steps_train + ind)
                for ind, val in enumerate(epoch_loss_consistency):
                    writer.add_scalar("train/instant_loss_consistency", val, total_steps_train + ind)
                if (args.force_n_classes) > 0:
                    for ind, val in enumerate(epoch_acc_student):
                        writer.add_scalar("train/instant_acc_student", val, total_steps_train + ind)
                    for ind, val in enumerate(epoch_acc_teacher):
                        writer.add_scalar("train/instant_acc_teacher", val, total_steps_train + ind)
                total_steps_train += len_epoch_loss
                avg_loss_teacher = sum(epoch_loss_teacher)/len(epoch_loss_teacher)
                writer.add_scalar("train/epoch_loss_student", sum(epoch_loss_student)/len(epoch_loss_student), total_steps_train)
                writer.add_scalar("train/epoch_loss_teacher", avg_loss_teacher, total_steps_train)
                writer.add_scalar("train/epoch_loss_trav_student", sum(epoch_loss_trav_student)/len(epoch_loss_trav_student), total_steps_train)
                writer.add_scalar("train/epoch_loss_trav_teacher", sum(epoch_loss_trav_teacher)/len(epoch_loss_trav_teacher), total_steps_train)
                writer.add_scalar("train/epoch_loss_consistency", sum(epoch_loss_consistency)/len(epoch_loss_consistency), total_steps_train)
                if (args.force_n_classes) > 0:
                    writer.add_scalar("train/epoch_acc_student", sum(epoch_acc_student)/len(epoch_acc_student), total_steps_train)
                    writer.add_scalar("train/epoch_acc_teacher", sum(epoch_acc_teacher)/len(epoch_acc_teacher), total_steps_train)

                # Output class power costs
                power_dict = {}
                if args.force_n_classes > 0:
                    for ind, val in enumerate(output_teacher_power.squeeze()):
                        power_dict[str(ind)] = val
                    writer.add_scalars("params/class_cost", power_dict, total_steps_train)

                # Clear loss for next loss print iteration.
                epoch_loss_student = []
                epoch_loss_teacher = []
                epoch_loss_consistency = []
                epoch_loss_trav_student = []
                epoch_loss_trav_teacher = []
                epoch_acc_student = []
                epoch_acc_teacher = []

                # Print current loss.
                print(f'loss: {avg_loss_teacher:0.4} (epoch: {epoch}, step: {step})',
                      "// Train: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size),
                      "// Load: %.4f s" % (sum(time_load) / len(time_load) / args.batch_size),
                      "// Iter: %.4f s" % (sum(time_iter) / len(time_iter) / args.batch_size))

            if step == 0:
                time_iter.clear()
            time_iter.append(time.time() - start_time)
            # Save time for image loading duration.
            start_time = time.time()

        # NOTE(review): avg_loss_teacher is only assigned inside the visualize
        # block above (which always fires at step 0 when args.visualize is set);
        # with visualization disabled this line raises NameError — preserved
        # from the original, confirm intended usage.
        average_epoch_loss_train = avg_loss_teacher

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain)+'{:0.2f}'.format(iouTrain*100) + '\033[0m'
            print ("EPOCH IoU on TRAIN set: ", iouStr, "%")

        #Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model_student.eval()
        model_teacher.eval()
        epoch_loss_student_val = []
        epoch_loss_teacher_val = []
        epoch_acc_student_val = []
        epoch_acc_teacher_val = []
        epoch_loss_trav_student_val = []
        epoch_loss_trav_teacher_val = []
        time_val = []

        for step, (images1, images2, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images1 = images1.cuda()
                images2 = images2.cuda()
                labels = labels.cuda()

            inputs1 = Variable(images1, volatile=True)  #volatile flag makes it free backward or outputs for eval
            inputs2 = Variable(images2, volatile=True)  #volatile flag makes it free backward or outputs for eval
            targets = Variable(labels, volatile=True)
            if args.force_n_classes:
                output_student_prob, output_student_trav, output_student_power = model_student(inputs1, only_encode=enc)
                output_teacher_prob, output_teacher_trav, output_teacher_power = model_teacher(inputs2, only_encode=enc)
                max_prob, output_student = getMaxProbValue(output_student_prob, output_student_power)
                max_prob, output_teacher = getMaxProbValue(output_teacher_prob, output_teacher_power)
                # Compute weighted power consumption
                sum_dim = output_student_prob.dim()-3
                # weighted_sum_output = (output_student_prob * output_student_power).sum(dim=sum_dim, keepdim=True)
                # FIX: the prob outputs only exist in this branch; the original
                # evaluated criterion_val on output_*_prob unconditionally, which
                # raised NameError whenever force_n_classes == 0.
                loss_student = criterion_val(output_student_prob, targets)
                loss_teacher = criterion_val(output_teacher_prob, targets)
            else:
                output_student, output_student_trav = model_student(inputs1, only_encode=enc)
                output_teacher, output_teacher_trav = model_teacher(inputs2, only_encode=enc)
                loss_student = criterion_val(output_student, targets)
                loss_teacher = criterion_val(output_teacher, targets)

            loss_student_trav = criterion_trav(output_student_trav, targets)
            loss_teacher_trav = criterion_trav(output_teacher_trav, targets)

            epoch_loss_student_val.append(loss_student.data.item())
            epoch_loss_teacher_val.append(loss_teacher.data.item())
            epoch_loss_trav_student_val.append(loss_student_trav.data.item())
            epoch_loss_trav_teacher_val.append(loss_teacher_trav.data.item())
            if args.force_n_classes:
                acc_student = criterion_acc(output_student_prob, targets)
                acc_teacher = criterion_acc(output_teacher_prob, targets)
                epoch_acc_student_val.append(acc_student.data.item())
                epoch_acc_teacher_val.append(acc_teacher.data.item())
            time_val.append(time.time() - start_time)

            #Add batch to calculate TP, FP and FN for iou estimation
            # if (doIouVal):
            #     #start_time_iou = time.time()
            #     iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)
            #     #print ("Time to add confusion matrix: ", time.time() - start_time_iou)

            # Plot images
            if args.visualize and step % steps_img_val == 0:
                if (isinstance(output_teacher_trav, list)):
                    trav_output = output_student_trav[0][0].cpu().data
                    trav_output_teacher = output_teacher_trav[0][0].cpu().data
                else:
                    trav_output = output_student_trav[0].cpu().data
                    trav_output_teacher = output_teacher_trav[0].cpu().data

                step_vis_no = total_steps_val + len(epoch_loss_student_val)
                start_time_plot = time.time()
                image1 = inputs1[0].cpu().data
                image2 = inputs2[0].cpu().data
                # board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                writer.add_image("val/1_input_student", image1, step_vis_no)
                writer.add_image("val/1_input_teacher", image2, step_vis_no)
                if isinstance(output_teacher, list):  #merge gpu tensors
                    # board.image(color_transform_output(outputs[0][0].cpu().data),
                    #    f'VAL output (epoch: {epoch}, step: {step})')
                    # writer.add_image("val/5_output_teacher", color_transform_output(output_teacher[0][0].cpu().data), step_vis_no)
                    # writer.add_image("val/5_output_student", color_transform_output(output_student[0][0].cpu().data), step_vis_no)
                    if args.force_n_classes > 0:
                        writer.add_image("val/2_classes", color_transform_classes_prob(output_teacher_prob[0][0].cpu().data), step_vis_no)
                        writer.add_image("val/3_max_class_probability", max_prob[0][0], step_vis_no)
                        # writer.add_image("val/4_weighted_output", color_transform_output(weighted_sum_output[0][0].cpu().data), step_vis_no)
                else:
                    # board.image(color_transform_output(outputs[0].cpu().data),
                    #    f'VAL output (epoch: {epoch}, step: {step})')
                    # writer.add_image("val/5_output_teacher", color_transform_output(output_teacher[0].cpu().data), step_vis_no)
                    # writer.add_image("val/5_output_student", color_transform_output(output_student[0].cpu().data), step_vis_no)
                    if args.force_n_classes > 0:
                        writer.add_image("val/2_classes", color_transform_classes_prob(output_teacher_prob[0].cpu().data), step_vis_no)
                        writer.add_image("val/3_max_class_probability", max_prob[0], step_vis_no)
                        # writer.add_image("val/4_weighted_output", color_transform_output(weighted_sum_output[0].cpu().data), step_vis_no)
                # board.image(color_transform_target(targets[0].cpu().data),
                #    f'VAL target (epoch: {epoch}, step: {step})')
                writer.add_image("val/7_output_trav_student", trav_output, step_vis_no)
                writer.add_image("val/7_output_trav_teacher", trav_output_teacher, step_vis_no)
                writer.add_image("val/6_target", color_transform_classes(targets.cpu().data), step_vis_no)
                print ("Time to paint images: ", time.time() - start_time_plot)

            # Plot histograms
            if args.force_n_classes > 0 and args.visualize and steps_hist > 0 and step % steps_hist == 0:
                image1 = inputs1[0].cpu().data+0.5  # +0.5 to remove zero-mean normalization
                image2 = inputs2[0].cpu().data+0.5
                hist_ind = int(step / steps_hist)
                if (isinstance(output_teacher_prob, list)):
                    _, hist_array = output_teacher_prob[0][0].cpu().data.max(dim=0, keepdim=True)
                else:
                    _, hist_array = output_teacher_prob[0].cpu().data.max(dim=0, keepdim=True)
                # Use train steps so we can compare with class power plot
                writer.add_histogram("val/hist_"+str(hist_ind), hist_array.numpy().flatten(), total_steps_train, hist_bins)
                if isinstance(output_teacher, list):
                    writer.add_image("val/classes_"+str(hist_ind), color_transform_classes_prob(output_teacher_prob[0][0].cpu().data), total_steps_train)
                else:
                    writer.add_image("val/classes_"+str(hist_ind), color_transform_classes_prob(output_teacher_prob[0].cpu().data), total_steps_train)
                if epoch == start_epoch:
                    writer.add_image("val/hist/input_"+str(hist_ind), image2, total_steps_train)  # Visualize image used to compute histogram

        total_steps_val += len(epoch_loss_student_val)
        avg_loss_teacher_val = sum(epoch_loss_teacher_val) / len(epoch_loss_teacher_val)
        print(f'VAL loss_teacher: {avg_loss_teacher_val:0.4} (epoch: {epoch}, step: {total_steps_val})',
              "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))
        # NOTE(review): these writer calls run regardless of args.visualize while
        # `writer` is only created when visualization is on — preserved as-is.
        writer.add_scalar("val/epoch_loss_student", sum(epoch_loss_student_val) / len(epoch_loss_student_val), total_steps_val)
        writer.add_scalar("val/epoch_loss_teacher", avg_loss_teacher_val, total_steps_val)
        writer.add_scalar("val/epoch_loss_trav_student", sum(epoch_loss_trav_student_val) / len(epoch_loss_trav_student_val), total_steps_val)
        writer.add_scalar("val/epoch_loss_trav_teacher", sum(epoch_loss_trav_teacher_val) / len(epoch_loss_trav_teacher_val), total_steps_val)
        if args.force_n_classes:
            writer.add_scalar("val/epoch_acc_student", sum(epoch_acc_student_val) / len(epoch_acc_student_val), total_steps_val)
            writer.add_scalar("val/epoch_acc_teacher", sum(epoch_acc_teacher_val) / len(epoch_acc_teacher_val), total_steps_val)

        epoch_loss_student_val = []
        epoch_loss_teacher_val = []
        epoch_acc_student_val = []
        epoch_acc_teacher_val = []
        epoch_loss_trav_student_val = []
        epoch_loss_trav_teacher_val = []

        average_epoch_loss_val = avg_loss_teacher_val
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1   # update lr if needed

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal)+'{:0.2f}'.format(iouVal*100) + '\033[0m'
            print ("EPOCH IoU on VAL set: ", iouStr, "%")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model_teacher),
            'state_dict': model_teacher.state_dict(),
            'best_acc': best_acc,
            'optimizer' : optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model_teacher.state_dict(), filename)
            print(f'save: (unknown) (epoch: {epoch})')
        if (is_best):
            torch.save(model_teacher.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss	Train-IoU	Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr ))

    return(model_student, model_teacher)  #return model (convenience for encoder-decoder training)
def train(args, model, enc=False):
    """Train `model` on cityscapes-style data with a class-weighted FocalLoss2d.

    Runs the full epoch loop (train + validate), prints per-class IoU/accuracy
    when ``args.iouVal`` is set, maintains the global ``best_acc``, and writes
    checkpoints and a tab-separated log under ``../save/{args.savedir}``.
    Returns the trained ``model`` (convenience for encoder-decoder training).
    """
    global best_acc

    # Per-class loss weights (inverse-frequency style); index 27 is ignored.
    # Classes beyond the listed 28 (if NUM_CLASSES is larger) keep weight 1.
    class_weights = [
        121.21, 947.02, 151.92, 428.31, 25.88, 235.97, 885.72, 911.87,
        307.49, 204.69, 813.92, 5.83, 34.22, 453.34, 346.10, 250.19,
        119.99, 75.28, 76.71, 8.58, 281.68, 924.07, 3.91, 7.14,
        88.89, 59.00, 126.59, 0,
    ]
    weight = torch.ones(NUM_CLASSES)
    for i, w in enumerate(class_weights):
        weight[i] = w

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)  #1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)  #1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        #criterion = LovaszLoss2d()
        #criterion = CrossEntropyLoss2d(weight.cuda())
        criterion = FocalLoss2d(weight.cuda())
    else:
        #criterion = LovaszLoss2d()
        #criterion = CrossEntropyLoss2d(weight)
        # FIX: the CPU branch previously called weight.cuda(), which crashes on
        # machines without CUDA; keep the weights on the CPU here.
        criterion = FocalLoss2d(weight)
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)):  #dont add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)  ## scheduler 1
    optimizer = Adam(model.parameters(),
                     1e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)  ## scheduler 2

    start_epoch = 1

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler  ## scheduler 1
    # Polynomial learning-rate decay over args.num_epochs.
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)  ## scheduler 2

    # Class names in label-index order, shared by the IoU and accuracy reports.
    class_names = ("pole", "slight", "bboard", "tlight", "car", "truck",
                   "bicycle", "motor", "bus", "tsignf", "tsignb", "road",
                   "sidewalk", "curbcut", "crosspln", "bikelane", "curb",
                   "fence", "wall", "building", "person", "rider", "sky",
                   "vege", "terrain", "markings", "crosszeb")

    time_train_perepoch = []
    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        start_time_perepoch = time.time()

        scheduler.step(epoch)  ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()

            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            #inputs = images
            #targets= labels
            inputs = Variable(images)
            targets = Variable(labels)

            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            # Targets carry the class index in channel 0.
            loss = criterion(outputs, targets[:, 0])
            #loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            # FIX: loss.data[0] raises on PyTorch >= 0.5 (0-dim tensor); .item()
            # is the supported scalar accessor and matches usage elsewhere in
            # this file.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        #evalIoU.printConfMatrix(confMatrix, evalIoU.args)
        time_train_perepoch.append(time.time() - start_time_perepoch)
        print("// Time per epoch: %.4f hours" %
              (sum(time_train_perepoch) / len(time_train_perepoch) / 3600.0))

        #Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(NUM_CLASSES)

        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                #inputs =images
                #targets=labels
                inputs = Variable(images, requires_grad=False)  #, volatile=True)
                targets = Variable(labels, requires_grad=False)  #, volatile=True)

                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                #Add batch to calculate TP, FP and FN for iou estimation
                if (doIouVal):
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(
                        f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                        "// Avg time/img: %.4f s" %
                        (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1  # update lr if needed

        # Calculate IOU scores on class level from matrix
        iouVal = 0
        iouTrain = 0
        if (doIouVal):
            iouVal, iou_classes, accVal, acc_classes = iouEvalVal.getIoU()
            for i, name in enumerate(class_names):
                print("%-8s: %.6f" % (name, iou_classes[i] * 100.0), "%\t")
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")
            for i, name in enumerate(class_names):
                print("%-8s: %.6f" % (name, acc_classes[i] * 100.0), "%\t")
            accStr = getColorEntry(accVal) + '{:0.2f}'.format(
                accVal * 100) + '\033[0m'
            print("EPOCH ACC on VAL set: ", accStr, "%")

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            # Without IoU, fall back to the val loss (note: higher loss then
            # counts as "better" through the comparison below — original behavior).
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        # Reset the best score when the encoder-only stage ends so the decoder
        # stage starts its own comparison from scratch.
        if (enc and epoch == args.num_epochs):
            best_acc = 0

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth'
            filenameBest = savedir + '/model_best_enc.pth'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth'
            filenameBest = savedir + '/model_best.pth'
        save_checkpoint({
            'state_dict': model.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best_each.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best_each.pth'

        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: (unknown) (epoch: {epoch})')

        #if (True) #(is_best):
        # Saved unconditionally every epoch (original behavior, see commented guard).
        torch.save(model.state_dict(), filenamebest)
        print(f'save: {filenamebest} (epoch: {epoch})')
        filenameSuperBest = f'{savedir}/model_superbest.pth'
        if (is_best):
            torch.save(model.state_dict(), filenameSuperBest)
            print(f'saving superbest')
        if (not enc):
            with open(savedir + "/best.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))
        else:
            with open(savedir + "/best_encoder.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss	Train-IoU	Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return (model)  #return model (convenience for encoder-decoder training)
def train(savedir, model, dataloader_train, dataloader_eval, criterion, optimizer, args, enc=False):
    """Train `model` (full net, or encoder only when `enc=True`) with per-epoch validation.

    Logs losses to TensorBoard and to text files under `savedir`, supports
    resuming from a checkpoint, tracks the best model by *minimum validation
    loss*, and saves checkpoints / best weights each epoch.

    Depends on module-level names: SummaryWriter, lr_scheduler, Variable,
    Dashboard, iouEval, mean_and_var, color_transform, getColorEntry,
    save_checkpoint, os, time, torch.

    Returns the trained model (convenience for encoder-decoder training).
    """
    # Best-so-far is tracked as the lowest average validation loss.
    min_loss = float('inf')

    # use tensorboard
    writer = SummaryWriter(log_dir=savedir)

    # Separate log/description files for encoder-only vs full training runs.
    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)
        ):  #dont add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    start_epoch = 1
    if args.resume:
        #Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(
            filenameCheckpoint
        ), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # NOTE(review): `best_acc` is restored here but this function tracks
        # `min_loss`, which is NOT restored — after a resume the first epoch
        # always looks like a new best. Confirm whether that is intended.
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler     ## scheduler 1
    # Polynomial LR decay (power 0.9) over the full training run.
    lambda1 = lambda epoch: pow(
        (1 - ((epoch - 1) / args.num_epochs)), 0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer,
                                      lr_lambda=lambda1)  ## scheduler 2

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        # NOTE(review): passing an epoch to `scheduler.step(epoch)` and calling
        # it before the epoch's optimizer steps is deprecated in PyTorch >= 1.1.
        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if (doIouTrain):
            iouEvalTrain = iouEval(mean_and_var)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels, _) in enumerate(dataloader_train):

            start_time = time.time()
            #print (labels.size())
            #print (np.unique(labels.numpy()))
            #print("labels: ", np.unique(labels[0].numpy()))
            #labels = torch.ones(4, 1, 512, 1024).long()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            #print("image: ", images.size())
            #print("labels: ", labels.size())
            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            # print("output: ", outputs.size()) #TODO
            # print("targets", np.unique(targets[:, 0].cpu().data.numpy()))

            optimizer.zero_grad()
            # Labels are stored as (N, 1, H, W); take channel 0 for CE loss.
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # NOTE(review): appending the loss *tensor* keeps each step's
            # autograd graph alive for the whole epoch; `loss.item()` would
            # store a plain float (as the validation loop below almost does).
            epoch_loss.append(loss)
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                #start_time_iou = time.time()
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)
                #print ("Time to add confusion matrix: ", time.time() - start_time_iou)

            #print(outputs.size())
            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                #image[0] = image[0] * .229 + .485
                #image[1] = image[1] * .224 + .456
                #image[2] = image[2] * .225 + .406
                #print("output", np.unique(outputs[0].cpu().max(0)[1].data.numpy()))
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  #merge gpu tensors
                    board.image(
                        color_transform(
                            outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                        f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(
                        color_transform(
                            outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                        f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        writer.add_scalar('train_loss', average_epoch_loss_train, epoch)

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        #Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(mean_and_var)

        for step, (images, labels, _) in enumerate(dataloader_eval):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            optimizer.zero_grad()
            inputs = Variable(images)
            targets = Variable(labels)

            # Forward pass only: no_grad disables graph building during eval.
            with torch.no_grad():
                outputs = model(inputs, only_encode=enc)
                loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.data)
            time_val.append(time.time() - start_time)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(
                    f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1   # update lr if needed
        writer.add_scalar('eval_loss', average_epoch_loss_val, epoch)

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # "Best" here means lowest validation loss, not highest IoU.
        is_best = average_epoch_loss_val < min_loss
        min_loss = min(min_loss, average_epoch_loss_val)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'best_acc': min_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'

        # NOTE(review): this condition reuses `step` (the last batch index of
        # the validation loop) although the option is named `epochs_save` —
        # looks like it should test `epoch` instead; confirm intent.
        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: (unknown) (epoch: {epoch})')
        if (is_best):
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')

            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss		Train-IoU		Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    writer.close()
    torch.save(model.state_dict(), f'{savedir}/weight_final.pth')

    return (model)  #return model (convenience for encoder-decoder training)
def train(args, model, classNum, epochNum, encoderOnly=False):
    """Train an RGB-D segmentation network (RGB + depth/luminance streams).

    Input tensors are expected with 5 channels (split below as 3 RGB +
    2 depth/luminance — TODO confirm against the dataset transform). Tracks
    best validation accuracy (IoU, or val loss when IoU is disabled) and
    writes checkpoints and log files under `saveDir`.

    Depends on module-level names: MyCoTransform, cityscapes, DataLoader,
    optim, getClassWeight, CrossEntropyLoss2d, iouEval, getColorEntry,
    save_checkpoint, os, time, torch.

    Returns the trained model (convenience for encoder-decoder training).
    """
    start_epoch = 1
    best_acc = 0

    # === Dataset Processing === #
    # NOTE(review): `saveDir`, `dataset_train` and `dataset_val` are only
    # defined in this branch — any other `args.dataset` value raises
    # NameError at the DataLoader below.
    if args.dataset == 'cityscapes':
        co_transform = MyCoTransform(encoderOnly,
                                     dataAugment=True,
                                     height=args.height)
        co_transform_val = MyCoTransform(encoderOnly,
                                         dataAugment=False,
                                         height=args.height)
        dataDir = '/media/commlab/TenTB/swhung/SegNet/Cityscapes/'
        dataset_train = cityscapes(dataDir, co_transform, 'train')
        dataset_val = cityscapes(dataDir, co_transform_val, 'val')
        saveDir = f'../save/{args.saveDir}'
    # #
    loader_train = DataLoader(dataset_train,
                              num_workers=args.num_workers,
                              batch_size=args.batchSize,
                              shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batchSize,
                            shuffle=False)

    # === Optimization Setting === #
    # ** optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=1e-4)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=1e-4)

    # ** learing rate scheduler
    my_lambda = lambda epoch: pow((1 - ((epoch - 1) / epochNum)), 0.9)  # poly
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=my_lambda)

    # ** apply loss function
    # Class-balancing weights; label id 19 is excluded from the loss
    # (hard-coded ignore index).
    classWeight = getClassWeight(args.dataset, classNum)
    if args.cuda:
        classWeight = classWeight.cuda()
    criterion = CrossEntropyLoss2d(weight=classWeight, ignore_index=19)

    # === save information in .txt files === #
    if (encoderOnly):
        automated_log_path = saveDir + "/automated_log_encoder.txt"
        modeltxtpath = saveDir + "/model_txt_encoder.txt"
    else:
        automated_log_path = saveDir + "/automated_log.txt"
        modeltxtpath = saveDir + "/model_txt.txt"

    if (not os.path.exists(automated_log_path)
        ):  # do not add first line if it exists
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # === Training === #
    for epoch in range(start_epoch, epochNum + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        model.train()
        # NOTE(review): passing an epoch index to step() is deprecated in
        # PyTorch >= 1.1 (scheduler keeps its own counter).
        scheduler.step(epoch - 1)

        epoch_loss = []
        time_train = []

        if (args.doEvalTrain):
            iouEvalTrain = iouEval(classNum)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("learning rate: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        # ** training iteration
        # NOTE(review): `iter` and `slice` shadow Python builtins here.
        for iter, (images, labels) in enumerate(loader_train):
            start_time = time.time()

            # Split the 5-channel input into RGB (ch 0-2) and the extra
            # depth/luminance pair (ch 3-4).
            slice = torch.split(images, 1, 1)
            rgb = torch.cat((slice[0], slice[1], slice[2]), 1)
            d = torch.cat((slice[3], slice[4]), 1)  #depth and luminance

            # NOTE(review): `rgb_inputs`/`d_input`/`targets` are only defined
            # when args.cuda is set — CPU runs raise NameError below.
            if args.cuda:
                rgb_inputs = rgb.cuda()
                d_input = d.cuda()
                targets = labels.cuda()

            img_size = list(targets.size())[2:4]

            # run the model
            if args.onlyWholeNet:
                # NOTE(review): `inputs` is never defined in this function —
                # taking this branch raises NameError. Probable intent:
                # pass the full 5-channel `images` (or rgb+d). Confirm.
                outputs = model(inputs)
            else:
                outputs = model(rgb_inputs, d_input, only_encoder=encoderOnly)

            # run the back-propagation
            loss = criterion(outputs, targets[:, 0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if (args.doEvalTrain):
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            # print the training loss information
            if args.iter_loss > 0 and iter % args.iter_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, iter: {iter})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batchSize))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if (args.doEvalTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validation (and checkpointing) is skipped for epochs 11-69,
        # presumably to save time mid-training — TODO confirm.
        if epoch <= 10 or epoch >= 70:
            with torch.no_grad():
                # Validate on 500 val images after each epoch of training
                print("----- VALIDATING - EPOCH", epoch, "-----")
                model.eval()
                epoch_loss_val = []
                time_val = []

                if (args.doEvalVal):
                    iouEvalVal = iouEval(classNum)

                # ** valadation iteration
                for iter, (images, labels) in enumerate(loader_val):
                    start_time = time.time()

                    slice = torch.split(images, 1, 1)
                    rgb = torch.cat((slice[0], slice[1], slice[2]), 1)
                    d = torch.cat((slice[3], slice[4]),
                                  1)  #depth and luminance

                    if args.cuda:
                        rgb_inputs = rgb.cuda()
                        d_input = d.cuda()
                        targets = labels.cuda()

                    img_size = list(targets.size())[2:4]

                    # run the model
                    if args.onlyWholeNet:
                        # NOTE(review): same undefined `inputs` as in the
                        # training loop above.
                        outputs = model(inputs)
                    else:
                        outputs = model(rgb_inputs,
                                        d_input,
                                        only_encoder=encoderOnly)

                    loss = criterion(outputs, targets[:, 0])
                    epoch_loss_val.append(loss.item())
                    time_val.append(time.time() - start_time)

                    # Add batch to calculate TP, FP and FN for iou estimation
                    if (args.doEvalVal):
                        iouEvalVal.addBatch(
                            outputs.max(1)[1].unsqueeze(1).data, targets.data)

                    # print the valadation loss information
                    if args.iter_loss > 0 and iter % args.iter_loss == 0:
                        average = sum(epoch_loss_val) / len(epoch_loss_val)
                        print(
                            f'VAL loss: {average:0.4} (epoch: {epoch}, iter: {iter})',
                            "// Avg time/img: %.4f s" %
                            (sum(time_val) / len(time_val) / args.batchSize))

                average_epoch_loss_val = sum(epoch_loss_val) / len(
                    epoch_loss_val)

                # print epoch val IoU accuracy
                iouVal = 0
                if (args.doEvalVal):
                    iouVal, iou_classes = iouEvalVal.getIoU()
                    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                        iouVal * 100) + '\033[0m'
                    print("EPOCH IoU on VAL set: ", iouStr, "%")

                # remember best valIoU and save checkpoint
                # NOTE(review): when IoU is disabled, the raw val *loss* is
                # compared with `>` (higher loss wins) — the sibling trainer
                # below negates the loss first; confirm which is intended.
                if iouVal == 0:
                    current_acc = average_epoch_loss_val
                else:
                    current_acc = iouVal
                is_best = current_acc > best_acc
                best_acc = max(current_acc, best_acc)

                if encoderOnly:
                    filenameCheckpoint = saveDir + '/checkpoint_enc.pth.tar'
                    filenameBest = saveDir + '/model_best_encoder.pth.tar'
                else:
                    filenameCheckpoint = saveDir + '/checkpoint.pth.tar'
                    filenameBest = saveDir + '/model_best.pth.tar'
                save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'arch': str(model),
                        'state_dict': model.state_dict(),
                        'best_acc': best_acc,
                        'optimizer': optimizer.state_dict(),
                    }, is_best, filenameCheckpoint, filenameBest)

                if (encoderOnly):
                    filename = f'{saveDir}/model_encoder-{epoch:03}.pth'
                    filenamebest = f'{saveDir}/model_best_encoder.pth'
                else:
                    filename = f'{saveDir}/model-{epoch:03}.pth'
                    filenamebest = f'{saveDir}/model_best.pth'

                # save model after some epochs
                # NOTE(review): tests `iter` (last val batch index) although
                # the option is named `epochs_save` — confirm intent.
                if args.epochs_save > 0 and iter > 0 and iter % args.epochs_save == 0:
                    torch.save(model.state_dict(), filename)
                    print(f'save: (unknown) (epoch: {epoch})')

                # save the best model
                if (is_best):
                    torch.save(model.state_dict(), filenamebest)
                    print(f'save: {filenamebest} (epoch: {epoch})')

                    if (not encoderOnly):
                        with open(saveDir + "/best_IoU.txt", "w") as myfile:
                            myfile.write(
                                "Best epoch is %d, with Val-IoU= %.4f" %
                                (epoch, iouVal))
                    else:
                        with open(saveDir + "/best_IoU_encoder.txt",
                                  "w") as myfile:
                            myfile.write(
                                "Best epoch is %d, with Val-IoU= %.4f" %
                                (epoch, iouVal))

                # save information in .txt files
                #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
                #Epoch		Train-loss		Test-loss		Train-IoU		Test-IoU		learningRate
                with open(automated_log_path, "a") as myfile:
                    myfile.write(
                        "\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                        (epoch, average_epoch_loss_train,
                         average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model  # return model (convenience for encoder-decoder training)
def main(args):
    """Evaluate FSFNet on a CamVid subset and print per-class and mean IoU.

    Loads weights from ``args.loadDir + args.loadWeights`` (tolerating
    DataParallel ``module.`` prefixes), runs inference over the dataset at
    ``args.datadir`` for ``args.subset``, and prints colorized IoU figures.

    Depends on module-level names: FSFNet, NUM_CLASSES, DataLoader, camvid,
    input_transform_camvid, target_transform_camvid, iouEval, getColorEntry,
    os, time, torch.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights
    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = FSFNet(NUM_CLASSES)
    #model = torch.nn.DataParallel(model)
    if (not args.cpu):
        model = torch.nn.DataParallel(model).cuda()

    def load_my_state_dict(
            model, state_dict
    ):  #custom function to load model when not all dict elements
        # Copy matching tensors; strip a leading "module." (DataParallel)
        # prefix when needed; report and skip anything else.
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                if name.startswith("module."):
                    own_state[name.split("module.")[-1]].copy_(param)
                else:
                    print(name, " not loaded")
                    continue
            else:
                own_state[name].copy_(param)
        return model

    model = load_my_state_dict(
        model, torch.load(weightspath,
                          map_location=lambda storage, loc: storage))
    print("Model and weights LOADED successfully")

    model.eval()

    if (not os.path.exists(args.datadir)):
        print("Error: datadir could not be loaded")

    loader = DataLoader(camvid(args.datadir,
                               input_transform_camvid,
                               target_transform_camvid,
                               subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()

    # FIX: the original wrapped images in Variable(..., volatile=True).
    # The `volatile` flag is ignored on PyTorch >= 0.4, so evaluation was
    # silently building autograd graphs; torch.no_grad() is the supported
    # way to disable gradient tracking during inference.
    with torch.no_grad():
        for step, (images, labels, filename, filenameGt) in enumerate(loader):
            if (not args.cpu):
                images = images.cuda()
                labels = labels.cuda()

            outputs = model(images)

            iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

            filenameSave = filename[0].split("images/")[1]
            print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = []
    for i in range(iou_classes.size(0)):
        iouStr = getColorEntry(iou_classes[i]) + '{:0.2f}'.format(
            iou_classes[i] * 100) + '\033[0m'
        iou_classes_str.append(iouStr)

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    #print("TOTAL IOU: ", iou * 100, "%")
    print("Per-Class IoU:")
    # CamVid 11-class ordering used by this evaluator.
    class_names = ("Sky", "Building", "Pole", "Road", "Pavement", "Tree",
                   "SignSymbol", "Fence", "Car", "Pedestrian", "Bicyclist")
    for iou_str, class_name in zip(iou_classes_str, class_names):
        print(iou_str, class_name)
    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal *
                                                      100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train(args, rmodel, model, enc=False):
    """Run the restoration-feedback pipeline over the Cityscapes val set.

    Despite the name, this variant performs NO parameter updates: no
    `backward()` or `optimizer.step()` is ever called — it only runs the
    restoration model `rmodel` + segmentation model `model` with 3 feedback
    iterations per image and reports per-image / overall IoU.

    Depends on module-level names: classWeights, NUM_CLASSES, MyCoTransform,
    cityscapes, DataLoader, Adam, lr_scheduler, Variable, iouEval, CB_iFl,
    Colorize, ToPILImage, getColorEntry, gc, os, time, torch.

    Returns the (unmodified) segmentation model.
    """
    best_acc = 0

    # Class-balancing weights for the CB focal loss below.
    weight = classWeights(NUM_CLASSES)

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(augment=True, height=args.height)
    co_transform_val = MyCoTransform(augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    # NOTE(review): `loader` (train split) is built but never iterated here.
    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    # L1 loss between restored image and the clean target image.
    rcriterion = torch.nn.L1Loss()

    savedir = '/home/shyam.nandan/NewExp/F_erfnet_pytorch_ours_w_gt_v2_multiply/save/' + args.savedir  #change path

    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # Separate optimizers for segmentation (`model`) and restoration (`rmodel`);
    # neither is actually stepped in this function.
    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=2e-4)  ##
    roptimizer = Adam(rmodel.parameters(), 2e-4,
                      (0.9, 0.999))  ## restoration scheduler

    start_epoch = 1

    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30,
                                     gamma=0.5)  ## Restoration schedular

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step()  ## scheduler 2
        rscheduler.step()

        # NOTE(review): `epoch_loss`/`time_train`/`iouEvalTrain` are set up
        # but never used — there is no training loop in this variant.
        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if (doIouTrain):
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        rusedLr = 0
        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])
            rusedLr = float(param_group['lr'])

        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(NUM_CLASSES)

        for step, (timages, images, labels,
                   filename) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
                timages = timages.cuda()

            # NOTE(review): `volatile` is ignored on PyTorch >= 0.4 — this no
            # longer disables autograd; torch.no_grad() would be needed.
            inputs = Variable(
                timages, volatile=True
            )  #volatile flag makes it free backward or outputs for eval
            itargets = Variable(images, volatile=True)
            targets = Variable(labels, volatile=True)

            # First restoration pass without feedback (flag=0), then three
            # feedback iterations driven by the change in segmentation output.
            ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
            outs = model(ss_inputs, only_encode=enc)
            tminus_outs = outs.detach()
            tplus_outs = outs.detach()

            for num_feedback in range(3):
                optimizer.zero_grad()
                roptimizer.zero_grad()
                ss_inputs = rmodel(inputs,
                                   flag=1,
                                   r_fb1=(tplus_outs - tminus_outs),
                                   r_fb2=ss_inputs.detach())
                loss = rcriterion(ss_inputs, itargets)
                outs = model(ss_inputs.detach(), only_encode=enc)
                tminus_outs = tplus_outs
                tplus_outs = outs.detach()

            outputs = outs
            del outs, tminus_outs, tplus_outs
            gc.collect()

            Gamma = [0, 0, 0]
            Alpha = [1, 1, 1]
            loss = CB_iFl(outputs,
                          targets[:, 0],
                          weight,
                          gamma=Gamma[0],
                          alpha=Alpha[0])
            # NOTE(review): `loss.data[0]` indexes a 0-dim tensor — raises on
            # PyTorch >= 0.4; `loss.item()` is the modern equivalent.
            epoch_loss_val.append(loss.data[0])
            time_val.append(time.time() - start_time)

            if (doIouVal):
                #start_time_iou = time.time()
                # Per-image evaluator for the filename-tagged IoU, plus the
                # running evaluator over the whole split.
                iouEvalVal_img = iouEval(NUM_CLASSES)
                iouEvalVal_img.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)
                iouEvalVal.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)
                #print ("Time to add confusion matrix: ", time.time() - start_time_iou)
                label_color = Colorize()(
                    outputs[0].max(0)[1].byte().cpu().data.unsqueeze(0))
                label_save = ToPILImage()(label_color)
                filenameSave = '../save_color_restored_joint_afl_CBFL/' + filename[
                    0].split('/')[-2]
                im_iou, _ = iouEvalVal_img.getIoU()
                if not os.path.exists(filenameSave):
                    os.makedirs(filenameSave)
                #Uncomment to save output
                #label_save.save(filenameSave+ '/' + str(" %6.4f " %im_iou[0].data.numpy()) + '_' + filename[0].split('/')[-1])

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print('Val loss: ', average, 'Epoch: ', epoch, 'Step: ', step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print(iouVal, iou_classes, iouStr)

    return (model)
def train(args, rmodel, model, enc=False):
    """Jointly train restoration (`rmodel`) and segmentation (`model`) on Cityscapes.

    Each batch: restore the degraded image (`timages`) toward the clean image
    (`images`) with 3 feedback iterations (L1 loss on the restoration), then
    train the segmentation net on the restored inputs with a class-balanced
    focal loss (`CB_iFl`) summed over the three feedback outputs. Validates
    each epoch, tracks the best model (val IoU, or negated val loss when IoU
    is disabled) and writes checkpoints under `savedir`.

    Depends on module-level names: classWeights, NUM_CLASSES, MyCoTransform,
    cityscapes, DataLoader, Adam, lr_scheduler, Variable, iouEval, CB_iFl,
    getColorEntry, save_checkpoint, gc, os, time, torch.

    Returns the trained segmentation model.
    """
    best_acc = 0

    # Class-balancing weights for the CB focal loss.
    weight = classWeights(NUM_CLASSES)

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(augment=True, height=args.height)
    co_transform_val = MyCoTransform(augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    # L1 loss between restored image and clean target image.
    rcriterion = torch.nn.L1Loss()

    savedir = '/home/shyam.nandan/NewExp/final_code/save/' + args.savedir

    automated_log_path = savedir + "/automated_log.txt"
    modeltxtpath = savedir + "/model.txt"

    if (not os.path.exists(automated_log_path)):
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # Separate optimizers/schedulers for segmentation and restoration nets.
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))

    start_epoch = 1

    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        # NOTE(review): calling scheduler.step() before the epoch's optimizer
        # steps is the pre-1.1 PyTorch ordering; modern versions warn.
        scheduler.step()
        rscheduler.step()

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if (doIouTrain):
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        rusedLr = 0
        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])
            rusedLr = float(param_group['lr'])

        model.train()
        for step, (timages, images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
                timages = timages.cuda()

            inputs = Variable(timages)
            itargets = Variable(images)
            targets = Variable(labels)

            # First restoration pass without feedback (flag=0); then three
            # feedback iterations driven by the change in segmentation output.
            ss_inputs = rmodel(inputs, flag = 0, r_fb1 = 0, r_fb2 = 0)
            outs = model(ss_inputs, only_encode=enc)
            tminus_outs = outs.detach()
            tplus_outs = outs.detach()
            outputs = []
            for num_feedback in range(3):
                optimizer.zero_grad()
                roptimizer.zero_grad()

                ss_inputs = rmodel(inputs, flag= 1, r_fb1 = (tplus_outs - tminus_outs) , r_fb2 = ss_inputs.detach())
                # Restoration update (L1 to the clean image).
                loss = rcriterion(ss_inputs, itargets)
                loss.backward()
                roptimizer.step()

                optimizer.zero_grad()
                roptimizer.zero_grad()

                # Segmentation forward on the (detached) restored image.
                outs = model(ss_inputs.detach(),only_encode=enc)
                outputs.append(outs)
                tminus_outs = tplus_outs
                tplus_outs = outs.detach()

            del outs, tminus_outs, tplus_outs
            gc.collect()

            # Segmentation update: focal-loss terms over the three feedback
            # outputs with increasing gamma.
            loss = 0.0
            Gamma = [0, 0.1, 0.2]
            Alpha = [1, 1, 1]
            for i, o in enumerate(outputs):
                loss += CB_iFl(o, targets[:, 0], weight, gamma = Gamma[i], alpha = Alpha[i])

            loss.backward()
            optimizer.step()

            # NOTE(review): `loss.data[0]` indexes a 0-dim tensor — raises on
            # PyTorch >= 0.4; `loss.item()` is the modern equivalent.
            epoch_loss.append(loss.data[0])
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                # NOTE(review): `outputs` is a *list* here, so
                # `outputs.max(1)` raises AttributeError whenever
                # args.iouTrain is set — probably meant `outputs[-1]`.
                iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print('loss: ', average.data.cpu()[0], 'Epoch: ', epoch, 'Step: ', step)

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain)+'{:0.2f}'.format(iouTrain*100) + '\033[0m'
            print ("EPOCH IoU on TRAIN set: ", iouStr, "%")

        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(NUM_CLASSES)

        for step, (timages, images, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
                timages = timages.cuda()

            # NOTE(review): `volatile` is ignored on PyTorch >= 0.4 — autograd
            # stays on here; torch.no_grad() would be needed.
            inputs = Variable(timages, volatile=True)
            itargets = Variable(images, volatile=True)
            targets = Variable(labels, volatile=True)

            ss_inputs = rmodel(inputs, flag = 0, r_fb1 = 0, r_fb2 = 0)
            outs = model(ss_inputs, only_encode=enc)
            tminus_outs = outs.detach()
            tplus_outs = outs.detach()
            for num_feedback in range(3):
                optimizer.zero_grad()
                roptimizer.zero_grad()

                ss_inputs = rmodel(inputs, flag= 1, r_fb1 = (tplus_outs - tminus_outs) , r_fb2 = ss_inputs.detach())
                loss = rcriterion(ss_inputs, itargets)

                outs = model(ss_inputs.detach(),only_encode=enc)
                tminus_outs = tplus_outs
                tplus_outs = outs.detach()

            ##################################
            del ss_inputs, tplus_outs, tminus_outs
            outputs = outs

            # NOTE(review): `Gamma`/`Alpha` here are leftovers from the
            # training loop's scope — NameError if the train loader was empty.
            loss = CB_iFl(outputs, targets[:, 0], weight, gamma = Gamma[0], alpha = Alpha[0])
            epoch_loss_val.append(loss.data[0])
            time_val.append(time.time() - start_time)

            if (doIouVal):
                iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print('Val loss: ', average, 'Epoch: ', epoch, 'Step: ', step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal)+'{:0.2f}'.format(iouVal*100) + '\033[0m'
            print ("EPOCH IoU on VAL set: ", iouStr, "%")

        # remember best valIoU and save checkpoint
        # Negated loss so that "higher is better" holds in both branches.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        filenameBest = savedir + '/model_best.pth.tar'

        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer' : optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        # NOTE(review): missing f-string prefix — '{epoch:03}' is saved
        # literally in the path (compare the f-strings used by the sibling
        # trainers above); also the `epochs_save` test below reuses `step`
        # (last val batch index) rather than `epoch`.
        filename = savedir + '/model-{epoch:03}.pth'
        filenamebest = savedir + '/model_best.pth'

        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(filename, epoch)
        if (is_best):
            torch.save(model.state_dict(), filenamebest)
            torch.save(rmodel.state_dict(), savedir + '/rmodel_best.pth')
            print(filenamebest,epoch)
            with open(savedir + "/best.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss		Train-IoU		Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr ))

    return(model)