Example #1
0
def main():
    """Train the two-input ``erfnet`` model and dump qualitative predictions.

    The function is driven entirely by module-level configuration
    (``DATA_ROOT``, ``OUTPUT_DIR``, ``ENCODER_ONLY``, ``IMAGE_HEIGHT``,
    ``NUM_WORKERS``, ``BATCH_SIZE``, ``NUM_CLASSES``, ``NUM_EPOCHS``,
    ``IOUTRAIN`` and ``device``) and performs, in order:

    1. Build the train/val datasets and loaders.
    2. Take up to seven validation batches, keep their two input images on
       ``device`` and write the first sample of each batch (both inputs and
       the label) to ``OUTPUT_DIR/<i>/``.
    3. Train for ``NUM_EPOCHS`` epochs with Adam and cross-entropy against
       labels binarised at 128.  After every epoch, the thresholded class-1
       prediction for each preview batch is written next to its inputs.
    4. Save the model weights and write one thresholded class-1 prediction
       per test batch.

    Returns:
        None.  All results are written to disk or printed.
    """
    co_transform = MyCoTransform(ENCODER_ONLY,
                                 augment=True,
                                 height=IMAGE_HEIGHT)
    co_transform_val = MyCoTransform(ENCODER_ONLY,
                                     augment=False,
                                     height=IMAGE_HEIGHT)

    # train data
    dataset_train = idd_lite(DATA_ROOT, co_transform, 'train')
    print("length of training set: ", len(dataset_train))
    # validation data
    dataset_val = idd_lite(DATA_ROOT, co_transform_val, 'val')
    print("length of validation set: ", len(dataset_val))

    # NOTE: PLEASE DON'T CHANGE batch_size and num_workers here. We have limited resources.
    loader_train = DataLoader(dataset_train,
                              num_workers=NUM_WORKERS,
                              batch_size=BATCH_SIZE,
                              shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=NUM_WORKERS,
                            batch_size=BATCH_SIZE,
                            shuffle=True)

    # Keep a few validation batches around so the same inputs can be
    # re-predicted after every epoch.  ``zip`` with ``range`` stops cleanly
    # when the loader yields fewer than seven batches.
    seven_val_images = []
    for i, batch in zip(range(7), loader_val):
        # The training/test loops unpack four items per batch (two images,
        # label, filename); only the first three are needed here, so slice
        # instead of hard-coding the tuple length.
        val_image_A, val_image_B, val_image_labels = batch[:3]
        seven_val_images.append(
            (val_image_A.to(device), val_image_B.to(device)))

        # cv2.imwrite does not create missing directories; it just fails.
        preview_dir = os.path.join(OUTPUT_DIR, str(i))
        os.makedirs(preview_dir, exist_ok=True)

        # Only the first sample of the batch is stored; inputs are scaled by
        # 255 and moved from channel-first to channel-last for OpenCV.
        cv2.imwrite(
            os.path.join(preview_dir, 'A.tiff'),
            np.rollaxis((val_image_A[0, :, :, :].squeeze().cpu().numpy() *
                         255).astype('uint8'), 0, 3))
        cv2.imwrite(
            os.path.join(preview_dir, 'B.tiff'),
            np.rollaxis((val_image_B[0, :, :, :].squeeze().cpu().numpy() *
                         255).astype('uint8'), 0, 3))
        cv2.imwrite(os.path.join(preview_dir, 'label.tiff'),
                    (val_image_labels[0, :, :, :].squeeze().cpu().numpy()
                     ).astype('uint8'))

    # Cross-entropy loss: the penalty grows as the confidence assigned to the
    # correct class shrinks (negative log-likelihood of the true class).
    criterion = torch.nn.CrossEntropyLoss()

    print("length of training couples: ", len(loader_train))
    print(len(loader_val))

    # ## Model ##
    model_file = importlib.import_module('erfnet')
    model = model_file.Net(NUM_CLASSES).to(device)

    # ### Optimizer ###
    # We use adam optimizer. It can be replaced with SGD and other optimizers
    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)
    start_epoch = 1

    print("device used: ", device)

    # ### Training Procedure ###
    softmax = torch.nn.Softmax(dim=1)

    steps_loss = 50  # print running statistics every `steps_loss` batches
    my_start_time = time.time()
    for epoch in range(start_epoch, NUM_EPOCHS + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        epoch_loss = []
        time_train = []

        doIouTrain = IOUTRAIN

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        # Re-enter training mode every epoch: the preview inference at the
        # end of the previous epoch switched the model to eval mode.
        model.train()
        for step, (images, images1, labels,
                   filename) in enumerate(loader_train):
            start_time = time.time()
            inputs = images.to(device)
            inputs1 = images1.to(device)
            targets = labels.to(device)
            # Binarise the label map: values >= 128 become class 1, the rest
            # class 0.  The clone keeps the masks independent of the writes.
            targets_orig = targets.clone()
            targets[targets_orig >= 128] = 1
            targets[targets_orig < 128] = 0

            outputs = model([inputs, inputs1], only_encode=ENCODER_ONLY)
            # zero the parameter gradients
            optimizer.zero_grad()
            # backward + optimize
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            # print statistics
            if steps_loss > 0 and step % steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    'loss: {:0.4} (epoch: {}, step: {})'.format(
                        average, epoch, step),
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / BATCH_SIZE))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Save one prediction per preview batch and epoch.  Inference runs in
        # eval mode without autograd so dropout/batch-norm behave
        # deterministically and no graph is kept alive.
        model.eval()
        with torch.no_grad():
            for i, (preview_A, preview_B) in enumerate(seven_val_images):
                # The preview tensors were already moved to `device` above.
                outputs_val = model([preview_A, preview_B],
                                    only_encode=ENCODER_ONLY)
                outputs_val = softmax(outputs_val)
                # Binary mask of the first sample: class-1 probability > 0.5.
                cv2.imwrite(
                    os.path.join(OUTPUT_DIR, str(i),
                                 'epoch' + str(epoch) + '_output.tiff'),
                    (((outputs_val[0, 1, :, :] > 0.5) *
                      255).squeeze().cpu().numpy()).astype('uint8'))

    my_end_time = time.time()
    print(my_end_time - my_start_time)

    print(
        'loss: {:0.4} (epoch: {}, step: {})'.format(average, epoch, step),
        "// Avg time/img: %.4f s" %
        (sum(time_train) / len(time_train) / BATCH_SIZE))

    # NOTE: no quantitative validation pass over `loader_val` is run here;
    # validation data is only used for the qualitative previews above.

    torch.save(model.state_dict(), r'C:\Users\inbal.tlgip\modelsave.pt')

    ##################### predictions on test data #####################
    dataset_test = idd_lite(DATA_ROOT, co_transform_val, 'test')
    loader_test = DataLoader(dataset_test,
                             num_workers=NUM_WORKERS,
                             batch_size=BATCH_SIZE,
                             shuffle=True)
    model.eval()
    with torch.no_grad():
        for step, (images, images1, labels,
                   filename) in enumerate(loader_test):
            outputs_val = model([images.to(device), images1.to(device)],
                                only_encode=ENCODER_ONLY)
            outputs_val = softmax(outputs_val)
            # NOTE(review): only the first sample of every batch is written,
            # and files are named by batch index of a shuffled loader.
            cv2.imwrite(
                r'D:\Users Data\inbal.tlgip\Project\output_images\test_output/' +
                str(step) + '.tiff',
                (((outputs_val[0, 1, :, :] > 0.5) *
                  255).squeeze().cpu().numpy()).astype('uint8'))
Example #2
0
def train(args, model, enc=False):
    """Train ``model`` on Cityscapes, validating and checkpointing every epoch.

    Args:
        args: Namespace-like object.  Attributes read here: ``datadir``,
            ``height``, ``num_workers``, ``batch_size``, ``cuda``,
            ``savedir``, ``resume``, ``num_epochs``, ``visualize``,
            ``steps_plot``, ``port``, ``iouTrain``, ``iouVal``,
            ``steps_loss`` and ``epochs_save``.
        model: Network called as ``model(inputs, only_encode=enc)``.
        enc: When true, use the encoder class weights, pass
            ``only_encode=True`` to the model and write to the
            ``*_encoder`` / ``*_enc`` log and checkpoint files.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Raises:
        AssertionError: if ``args.datadir`` does not exist, or if
            ``args.resume`` is set and no checkpoint is found.

    Files written under ``../save/<args.savedir>``: the model description,
    a tab-separated per-epoch log, the rolling checkpoint (via
    ``save_checkpoint``), periodic weight snapshots every
    ``args.epochs_save`` epochs, and the best weights with a ``best*.txt``
    summary.
    """
    best_acc = 0

    # TODO: calculate weights by processing dataset histogram (now its being set by hand from the torch values)
    # create a loader to run all images and calculate histogram of labels, then create weight array using class balancing
    if enc:
        class_weights = (
            2.3653597831726, 4.4237880706787, 2.9691488742828,
            5.3442072868347, 5.2983593940735, 5.2275490760803,
            5.4394111633301, 5.3659925460815, 3.4170460700989,
            5.2414722442627, 4.7376127243042, 5.2286224365234,
            5.455126285553, 4.3019247055054, 5.4264230728149,
            5.4331531524658, 5.433765411377, 5.4631009101868,
            5.3947434425354,
        )
    else:
        class_weights = (
            2.8149201869965, 6.9850029945374, 3.7890393733978,
            9.9428062438965, 9.7702074050903, 9.5110931396484,
            10.311357498169, 10.026463508606, 4.6323022842407,
            9.5608062744141, 7.8698215484619, 9.5168733596802,
            10.373730659485, 6.6616044044495, 10.260489463806,
            10.287888526917, 10.289801597595, 10.405355453491,
            10.138095855713,
        )

    weight = torch.ones(NUM_CLASSES)
    for class_idx, class_weight in enumerate(class_weights):
        weight[class_idx] = class_weight
    # Class 19 gets zero weight, i.e. it does not contribute to the loss.
    weight[19] = 0

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train', 50)
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val', 100)
    print(len(dataset_train))
    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()
    criterion = CrossEntropyLoss2d(weight)

    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the header only once; later runs append rows to the same log.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4        #https://github.com/pytorch/pytorch/issues/1893

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),  eps=1e-08, weight_decay=2e-4)     ## scheduler 1
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999), eps=1e-08, weight_decay=1e-4)      ## scheduler 2

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        # NOTE: torch.load unpickles the file; only resume from trusted checkpoints.
        checkpoint = torch.load(filenameCheckpoint)
        # The checkpoint stores `epoch + 1`, i.e. the next epoch to run.
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler     ## scheduler 1
    # Polynomial decay of the base learning rate over the epochs (power 0.9).
    lambda1 = lambda epoch: pow((1 - ((epoch - 1) / args.num_epochs)), 0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)          ## scheduler 2

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        # NOTE(review): passing `epoch` explicitly keeps the schedule right
        # after a resume, but recent PyTorch releases deprecate this argument.
        scheduler.step(epoch)    ## scheduler 2

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):

            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = images
            targets = labels
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            # The criterion is fed the first channel of the label tensor.
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):   # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validate on the val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        for step, (images, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            # `volatile=True` is ignored by PyTorch >= 0.4; no_grad() is what
            # actually stops autograd from recording the validation pass.
            with torch.no_grad():
                inputs = images
                targets = labels
                outputs = model(inputs, only_encode=enc)
                loss = criterion(outputs, targets[:, 0])

            epoch_loss_val.append(loss.item())
            time_val.append(time.time() - start_time)

            # Add batch to calculate TP, FP and FN for iou estimation
            if doIouVal:
                iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):   # merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                                f'VAL output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'VAL target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                      "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1   # update lr if needed

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # remember best valIoU and save checkpoint
        # NOTE(review): without IoU the score is the negated loss, which
        # cannot exceed the initial best_acc of 0 — confirm this is intended.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # Periodic snapshots are keyed on the epoch counter; the previous
        # check used `step`, the index of the last validation batch.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))

        # SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        # Epoch		Train-loss		Test-loss	Train-IoU	Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return model   # return model (convenience for encoder-decoder training)
Example #3
0
def train(args, model, enc=False):
    best_acc = 0

    #TODO: calculate weights by processing dataset histogram (now its being set by hand from the torch values)
    #create a loder to run all images and calculate histogram of labels, then create weight array using class balancing

    weight = torch.ones(NUM_CLASSES)
    if (enc):        
        weight[0] = 4.38133159
        weight[1] = 1.29574148
    else:
        weight[0] = 4.40513628
        weight[1] = 1.293674
        
    if (enc):
        up = torch.nn.Upsample(scale_factor=16, mode='bilinear')
    else:
        up = torch.nn.Upsample(scale_factor=2, mode='bilinear')
        
    if args.cuda:
        up = up.cuda()

    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)#1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)#1024)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()
  
    if args.weighted:
        criterion = CrossEntropyLoss2d(weight)
    else:            
        criterion = CrossEntropyLoss2d()
        
    print(type(criterion))

    savedir = args.savedir

    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"    

    if (not os.path.exists(automated_log_path)):    #dont add first line if it exists 
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))


    #TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4        #https://github.com/pytorch/pytorch/issues/1893

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),  eps=1e-08, weight_decay=2e-4)     ## scheduler 1
    optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),  eps=1e-08, weight_decay=1e-4)      ## scheduler 2

    start_epoch = 1
    if args.resume:
        #Must load weights, optimizer, epoch and best value. 
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler     ## scheduler 1
    lambda1 = lambda epoch: pow((1-((epoch-1)/args.num_epochs)),0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)                             ## scheduler 2

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs+1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)    ## scheduler 2

        epoch_loss = []
        time_train = []
     
        doIouTrain = args.iouTrain   
        doIouVal =  args.iouVal      

        if (doIouTrain):
            iouEvalTrain = iouEval(NUM_CLASSES, args.ignoreindex)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels, images_orig, labels_orig) in enumerate(loader):

            start_time = time.time()
            #print (labels.size())
            #print (np.unique(labels.numpy()))
            #print("labels: ", np.unique(labels[0].numpy()))
            #labels = torch.ones(4, 1, 512, 1024).long()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            #print("targets", np.unique(targets[:, 0].cpu().data.numpy()))

            optimizer.zero_grad()
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.data[0])
            time_train.append(time.time() - start_time)

            if (doIouTrain):
                #start_time_iou = time.time()
                upsampledOutputs = up(outputs)
                iouEvalTrain.addBatch(upsampledOutputs.max(1)[1].unsqueeze(1).data, labels_orig)
                #print ("Time to add confusion matrix: ", time.time() - start_time_iou)      

            #print(outputs.size())
            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                #image[0] = image[0] * .229 + .485
                #image[1] = image[1] * .224 + .456
                #image[2] = image[2] * .225 + .406
                #print("output", np.unique(outputs[0].cpu().max(0)[1].data.numpy()))
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):   #merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                    f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                    f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                    f'target (epoch: {epoch}, step: {step})')
                print ("Time to paint images: ", time.time() - start_time_plot)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(f'loss: {average:0.4} (epoch: {epoch}, step: {step})', 
                        "// Avg time/img: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size))

            
        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        
        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain)+'{:0.2f}'.format(iouTrain*100) + '\033[0m'
            print ("EPOCH IoU on TRAIN set: ", iouStr, "%", iou_classes)  

        #Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if (doIouVal):
            iouEvalVal = iouEval(NUM_CLASSES, args.ignoreindex)

        for step, (images, labels, images_orig, labels_orig) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images, volatile=True)    #volatile flag makes it free backward or outputs for eval
            targets = Variable(labels, volatile=True)
            outputs = model(inputs, only_encode=enc) 

            loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.data[0])
            time_val.append(time.time() - start_time)


            #Add batch to calculate TP, FP and FN for iou estimation
            if (doIouVal):
                #start_time_iou = time.time()
                upsampledOutputs = up(outputs)
                iouEvalVal.addBatch(upsampledOutputs.max(1)[1].unsqueeze(1).data, labels_orig)
                #print ("Time to add confusion matrix: ", time.time() - start_time_iou)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):   #merge gpu tensors
                    board.image(color_transform(outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                    f'VAL output (epoch: {epoch}, step: {step})')
                else:
                    board.image(color_transform(outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                    f'VAL output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                    f'VAL target (epoch: {epoch}, step: {step})')
                print ("Time to paint images: ", time.time() - start_time_plot)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})', 
                        "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))
                       

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1   # update lr if needed

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal)+'{:0.2f}'.format(iouVal*100) + '\033[0m'
            print ("EPOCH IoU on VAL set: ", iouStr, "%", iou_classes) 
           

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal 
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'    
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model),
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
            'optimizer' : optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if (is_best):
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))   
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))           

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss	Train-IoU	Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr ))
    
    return(model)   #return model (convenience for encoder-decoder training)
Example #4
0
def main(args):
    """Evaluate pretrained ERFNet weights on a Cityscapes subset and print IoU.

    Builds an ``ERFNet(NUM_CLASSES)`` wrapped in ``DataParallel``, copies the
    weights found at ``args.loadDir + args.loadWeights`` into it, runs the
    model over every batch of the requested subset and prints the per-class
    and mean IoU accumulated by ``iouEval``.

    Attributes read from ``args``: ``loadDir``, ``loadModel``, ``loadWeights``,
    ``cpu``, ``datadir``, ``subset``, ``num_workers`` and ``batch_size``.
    ``loadModel`` is only used to print the model path; the architecture
    itself is always ``ERFNet``.

    Raises:
        FileNotFoundError: if ``args.datadir`` does not exist.
    """
    # Display names for the per-class IoU entries, in class-index order.
    class_names = (
        "Road",
        "sidewalk",
        "building",
        "wall",
        "fence",
        "pole",
        "traffic light",
        "traffic sign",
        "vegetation",
        "terrain",
        "sky",
        "person",
        "rider",
        "car",
        "truck",
        "bus",
        "train",
        "motorcycle",
        "bicycle",
    )

    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = ERFNet(NUM_CLASSES)

    # Wrap before loading so the parameter names carry the same "module."
    # prefix DataParallel adds.
    # NOTE(review): presumably the checkpoint was saved from a DataParallel
    # model - confirm against how the weights file was produced.
    model = torch.nn.DataParallel(model)
    if not args.cpu:
        model = model.cuda()

    def load_my_state_dict(model, state_dict):
        """Copy every entry of ``state_dict`` whose name exists in ``model``.

        Entries whose name is unknown to the model are reported on stdout and
        skipped instead of raising, so partially matching checkpoints load.
        """
        own_state = model.state_dict()
        for name, param in state_dict.items():
            if name not in own_state:
                print(name, " not loaded")
                continue
            own_state[name].copy_(param)
        return model

    # Always deserialize onto CPU storage: a checkpoint saved from a GPU would
    # otherwise fail to load in --cpu mode on a machine without CUDA.
    # ``copy_`` above moves the values to whatever device the model lives on.
    # NOTE: torch.load unpickles the file; only load weights you trust.
    state_dict = torch.load(weightspath,
                            map_location=lambda storage, loc: storage)
    model = load_my_state_dict(model, state_dict)
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        # Fail loudly instead of printing and continuing with a dataset
        # directory that is not there.
        raise FileNotFoundError("Error: datadir could not be loaded")

    loader = DataLoader(cityscapes(args.datadir,
                                   input_transform_cityscapes,
                                   target_transform_cityscapes,
                                   subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()

    # ``Variable(..., volatile=True)`` no longer disables autograd on
    # PyTorch >= 0.4; ``torch.no_grad()`` is the supported way to run
    # inference without building a graph.
    with torch.no_grad():
        for step, (images, labels, filename, _filename_gt) in enumerate(loader):
            if not args.cpu:
                images = images.cuda()
                labels = labels.cuda()

            outputs = model(images)

            # Arg-max over the class dimension gives the predicted label map.
            iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

            # Progress output only: show the path below "leftImg8bit/" when
            # that marker is present, otherwise the full file name.
            name_parts = filename[0].split("leftImg8bit/")
            filenameSave = name_parts[1] if len(name_parts) > 1 else filename[0]

            print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = [
        getColorEntry(iou_classes[i])
        + '{:0.2f}'.format(iou_classes[i] * 100)
        + '\033[0m'
        for i in range(iou_classes.size(0))
    ]

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    print("Per-Class IoU:")
    for class_iou_str, class_name in zip(iou_classes_str, class_names):
        print(class_iou_str, class_name)
    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train(args, model_student, model_teacher, enc=False):
    global best_acc

    weight = torch.ones(1)
    
    assert os.path.exists(args.datadir), "Error: datadir (dataset directory) could not be loaded"

    # Set data loading variables
    co_transform = MyCoTransform(enc, augment=True, height=480)#1024)
    co_transform_val = MyCoTransform(enc, augment=False, height=480)#1024)
    dataset_train = self_supervised_power(args.datadir, co_transform, 'train', file_format="csv", label_name="class", subsample=args.subsample)
    # dataset_train = self_supervised_power(args.datadir, None, 'train')
    dataset_val = self_supervised_power(args.datadir, None, 'val', file_format="csv", label_name="class", subsample=args.subsample)

    if args.force_n_classes > 0:
        color_transform_classes_prob = ColorizeClassesProb(args.force_n_classes)  # Automatic color based on max class probability
        color_transform_classes = ColorizeClasses(args.force_n_classes)  # Automatic color based on max class probability

    loader = DataLoader(dataset_train, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=True)
    loader_val = DataLoader(dataset_val, num_workers=args.num_workers, batch_size=args.batch_size, shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    # Set Loss functions
    if args.force_n_classes > 0:
        criterion = L1LossClassProbMasked() # L1 loss weighted with class prob with averaging over mini-batch
    else:
        criterion = L1LossMasked()     

    criterion = CrossEntropyLoss2d()

    criterion_trav = L1LossTraversability()
    criterion_consistency = MSELossWeighted()
    criterion_val = CrossEntropyLoss2d()    
    criterion_acc = ClassificationAccuracy()
    print(type(criterion))

    savedir = f'../save/{args.savedir}'

    if (enc):
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"    

    if (not os.path.exists(automated_log_path)):    #dont add first line if it exists 
        with open(automated_log_path, "a") as myfile:
            myfile.write("Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate")

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model_student))


    #TODO: reduce memory in first gpu: https://discuss.pytorch.org/t/multi-gpu-training-memory-usage-in-balance/4163/4        #https://github.com/pytorch/pytorch/issues/1893

    #optimizer = Adam(model.parameters(), 5e-4, (0.9, 0.999),  eps=1e-08, weight_decay=2e-4)     ## scheduler 1
    optimizer = Adam(model_student.parameters(), LEARNING_RATE, BETAS,  eps=OPT_EPS, weight_decay=WEIGHT_DECAY)
    if args.alternate_optimization:
        params_prob = [param for name, param in model.named_parameters() if name != "module.class_power"]
        params_power  = [param for name, param in model.named_parameters() if name == "module.class_power"]
        optimizer_prob = Adam(params_prob, LEARNING_RATE, BETAS,  eps=OPT_EPS, weight_decay=WEIGHT_DECAY)
        optimizer_power = Adam(params_power, LEARNING_RATE, BETAS,  eps=OPT_EPS, weight_decay=WEIGHT_DECAY)

    start_epoch = 1

    if args.resume:
        #Must load weights, optimizer, epoch and best value. 
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(filenameCheckpoint), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model_student.load_state_dict(checkpoint['state_dict'])
        model_teacher.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        best_acc = checkpoint['best_acc']
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # Initialize teacher with same weights as student. 
    copyWeightsToModelNoGrad(model_student, model_teacher)

    #scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5) # set up scheduler     ## scheduler 1
    lambda1 = lambda epoch: pow((1-((epoch-1)/args.num_epochs)),0.9)  ## scheduler 2
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)                             ## scheduler 2
    if args.alternate_optimization:
        scheduler_prob = lr_scheduler.LambdaLR(optimizer_prob, lr_lambda=lambda1)                             ## scheduler 2
        scheduler_power = lr_scheduler.LambdaLR(optimizer_power, lr_lambda=lambda1)                             ## scheduler 2

    if args.visualize:
        board = Dashboard(args.port)
        writer = SummaryWriter()
        log_base_dir = writer.file_writer.get_logdir() + "/"
        print("Saving tensorboard log to: " + log_base_dir)
        total_steps_train = 0
        total_steps_val = 0
        # Figure out histogram plot indices.
        steps_hist = int(len(loader_val)/NUM_HISTOGRAMS)
        steps_img_train = int(len(loader)/(NUM_IMG_PER_EPOCH-1))
        if steps_img_train == 0:
            steps_img_train = 1
        steps_img_val = int(len(loader_val)/(NUM_IMG_PER_EPOCH-1))
        if steps_img_val == 0:
            steps_img_val = 1
        hist_bins = np.arange(-0.5, args.force_n_classes+0.5, 1.0)

    for epoch in range(start_epoch, args.num_epochs+1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        if epoch < MAX_CONSISTENCY_EPOCH:
            cur_consistency_weight = epoch / MAX_CONSISTENCY_EPOCH
        else:
            cur_consistency_weight = 1.0

        if args.no_mean_teacher:
            cur_consistency_weight = 0.0

        if args.alternate_optimization:
            if epoch % 2 == 0:
                scheduler_power.step(epoch)
            else:
                scheduler_prob.step(epoch)
        else:
            scheduler.step(epoch)    ## scheduler 2

        average_loss_student_val = 0
        average_loss_teacher_val = 0

        epoch_loss_student = []
        epoch_loss_teacher = []
        epoch_acc_student = []
        epoch_acc_teacher = []
        epoch_loss_trav_student = []
        epoch_loss_trav_teacher = []
        epoch_loss_consistency = []
        time_train = []
        time_load = []
        time_iter = [0.0]
     
        doIouTrain = args.iouTrain   
        doIouVal =  args.iouVal      


        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model_student.train()
        model_teacher.train()

        start_time = time.time()

        for step, (images1, images2, labels) in enumerate(loader):

            time_load.append(time.time() - start_time)

            start_time = time.time()
            #print (labels.size())
            #print (np.unique(labels.numpy()))
            #print("labels: ", np.unique(labels[0].numpy()))
            #labels = torch.ones(4, 1, 512, 1024).long()
            if args.cuda:
                images1 = images1.cuda()
                images2 = images2.cuda()
                labels = labels.cuda()

            inputs1 = Variable(images1)
            inputs2 = Variable(images2)
            targets = Variable(labels)
            if (args.force_n_classes) > 0:
                # Forced into discrete classes. 
                output_student_prob, output_student_trav, output_student_power = model_student(inputs1, only_encode=enc)
                output_teacher_prob, output_teacher_trav, output_teacher_power = model_teacher(inputs2, only_encode=enc)
                if args.alternate_optimization:
                    if epoch % 2 == 0:
                        optimizer_power.zero_grad()
                    else:
                        optimizer_prob.zero_grad()
                else:
                    optimizer.zero_grad()
                loss_student_pred = criterion(output_student_prob, targets)
                loss_teacher_pred = criterion(output_teacher_prob, targets)
                loss_consistency = criterion_consistency(output_student_prob, output_teacher_prob, cur_consistency_weight)
                acc_student = criterion_acc(output_student_prob, targets)
                acc_teacher = criterion_acc(output_teacher_prob, targets)
            else:
                # Straight regressoin
                output_student, output_student_trav = model_student(inputs1, only_encode=enc)
                output_teacher, output_teacher_trav = model_teacher(inputs2, only_encode=enc)
                optimizer.zero_grad()
                loss_student_pred = criterion(output_student, targets)
                loss_teacher_pred = criterion(output_teacher, targets)
                loss_consistency = criterion_consistency(output_student, output_teacher, cur_consistency_weight)

            # Loss independent of how scalar value is determined
            loss_student_trav = criterion_trav(output_student_trav, targets)
            loss_teacher_trav = criterion_trav(output_teacher_trav, targets)


            #print("targets", np.unique(targets[:, 0].cpu().data.numpy()))

            # Do backward pass.
            loss_student_pred.backward(retain_graph=True)
            if epoch>0 and not args.no_mean_teacher:
                loss_student_trav.backward(retain_graph=True)
                loss_consistency.backward()
            else: 
                loss_student_trav.backward()


            if args.alternate_optimization:
                if epoch % 2 == 0:
                    optimizer_power.step()
                else:
                    optimizer_prob.step()
            else:
                optimizer.step()

            # Average over first 50 epochs.
            if epoch < DISCOUNT_RATE_START_EPOCH:
                cur_discount_rate = DISCOUNT_RATE_START
            else:
                cur_discount_rate = DISCOUNT_RATE
            copyWeightsToModelWithDiscount(model_student, model_teacher, cur_discount_rate)

            # copyWeightsToModelWithDiscount(model_student, model_teacher, DISCOUNT_RATE)

            epoch_loss_student.append(loss_student_pred.data.item())
            epoch_loss_teacher.append(loss_teacher_pred.data.item())
            epoch_loss_trav_student.append(loss_student_trav.data.item())
            epoch_loss_trav_teacher.append(loss_teacher_trav.data.item())
            epoch_loss_consistency.append(loss_consistency.data.item())
            if (args.force_n_classes) > 0:
                epoch_acc_student.append(acc_student.data.item())
                epoch_acc_teacher.append(acc_teacher.data.item())
            time_train.append(time.time() - start_time)

            # if (doIouTrain):
            #     #start_time_iou = time.time()
            #     iouEvalTrain.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)
            #     #print ("Time to add confusion matrix: ", time.time() - start_time_iou)      

            #print(outputs.size())
            if args.visualize and step % steps_img_train == 0:
                step_vis_no = total_steps_train + len(epoch_loss_student)

                # Figure out and compute tensor to visualize. 
                if args.force_n_classes > 0:
                    # Compute weighted power consumption
                    sum_dim = output_student_prob.dim()-3
                    # weighted_sum_output = (output_student_prob * output_student_power).sum(dim=sum_dim, keepdim=True)
                    if (isinstance(output_student_prob, list)):
                        max_prob, vis_output = getMaxProbValue(output_student_prob[0][0].cpu().data, output_student_power[0][0].cpu().data)
                        max_prob_teacher, vis_output_teacher = getMaxProbValue(output_teacher_prob[0][0].cpu().data, output_teacher_power[0][0].cpu().data)
                        writer.add_image("train/2_classes", color_transform_classes_prob(output_student_prob[0][0].cpu().data), step_vis_no)
                        writer.add_image("train/3_max_class_probability", max_prob[0][0], step_vis_no)
                        # writer.add_image("train/4_weighted_output", color_transform_output(weighted_sum_output[0][0].cpu().data), step_vis_no)
                    else:
                        max_prob, vis_output = getMaxProbValue(output_student_prob[0].cpu().data, output_student_power[0].cpu().data)
                        max_prob_teacher, vis_output_teacher = getMaxProbValue(output_teacher_prob[0].cpu().data, output_teacher_power[0].cpu().data)
                        writer.add_image("train/2_classes", color_transform_classes_prob(output_student_prob[0].cpu().data), step_vis_no)
                        writer.add_image("train/3_max_class_probability", max_prob[0], step_vis_no)
                        # writer.add_image("train/4_weighted_output", color_transform_output(weighted_sum_output[0].cpu().data), step_vis_no)
                else:
                    if (isinstance(output_teacher, list)):
                        vis_output = output_student[0][0].cpu().data
                        vis_output_teacher = output_teacher[0][0].cpu().data
                    else:
                        vis_output = output_student[0].cpu().data
                        vis_output_teacher = output_teacher[0].cpu().data

                if (isinstance(output_teacher_trav, list)):
                    trav_output = output_student_trav[0][0].cpu().data
                    trav_output_teacher = output_teacher_trav[0][0].cpu().data
                else:
                    trav_output = output_student_trav[0].cpu().data
                    trav_output_teacher = output_teacher_trav[0].cpu().data

                start_time_plot = time.time()
                image1 = inputs1[0].cpu().data
                image2 = inputs2[0].cpu().data
                # board.image(image, f'input (epoch: {epoch}, step: {step})')
                writer.add_image("train/1_input_student", image1, step_vis_no)
                writer.add_image("train/1_input_teacher", image2, step_vis_no)
                # writer.add_image("train/5_output_student", color_transform_output(vis_output), step_vis_no)
                # writer.add_image("train/5_output_teacher", color_transform_output(vis_output_teacher), step_vis_no)
                writer.add_image("train/7_output_trav_student", trav_output, step_vis_no)
                writer.add_image("train/7_output_trav_teacher", trav_output_teacher, step_vis_no)
                # board.image(color_transform_target(targets[0].cpu().data),
                #     f'target (epoch: {epoch}, step: {step})')
                writer.add_image("train/6_target", color_transform_classes(targets.cpu().data), step_vis_no)

                # Visualize graph.
                writer.add_graph(model_teacher, inputs2)

                print ("Time for visualization: ", time.time() - start_time_plot)
                

        len_epoch_loss = len(epoch_loss_student)
        for ind, val in enumerate(epoch_loss_student):
            writer.add_scalar("train/instant_loss_student", val, total_steps_train + ind)
        for ind, val in enumerate(epoch_loss_teacher):
            writer.add_scalar("train/instant_loss_teacher", val, total_steps_train + ind)
        for ind, val in enumerate(epoch_loss_trav_student):
            writer.add_scalar("train/instant_loss_trav_student", val, total_steps_train + ind)
        for ind, val in enumerate(epoch_loss_trav_teacher):
            writer.add_scalar("train/instant_loss_trav_teacher", val, total_steps_train + ind)
        for ind, val in enumerate(epoch_loss_consistency):
            writer.add_scalar("train/instant_loss_consistency", val, total_steps_train + ind)
        if (args.force_n_classes) > 0:
            for ind, val in enumerate(epoch_acc_student):
                writer.add_scalar("train/instant_acc_student", val, total_steps_train + ind)
            for ind, val in enumerate(epoch_acc_teacher):
                writer.add_scalar("train/instant_acc_teacher", val, total_steps_train + ind)
        total_steps_train += len_epoch_loss
        avg_loss_teacher = sum(epoch_loss_teacher)/len(epoch_loss_teacher)
        writer.add_scalar("train/epoch_loss_student", sum(epoch_loss_student)/len(epoch_loss_student), total_steps_train)
        writer.add_scalar("train/epoch_loss_teacher", avg_loss_teacher, total_steps_train)
        writer.add_scalar("train/epoch_loss_trav_student", sum(epoch_loss_trav_student)/len(epoch_loss_trav_student), total_steps_train)
        writer.add_scalar("train/epoch_loss_trav_teacher", sum(epoch_loss_trav_teacher)/len(epoch_loss_trav_teacher), total_steps_train)
        writer.add_scalar("train/epoch_loss_consistency", sum(epoch_loss_consistency)/len(epoch_loss_consistency), total_steps_train)
        if (args.force_n_classes) > 0:
            writer.add_scalar("train/epoch_acc_student", sum(epoch_acc_student)/len(epoch_acc_student), total_steps_train)
            writer.add_scalar("train/epoch_acc_teacher", sum(epoch_acc_teacher)/len(epoch_acc_teacher), total_steps_train)
        # Clear loss for next loss print iteration.
        # Output class power costs
        power_dict = {}
        if args.force_n_classes > 0:
            for ind, val in enumerate(output_teacher_power.squeeze()):
                power_dict[str(ind)] = val
            writer.add_scalars("params/class_cost", power_dict, total_steps_train)
        epoch_loss_student = []
        epoch_loss_teacher = []
        epoch_loss_consistency = []
        epoch_loss_trav_student = []
        epoch_loss_trav_teacher = []
        epoch_acc_student = []
        epoch_acc_teacher = []
        # Print current loss. 
        print(f'loss: {avg_loss_teacher:0.4} (epoch: {epoch}, step: {step})', 
                "// Train: %.4f s" % (sum(time_train) / len(time_train) / args.batch_size), 
                "// Load: %.4f s" % (sum(time_load) / len(time_load) / args.batch_size),
                "// Iter: %.4f s" % (sum(time_iter) / len(time_iter) / args.batch_size))

        if step == 0:
            time_iter.clear()
        time_iter.append(time.time() - start_time)
        # Save time for image loading duration.
        start_time = time.time()

            
        average_epoch_loss_train = avg_loss_teacher   
        
        iouTrain = 0
        if (doIouTrain):
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain)+'{:0.2f}'.format(iouTrain*100) + '\033[0m'
            print ("EPOCH IoU on TRAIN set: ", iouStr, "%")  

        #Validate on 500 val images after each epoch of training
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model_student.eval()
        model_teacher.eval()
        epoch_loss_student_val = []
        epoch_loss_teacher_val = []
        epoch_acc_student_val = []
        epoch_acc_teacher_val = []
        epoch_loss_trav_student_val = []
        epoch_loss_trav_teacher_val = []
        time_val = []


        for step, (images1, images2, labels) in enumerate(loader_val):
            start_time = time.time()
            if args.cuda:
                images1 = images1.cuda()
                images2 = images2.cuda()
                labels = labels.cuda()

            inputs1 = Variable(images1, volatile=True)    #volatile flag makes it free backward or outputs for eval
            inputs2 = Variable(images2, volatile=True)    #volatile flag makes it free backward or outputs for eval
            targets = Variable(labels, volatile=True)

            if args.force_n_classes:
                output_student_prob, output_student_trav, output_student_power = model_student(inputs1, only_encode=enc) 
                output_teacher_prob, output_teacher_trav, output_teacher_power = model_teacher(inputs2, only_encode=enc) 
                max_prob, output_student = getMaxProbValue(output_student_prob, output_student_power)
                max_prob, output_teacher = getMaxProbValue(output_teacher_prob, output_teacher_power)
                # Compute weighted power consumption
                sum_dim = output_student_prob.dim()-3
                # weighted_sum_output = (output_student_prob * output_student_power).sum(dim=sum_dim, keepdim=True)
            else:
                output_student, output_student_trav = model_student(inputs1, only_encode=enc)
                output_teacher, output_teacher_trav = model_teacher(inputs2, only_encode=enc)

            loss_student = criterion_val(output_student_prob, targets)
            loss_teacher = criterion_val(output_teacher_prob, targets)
            loss_student_trav = criterion_trav(output_student_trav, targets)
            loss_teacher_trav = criterion_trav(output_teacher_trav, targets)
            epoch_loss_student_val.append(loss_student.data.item())
            epoch_loss_teacher_val.append(loss_teacher.data.item())
            epoch_loss_trav_student_val.append(loss_student_trav.data.item())
            epoch_loss_trav_teacher_val.append(loss_teacher_trav.data.item())
            if args.force_n_classes:
                acc_student = criterion_acc(output_student_prob, targets)
                acc_teacher = criterion_acc(output_teacher_prob, targets)
                epoch_acc_student_val.append(acc_student.data.item())
                epoch_acc_teacher_val.append(acc_teacher.data.item())
            time_val.append(time.time() - start_time)


            #Add batch to calculate TP, FP and FN for iou estimation
            # if (doIouVal):
            #     #start_time_iou = time.time()
            #     iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, targets.data)
            #     #print ("Time to add confusion matrix: ", time.time() - start_time_iou)

            # Plot images
            if args.visualize and step % steps_img_val == 0:
                if (isinstance(output_teacher_trav, list)):
                    trav_output = output_student_trav[0][0].cpu().data
                    trav_output_teacher = output_teacher_trav[0][0].cpu().data
                else:
                    trav_output = output_student_trav[0].cpu().data
                    trav_output_teacher = output_teacher_trav[0].cpu().data

                step_vis_no = total_steps_val + len(epoch_loss_student_val)
                start_time_plot = time.time()
                image1 = inputs1[0].cpu().data
                image2 = inputs2[0].cpu().data
                # board.image(image, f'VAL input (epoch: {epoch}, step: {step})')
                writer.add_image("val/1_input_student", image1, step_vis_no)
                writer.add_image("val/1_input_teacher", image2, step_vis_no)
                if isinstance(output_teacher, list):   #merge gpu tensors
                    # board.image(color_transform_output(outputs[0][0].cpu().data),
                    # f'VAL output (epoch: {epoch}, step: {step})')
                    # writer.add_image("val/5_output_teacher", color_transform_output(output_teacher[0][0].cpu().data), step_vis_no)
                    # writer.add_image("val/5_output_student", color_transform_output(output_student[0][0].cpu().data), step_vis_no)
                    if args.force_n_classes > 0:
                        writer.add_image("val/2_classes", color_transform_classes_prob(output_teacher_prob[0][0].cpu().data), step_vis_no)
                        writer.add_image("val/3_max_class_probability", max_prob[0][0], step_vis_no)
                        # writer.add_image("val/4_weighted_output", color_transform_output(weighted_sum_output[0][0].cpu().data), step_vis_no)

                else:
                    # board.image(color_transform_output(outputs[0].cpu().data),
                    # f'VAL output (epoch: {epoch}, step: {step})')
                    # writer.add_image("val/5_output_teacher", color_transform_output(output_teacher[0].cpu().data), step_vis_no)
                    # writer.add_image("val/5_output_student", color_transform_output(output_student[0].cpu().data), step_vis_no)
                    if args.force_n_classes > 0:
                        writer.add_image("val/2_classes", color_transform_classes_prob(output_teacher_prob[0].cpu().data), step_vis_no)
                        writer.add_image("val/3_max_class_probability", max_prob[0], step_vis_no)
                        # writer.add_image("val/4_weighted_output", color_transform_output(weighted_sum_output[0].cpu().data), step_vis_no)
                # board.image(color_transform_target(targets[0].cpu().data),
                #     f'VAL target (epoch: {epoch}, step: {step})')
                writer.add_image("val/7_output_trav_student", trav_output, step_vis_no)
                writer.add_image("val/7_output_trav_teacher", trav_output_teacher, step_vis_no)
                writer.add_image("val/6_target", color_transform_classes(targets.cpu().data), step_vis_no)
                print ("Time to paint images: ", time.time() - start_time_plot)
            # Plot histograms
            if args.force_n_classes > 0 and args.visualize and steps_hist > 0 and step % steps_hist == 0:
                image1 = inputs1[0].cpu().data+0.5  # +0.5 to remove zero-mean normalization
                image2 = inputs2[0].cpu().data+0.5
                hist_ind = int(step / steps_hist)
                if (isinstance(output_teacher_prob, list)):
                    _, hist_array = output_teacher_prob[0][0].cpu().data.max(dim=0, keepdim=True)
                else:
                    _, hist_array = output_teacher_prob[0].cpu().data.max(dim=0, keepdim=True)

                writer.add_histogram("val/hist_"+str(hist_ind), hist_array.numpy().flatten(), total_steps_train, hist_bins)  # Use train steps so we can compare with class power plot
                if isinstance(output_teacher, list):
                    writer.add_image("val/classes_"+str(hist_ind), color_transform_classes_prob(output_teacher_prob[0][0].cpu().data), total_steps_train)
                else:
                    writer.add_image("val/classes_"+str(hist_ind), color_transform_classes_prob(output_teacher_prob[0].cpu().data), total_steps_train)

                if epoch == start_epoch:
                    writer.add_image("val/hist/input_"+str(hist_ind), image2, total_steps_train)  # Visualize image used to compute histogram
                       
        total_steps_val += len(epoch_loss_student_val)
        avg_loss_teacher_val = sum(epoch_loss_teacher_val) / len(epoch_loss_teacher_val)
        print(f'VAL loss_teacher: {avg_loss_teacher_val:0.4} (epoch: {epoch}, step: {total_steps_val})', 
                "// Avg time/img: %.4f s" % (sum(time_val) / len(time_val) / args.batch_size))
        writer.add_scalar("val/epoch_loss_student", sum(epoch_loss_student_val) / len(epoch_loss_student_val), total_steps_val)
        writer.add_scalar("val/epoch_loss_teacher", avg_loss_teacher_val, total_steps_val)
        writer.add_scalar("val/epoch_loss_trav_student", sum(epoch_loss_trav_student_val) / len(epoch_loss_trav_student_val), total_steps_val)
        writer.add_scalar("val/epoch_loss_trav_teacher", sum(epoch_loss_trav_teacher_val) / len(epoch_loss_trav_teacher_val), total_steps_val)
        if args.force_n_classes:
            writer.add_scalar("val/epoch_acc_student", sum(epoch_acc_student_val) / len(epoch_acc_student_val), total_steps_val)
            writer.add_scalar("val/epoch_acc_teacher", sum(epoch_acc_teacher_val) / len(epoch_acc_teacher_val), total_steps_val)

        epoch_loss_student_val = []
        epoch_loss_teacher_val = []
        epoch_acc_student_val = []
        epoch_acc_teacher_val = []
        epoch_loss_trav_student_val = []
        epoch_loss_trav_teacher_val = []

        average_epoch_loss_val = avg_loss_teacher_val
        #scheduler.step(average_epoch_loss_val, epoch)  ## scheduler 1   # update lr if needed

        iouVal = 0
        if (doIouVal):
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal)+'{:0.2f}'.format(iouVal*100) + '\033[0m'
            print ("EPOCH IoU on VAL set: ", iouStr, "%") 
           

        # remember best valIoU and save checkpoint
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal 
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'    
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': str(model_teacher),
            'state_dict': model_teacher.state_dict(),
            'best_acc': best_acc,
            'optimizer' : optimizer.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        #SAVE MODEL AFTER EPOCH
        if (enc):
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        if args.epochs_save > 0 and step > 0 and step % args.epochs_save == 0:
            torch.save(model_teacher.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if (is_best):
            torch.save(model_teacher.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if (not enc):
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))   
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" % (epoch, iouVal))           

        #SAVE TO FILE A ROW WITH THE EPOCH RESULT (train loss, val loss, train IoU, val IoU)
        #Epoch		Train-loss		Test-loss	Train-IoU	Test-IoU		learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" % (epoch, average_epoch_loss_train, average_epoch_loss_val, iouTrain, iouVal, usedLr ))
    
    return(model_student, model_teacher)   #return model (convenience for encoder-decoder training)
Example #6
0
def train(args, model, enc=False):
    """Train ``model`` with a class-weighted focal loss, validating every epoch.

    Builds the cityscapes train/val loaders from ``args.datadir``, optimises
    with Adam under a polynomial learning-rate decay, and after every epoch
    runs validation, saves checkpoints/weights under ``../save/<args.savedir>``
    and appends one row to the automated log file.

    Args:
        args: Parsed options. Attributes read here: ``datadir``, ``height``,
            ``num_workers``, ``batch_size``, ``cuda``, ``savedir``,
            ``num_epochs``, ``iouVal``, ``steps_loss`` and ``epochs_save``.
        model: Network to train; called as ``model(inputs, only_encode=enc)``.
        enc: Forwarded to the transforms and to the model as ``only_encode``.
            When true the ``*_encoder`` / ``*_enc`` file names are used and
            the global ``best_acc`` is reset to 0 after the final epoch.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Raises:
        AssertionError: If ``args.datadir`` does not exist.
    """
    global best_acc

    # Per-class loss weights, indexed by class id. The last entry is 0, so
    # that class contributes nothing to the weighted loss.
    # NOTE(review): presumably index 27 is the ignored/unlabeled class - confirm.
    class_weights = (
        121.21, 947.02, 151.92, 428.31, 25.88, 235.97, 885.72, 911.87,
        307.49, 204.69, 813.92, 5.83, 34.22, 453.34, 346.10, 250.19,
        119.99, 75.28, 76.71, 8.58, 281.68, 924.07, 3.91, 7.14,
        88.89, 59.00, 126.59, 0,
    )
    # Display names for the first 27 classes, in class-id order.
    class_names = (
        "pole", "slight", "bboard", "tlight", "car", "truck", "bicycle",
        "motor", "bus", "tsignf", "tsignb", "road", "sidewalk", "curbcut",
        "crosspln", "bikelane", "curb", "fence", "wall", "building",
        "person", "rider", "sky", "vege", "terrain", "markings", "crosszeb",
    )

    weight = torch.ones(NUM_CLASSES)
    for class_index, class_weight in enumerate(class_weights):
        weight[class_index] = class_weight

    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(enc, augment=True, height=args.height)
    co_transform_val = MyCoTransform(enc, augment=False, height=args.height)
    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    # Only move the class weights to the GPU when CUDA was requested; the
    # batches below follow the same switch.
    criterion = FocalLoss2d(weight.cuda() if args.cuda else weight)

    print(type(criterion))

    # NOTE(review): the directory is assumed to exist already - it is not
    # created here.
    savedir = f'../save/{args.savedir}'

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the header row only once, so that re-runs keep appending rows.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(),
                     1e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)

    start_epoch = 1

    # Polynomial ("poly") decay of the learning rate over args.num_epochs.
    lambda1 = lambda epoch: pow(
        (1 - ((epoch - 1) / args.num_epochs)), 0.9)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    time_train_perepoch = []
    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")
        start_time_perepoch = time.time()

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouVal = args.iouVal

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()

            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)
            optimizer.zero_grad()
            # targets[:, 0] drops dimension 1 of the label tensor before it
            # is handed to the loss.
            loss = criterion(outputs, targets[:, 0])
            loss.backward()
            optimizer.step()

            # .item() extracts a plain Python float; indexing a 0-dim tensor
            # with [0] is not supported by current PyTorch.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        time_train_perepoch.append(time.time() - start_time_perepoch)
        print("// Time per epoch: %.4f hours" %
              (sum(time_train_perepoch) / len(time_train_perepoch) / 3600.0))

        # Validate after each epoch of training.
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()

                inputs = Variable(images, requires_grad=False)
                targets = Variable(labels, requires_grad=False)
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if doIouVal:
                    # Feed the arg-max prediction over dimension 1 and the
                    # labels to the IoU evaluator.
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(
                        f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                        "// Avg time/img: %.4f s" %
                        (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        # Train IoU is never computed in this function; it is logged as 0.
        iouVal = 0
        iouTrain = 0
        if doIouVal:
            iouVal, iou_classes, accVal, acc_classes = iouEvalVal.getIoU()

            for class_index, class_name in enumerate(class_names):
                print("%-8s: %.6f" %
                      (class_name, iou_classes[class_index] * 100.0), "%\t")

            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

            for class_index, class_name in enumerate(class_names):
                print("%-8s: %.6f" %
                      (class_name, acc_classes[class_index] * 100.0), "%\t")

            accStr = getColorEntry(accVal) + '{:0.2f}'.format(
                accVal * 100) + '\033[0m'
            print("EPOCH ACC on VAL set: ", accStr, "%")

        # Remember the best validation score and save a checkpoint.
        # NOTE(review): when no IoU is available the raw validation loss is
        # compared with ">", i.e. a *higher* loss counts as better. This looks
        # unintended, but best_acc is a global initialised elsewhere, so the
        # comparison is left unchanged here - confirm.
        if iouVal == 0:
            current_acc = average_epoch_loss_val
        else:
            current_acc = iouVal
        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)
        if enc and epoch == args.num_epochs:
            # Reset the global after the last encoder epoch.
            # NOTE(review): presumably so that a following decoder run starts
            # from a fresh best score - confirm.
            best_acc = 0

        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth'
            filenameBest = savedir + '/model_best_enc.pth'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth'
            filenameBest = savedir + '/model_best.pth'
        save_checkpoint({
            'state_dict': model.state_dict(),
        }, is_best, filenameCheckpoint, filenameBest)

        # Save the model weights after the epoch.
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best_each.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best_each.pth'
        # Periodic snapshot every args.epochs_save epochs. The check is on
        # the epoch counter: the file name is per-epoch, and the previous
        # check on the last validation batch index was unrelated to it.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        # The "*_best_each" file is overwritten on every epoch.
        torch.save(model.state_dict(), filenamebest)
        print(f'save: {filenamebest} (epoch: {epoch})')
        filenameSuperBest = f'{savedir}/model_superbest.pth'
        if is_best:
            torch.save(model.state_dict(), filenameSuperBest)
            print('saving superbest')
        # NOTE(review): this file is rewritten on every epoch, not only on
        # the best one, so it actually records the latest epoch - confirm
        # whether it should be guarded by is_best.
        if not enc:
            with open(savedir + "/best.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))
        else:
            with open(savedir + "/best_encoder.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        # Append one row per epoch to the log:
        # Epoch, Train-loss, Test-loss, Train-IoU, Test-IoU, learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return (model)  #return model (convenience for encoder-decoder training)
Example #7
0
def train(savedir,
          model,
          dataloader_train,
          dataloader_eval,
          criterion,
          optimizer,
          args,
          enc=False):
    """Train ``model`` and validate after every epoch, tracking the best loss.

    Runs the epoch loop with a polynomial learning-rate decay, logs the mean
    train/validation loss to a tensorboard ``SummaryWriter`` and to a text
    log in ``savedir``, writes a checkpoint after every epoch, and stores the
    final weights in ``<savedir>/weight_final.pth``.

    Args:
        savedir: Directory that receives logs, checkpoints and weights.
        model: Network to train; called as ``model(inputs, only_encode=enc)``.
        dataloader_train: Iterable yielding ``(images, labels, _)`` triples.
        dataloader_eval: Iterable yielding ``(images, labels, _)`` triples.
        criterion: Loss, called as ``criterion(outputs, targets[:, 0])``.
        optimizer: Optimizer whose learning rate is scheduled here.
        args: Parsed options. Attributes read here: ``resume``,
            ``num_epochs``, ``visualize``, ``steps_plot``, ``port``,
            ``iouTrain``, ``iouVal``, ``cuda``, ``steps_loss``,
            ``batch_size`` and ``epochs_save``.
        enc: Forwarded to the model as ``only_encode``; when true the
            ``*_encoder`` / ``*_enc`` file names are used.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Raises:
        AssertionError: If ``args.resume`` is set but no checkpoint exists.
    """
    min_loss = float('inf')

    # use tensorboard
    writer = SummaryWriter(log_dir=savedir)
    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the header row only once, so that resumed runs keep appending.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    start_epoch = 1
    if args.resume:
        # Must load weights, optimizer, epoch and best value.
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'

        assert os.path.exists(
            filenameCheckpoint
        ), "Error: resume option was used but checkpoint was not found in folder"
        checkpoint = torch.load(filenameCheckpoint)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # The checkpoint written below stores the best validation loss under
        # the 'best_acc' key; restore it so that a resumed run does not treat
        # its first epoch as the new best. float() also accepts a 0-dim
        # tensor from checkpoints written before losses were stored as floats.
        min_loss = float(checkpoint['best_acc'])
        print("=> Loaded checkpoint at epoch {})".format(checkpoint['epoch']))

    # Polynomial ("poly") decay of the learning rate over args.num_epochs.
    lambda1 = lambda epoch: pow(
        (1 - ((epoch - 1) / args.num_epochs)), 0.9)
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda1)

    if args.visualize and args.steps_plot > 0:
        board = Dashboard(args.port)

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step(epoch)

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            # NOTE(review): mean_and_var is not defined in this function; it
            # must be a module-level name - confirm it exists.
            iouEvalTrain = iouEval(mean_and_var)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        model.train()
        for step, (images, labels, _) in enumerate(dataloader_train):

            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
            inputs = Variable(images)
            targets = Variable(labels)
            outputs = model(inputs, only_encode=enc)

            optimizer.zero_grad()
            # targets[:, 0] drops dimension 1 of the label tensor before it
            # is handed to the loss.
            loss = criterion(outputs, targets[:, 0])

            loss.backward()
            optimizer.step()

            # Store a plain float: keeping the loss tensor itself would hold
            # every batch's autograd graph alive until the end of the epoch.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.visualize and args.steps_plot > 0 and step % args.steps_plot == 0:
                start_time_plot = time.time()
                image = inputs[0].cpu().data
                board.image(image, f'input (epoch: {epoch}, step: {step})')
                if isinstance(outputs, list):  #merge gpu tensors
                    board.image(
                        color_transform(
                            outputs[0][0].cpu().max(0)[1].data.unsqueeze(0)),
                        f'output (epoch: {epoch}, step: {step})')
                else:
                    board.image(
                        color_transform(
                            outputs[0].cpu().max(0)[1].data.unsqueeze(0)),
                        f'output (epoch: {epoch}, step: {step})')
                board.image(color_transform(targets[0].cpu().data),
                            f'target (epoch: {epoch}, step: {step})')
                print("Time to paint images: ", time.time() - start_time_plot)
            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batch_size))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        writer.add_scalar('train_loss', average_epoch_loss_train, epoch)

        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Validate after each epoch of training.
        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(mean_and_var)

        for step, (images, labels, _) in enumerate(dataloader_eval):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
            inputs = Variable(images)
            targets = Variable(labels)
            with torch.no_grad():
                outputs = model(inputs, only_encode=enc)

                loss = criterion(outputs, targets[:, 0])
            epoch_loss_val.append(loss.item())
            time_val.append(time.time() - start_time)

            if doIouVal:
                # Mirror the training loop: without this call getIoU() below
                # would be evaluated on an evaluator that never saw a batch.
                iouEvalVal.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss_val) / len(epoch_loss_val)
                print(
                    f'VAL loss: {average:0.4} (epoch: {epoch}, step: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_val) / len(time_val) / args.batch_size))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)
        writer.add_scalar('eval_loss', average_epoch_loss_val, epoch)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # The best model is the one with the lowest mean validation loss.
        is_best = average_epoch_loss_val < min_loss
        min_loss = min(min_loss, average_epoch_loss_val)
        if enc:
            filenameCheckpoint = savedir + '/checkpoint_enc.pth.tar'
            filenameBest = savedir + '/model_best_enc.pth.tar'
        else:
            filenameCheckpoint = savedir + '/checkpoint.pth.tar'
            filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                # Key name kept for checkpoint compatibility; the value is
                # the best (lowest) validation loss so far.
                'best_acc': min_loss,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        # Save the model weights after the epoch.
        if enc:
            filename = f'{savedir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_encoder_best.pth'
        else:
            filename = f'{savedir}/model-{epoch:03}.pth'
            filenamebest = f'{savedir}/model_best.pth'
        # Periodic snapshot every args.epochs_save epochs. The check is on
        # the epoch counter: the file name is per-epoch, and the previous
        # check on the last validation batch index was unrelated to it.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            if not enc:
                with open(savedir + "/best.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))
            else:
                with open(savedir + "/best_encoder.txt", "w") as myfile:
                    myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                                 (epoch, iouVal))

        # Append one row per epoch to the log:
        # Epoch, Train-loss, Test-loss, Train-IoU, Test-IoU, learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))
    writer.close()
    torch.save(model.state_dict(), f'{savedir}/weight_final.pth')
    return (model)  #return model (convenience for encoder-decoder training)
Example #8
0
def _split_batch(images, labels, use_cuda):
    """Split a loader batch into RGB input, auxiliary input and targets.

    Channels 0-2 of ``images`` (along dim 1) are concatenated into the RGB
    tensor and channels 3-4 into the auxiliary tensor ("depth and luminance"
    per the original inline comment).  The three tensors are moved to the GPU
    when ``use_cuda`` is true and returned as-is otherwise.
    """
    channels = torch.split(images, 1, 1)
    rgb = torch.cat((channels[0], channels[1], channels[2]), 1)
    aux = torch.cat((channels[3], channels[4]), 1)  # depth and luminance
    if use_cuda:
        return rgb.cuda(), aux.cuda(), labels.cuda()
    return rgb, aux, labels


def train(args, model, classNum, epochNum, encoderOnly=False):
    """Train ``model`` for ``epochNum`` epochs and validate on selected epochs.

    Args:
        args: namespace providing ``dataset``, ``height``, ``saveDir``,
            ``num_workers``, ``batchSize``, ``optimizer`` ('adam' or 'sgd'),
            ``lr``, ``cuda``, ``doEvalTrain``, ``doEvalVal``,
            ``onlyWholeNet``, ``iter_loss`` and ``epochs_save``.
        model: network to optimise; called as
            ``model(rgb, aux, only_encoder=encoderOnly)``.
        classNum: number of classes, passed to the class-weight helper and
            to the IoU evaluators.
        epochNum: last epoch to run (epochs are numbered from 1).
        encoderOnly: selects the encoder-only forward pass and the
            ``*_encoder`` output file names.

    Returns:
        The trained ``model`` (convenience for encoder-decoder training).

    Raises:
        ValueError: if ``args.dataset`` or ``args.optimizer`` is not one of
            the values handled here.

    Validation, checkpointing, best-model tracking and the per-epoch log row
    only happen for epochs <= 10 and epochs >= 70.
    """
    if args.dataset != 'cityscapes':
        raise ValueError(
            f"unsupported dataset {args.dataset!r}; only 'cityscapes' is handled"
        )

    start_epoch = 1
    # -inf so that the first validated epoch is always recorded as the best,
    # including when the score is a negated loss (IoU evaluation disabled).
    best_acc = float('-inf')

    # === Dataset Processing === #
    co_transform = MyCoTransform(encoderOnly,
                                 dataAugment=True,
                                 height=args.height)
    co_transform_val = MyCoTransform(encoderOnly,
                                     dataAugment=False,
                                     height=args.height)
    dataDir = '/media/commlab/TenTB/swhung/SegNet/Cityscapes/'
    dataset_train = cityscapes(dataDir, co_transform, 'train')
    dataset_val = cityscapes(dataDir, co_transform_val, 'val')
    saveDir = f'../save/{args.saveDir}'
    os.makedirs(saveDir, exist_ok=True)

    loader_train = DataLoader(dataset_train,
                              num_workers=args.num_workers,
                              batch_size=args.batchSize,
                              shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batchSize,
                            shuffle=False)

    # === Optimization Setting === #

    # ** optimizer
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=1e-4)
    elif args.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=0.9,
                              weight_decay=1e-4)
    else:
        raise ValueError(
            f"unsupported optimizer {args.optimizer!r}; use 'adam' or 'sgd'")

    # ** learning rate scheduler (polynomial decay, power 0.9)
    def poly_decay(epoch):
        return pow((1 - ((epoch - 1) / epochNum)), 0.9)

    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=poly_decay)

    # ** loss function
    classWeight = getClassWeight(args.dataset, classNum)
    if args.cuda:
        classWeight = classWeight.cuda()

    criterion = CrossEntropyLoss2d(weight=classWeight, ignore_index=19)

    # === save information in .txt files === #
    suffix = "_encoder" if encoderOnly else ""
    automated_log_path = saveDir + "/automated_log" + suffix + ".txt"
    modeltxtpath = saveDir + "/model_txt" + suffix + ".txt"

    # Do not add the header line if the log already exists.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # === Training === #
    for epoch in range(start_epoch, epochNum + 1):

        print("----- TRAINING - EPOCH", epoch, "-----")

        model.train()

        scheduler.step(epoch - 1)

        epoch_loss = []
        time_train = []

        if args.doEvalTrain:
            iouEvalTrain = iouEval(classNum)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("learning rate: ", param_group['lr'])
            usedLr = float(param_group['lr'])

        # ** training iteration
        for step, (images, labels) in enumerate(loader_train):
            start_time = time.time()

            rgb_inputs, d_input, targets = _split_batch(
                images, labels, args.cuda)

            # run the model
            if args.onlyWholeNet:
                # NOTE(review): `inputs` is never assigned in this function,
                # so unless a module-level `inputs` exists this branch raises
                # NameError -- confirm which tensor the whole net expects.
                outputs = model(inputs)
            else:
                outputs = model(rgb_inputs, d_input, only_encoder=encoderOnly)

            # run the back-propagation
            loss = criterion(outputs, targets[:, 0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if args.doEvalTrain:
                iouEvalTrain.addBatch(
                    outputs.max(1)[1].unsqueeze(1).data, targets.data)

            # print the running training loss
            if args.iter_loss > 0 and step % args.iter_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print(
                    f'loss: {average:0.4} (epoch: {epoch}, iter: {step})',
                    "// Avg time/img: %.4f s" %
                    (sum(time_train) / len(time_train) / args.batchSize))

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)

        iouTrain = 0
        if args.doEvalTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        # Everything below (validation, checkpoints, log row) is skipped for
        # epochs 11..69.
        if 10 < epoch < 70:
            continue

        print("----- VALIDATING - EPOCH", epoch, "-----")

        model.eval()

        epoch_loss_val = []
        time_val = []

        if args.doEvalVal:
            iouEvalVal = iouEval(classNum)

        # ** validation iteration
        with torch.no_grad():
            for step, (images, labels) in enumerate(loader_val):
                start_time = time.time()

                rgb_inputs, d_input, targets = _split_batch(
                    images, labels, args.cuda)

                # run the model
                if args.onlyWholeNet:
                    # NOTE(review): same undefined `inputs` as in training.
                    outputs = model(inputs)
                else:
                    outputs = model(rgb_inputs,
                                    d_input,
                                    only_encoder=encoderOnly)

                loss = criterion(outputs, targets[:, 0])

                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                # Add batch to the IoU accumulator
                if args.doEvalVal:
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                # print the running validation loss
                if args.iter_loss > 0 and step % args.iter_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print(
                        f'VAL loss: {average:0.4} (epoch: {epoch}, iter: {step})',
                        "// Avg time/img: %.4f s" %
                        (sum(time_val) / len(time_val) / args.batchSize))

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        # print epoch val IoU accuracy
        iouVal = 0
        if args.doEvalVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # Remember the best score.  Without an IoU the negated loss is used
        # so that "larger is better" holds for both kinds of score.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal

        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        if encoderOnly:
            filenameCheckpoint = saveDir + '/checkpoint_enc.pth.tar'
            filenameBest = saveDir + '/model_best_encoder.pth.tar'
            filename = f'{saveDir}/model_encoder-{epoch:03}.pth'
            filenamebest = f'{saveDir}/model_best_encoder.pth'
            best_txt_path = saveDir + "/best_IoU_encoder.txt"
        else:
            filenameCheckpoint = saveDir + '/checkpoint.pth.tar'
            filenameBest = saveDir + '/model_best.pth.tar'
            filename = f'{saveDir}/model-{epoch:03}.pth'
            filenamebest = f'{saveDir}/model_best.pth'
            best_txt_path = saveDir + "/best_IoU.txt"

        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        # Save the weights every `epochs_save` epochs (keyed on the epoch
        # number, not on the index of the last validation batch).
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(f'save: {filename} (epoch: {epoch})')

        # save the best model
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            print(f'save: {filenamebest} (epoch: {epoch})')
            with open(best_txt_path, "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        # Append one row per validated epoch:
        # Epoch, Train-loss, Test-loss, Train-IoU, Test-IoU, learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                (epoch, average_epoch_loss_train, average_epoch_loss_val,
                 iouTrain, iouVal, usedLr))

    return model  # return model (convenience for encoder-decoder training)
Example #9
0
def main(args):
    """Evaluate a trained FSFNet on a CamVid subset and print IoU figures.

    Args:
        args: namespace providing ``loadDir``, ``loadModel``,
            ``loadWeights``, ``cpu``, ``datadir``, ``subset``,
            ``num_workers`` and ``batch_size``.

    The weights at ``loadDir + loadWeights`` are copied into a freshly built
    ``FSFNet(NUM_CLASSES)``, every batch of the loader is run through the
    model without gradient tracking, and per-class plus mean IoU are printed.
    """
    modelpath = args.loadDir + args.loadModel
    weightspath = args.loadDir + args.loadWeights

    print("Loading model: " + modelpath)
    print("Loading weights: " + weightspath)

    model = FSFNet(NUM_CLASSES)

    if not args.cpu:
        model = torch.nn.DataParallel(model).cuda()

    def load_my_state_dict(model, state_dict):
        """Copy matching entries of ``state_dict`` into ``model`` in place.

        A checkpoint key that is missing from the model is retried with its
        leading ``module.`` prefix removed; keys that still do not match are
        reported and skipped instead of raising.
        """
        own_state = model.state_dict()
        prefix = "module."
        for name, param in state_dict.items():
            target = name
            if target not in own_state and target.startswith(prefix):
                target = target[len(prefix):]
            if target not in own_state:
                print(name, " not loaded")
                continue
            own_state[target].copy_(param)
        return model

    # map_location keeps every tensor on the CPU while loading.
    model = load_my_state_dict(
        model,
        torch.load(weightspath, map_location=lambda storage, loc: storage))
    print("Model and weights LOADED successfully")

    model.eval()

    if not os.path.exists(args.datadir):
        print("Error: datadir could not be loaded")

    loader = DataLoader(camvid(args.datadir,
                               input_transform_camvid,
                               target_transform_camvid,
                               subset=args.subset),
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=False)

    iouEvalVal = iouEval(NUM_CLASSES)

    start = time.time()

    # no_grad replaces the removed `volatile=True` flag: no autograd graph is
    # built during evaluation.
    with torch.no_grad():
        for step, (images, labels, filename, filenameGt) in enumerate(loader):
            if not args.cpu:
                images = images.cuda()
                labels = labels.cuda()

            outputs = model(images)

            iouEvalVal.addBatch(outputs.max(1)[1].unsqueeze(1).data, labels)

            filenameSave = filename[0].split("images/")[1]

            print(step, filenameSave)

    iouVal, iou_classes = iouEvalVal.getIoU()

    iou_classes_str = [
        getColorEntry(iou_classes[i]) +
        '{:0.2f}'.format(iou_classes[i] * 100) + '\033[0m'
        for i in range(iou_classes.size(0))
    ]

    class_names = ("Sky", "Building", "Pole", "Road", "Pavement", "Tree",
                   "SignSymbol", "Fence", "Car", "Pedestrian", "Bicyclist")

    print("---------------------------------------")
    print("Took ", time.time() - start, "seconds")
    print("=======================================")
    print("Per-Class IoU:")
    for class_iou_str, class_name in zip(iou_classes_str, class_names):
        print(class_iou_str, class_name)

    print("=======================================")
    iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(iouVal * 100) + '\033[0m'
    print("MEAN IoU: ", iouStr, "%")
def train(args, rmodel, model, enc=False):
    """Run the restoration + segmentation pipeline over the validation set.

    Despite its name, this variant never calls ``backward`` or an optimizer
    step: for every epoch it steps both LR schedulers, prints the learning
    rates, and evaluates ``model`` on the validation split with three rounds
    of feedback through ``rmodel``.

    Args:
        args: namespace providing ``datadir``, ``height``, ``num_workers``,
            ``batch_size``, ``cuda``, ``savedir``, ``num_epochs``,
            ``iouVal`` and ``steps_loss``.
        rmodel: restoration network, called as
            ``rmodel(inputs, flag=..., r_fb1=..., r_fb2=...)``.
        model: segmentation network, called as
            ``model(x, only_encode=enc)``.
        enc: selects the ``*_encoder`` log file names and is forwarded to
            ``model`` as ``only_encode``.

    Returns:
        ``model``.
    """
    weight = classWeights(NUM_CLASSES)
    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    # Only the validation split is iterated in this function.
    co_transform_val = MyCoTransform(augment=False, height=args.height)
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()

    savedir = '/home/shyam.nandan/NewExp/F_erfnet_pytorch_ours_w_gt_v2_multiply/save/' + args.savedir  #change path

    if enc:
        automated_log_path = savedir + "/automated_log_encoder.txt"
        modeltxtpath = savedir + "/model_encoder.txt"
    else:
        automated_log_path = savedir + "/automated_log.txt"
        modeltxtpath = savedir + "/model.txt"

    # Write the header only when the log does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    # The optimizers exist only to drive the schedulers / LR printout below.
    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))

    start_epoch = 1
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)

    # Loss hyper-parameters; only index 0 is used here.
    Gamma = [0, 0, 0]
    Alpha = [1, 1, 1]

    doIouVal = args.iouVal

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step()
        rscheduler.step()

        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])

        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # no_grad replaces the removed `volatile=True` flag.
        with torch.no_grad():
            for step, (timages, images, labels,
                       filename) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()
                    timages = timages.cuda()

                inputs = timages
                targets = labels

                # Initial pass without feedback.
                ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
                outs = model(ss_inputs, only_encode=enc)

                tminus_outs = outs.detach()
                tplus_outs = outs.detach()

                # Three feedback rounds: the change between the two most
                # recent segmentation outputs and the previous restored
                # image are fed back into the restoration network.
                for _ in range(3):
                    ss_inputs = rmodel(inputs,
                                       flag=1,
                                       r_fb1=(tplus_outs - tminus_outs),
                                       r_fb2=ss_inputs.detach())

                    outs = model(ss_inputs.detach(), only_encode=enc)

                    tminus_outs = tplus_outs
                    tplus_outs = outs.detach()

                outputs = outs
                del outs, tminus_outs, tplus_outs
                gc.collect()

                loss = CB_iFl(outputs,
                              targets[:, 0],
                              weight,
                              gamma=Gamma[0],
                              alpha=Alpha[0])
                # .item() instead of .data[0], which fails on 0-dim tensors.
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if doIouVal:
                    predictions = outputs.max(1)[1].unsqueeze(1).data

                    # Per-batch IoU (separate evaluator) and epoch-wide IoU.
                    iouEvalVal_img = iouEval(NUM_CLASSES)
                    iouEvalVal_img.addBatch(predictions, targets.data)
                    iouEvalVal.addBatch(predictions, targets.data)

                    label_color = Colorize()(
                        outputs[0].max(0)[1].byte().cpu().data.unsqueeze(0))
                    label_save = ToPILImage()(label_color)

                    filenameSave = '../save_color_restored_joint_afl_CBFL/' + filename[
                        0].split('/')[-2]

                    im_iou, _ = iouEvalVal_img.getIoU()

                    os.makedirs(filenameSave, exist_ok=True)
                #Uncomment to save output
                #label_save.save(filenameSave+ '/' + str(" %6.4f " %im_iou[0].data.numpy()) + '_' + filename[0].split('/')[-1])

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print('Val loss:  ', average, 'Epoch:  ', epoch,
                          'Step:  ', step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print(iouVal, iou_classes, iouStr)

    return (model)
Example #11
0
def train(args, rmodel, model, enc=False):
    """Jointly train the restoration and segmentation networks.

    For every batch the restoration network ``rmodel`` is refined over three
    feedback rounds (L1 loss against the target images, one ``roptimizer``
    step per round); the segmentation outputs of all three rounds are then
    combined into one ``CB_iFl`` loss that updates ``model``.  After each
    epoch the pipeline is evaluated on the validation split, a checkpoint is
    written, and the best weights of both networks are kept.

    Args:
        args: namespace providing ``datadir``, ``height``, ``num_workers``,
            ``batch_size``, ``cuda``, ``savedir``, ``num_epochs``,
            ``iouTrain``, ``iouVal``, ``steps_loss`` and ``epochs_save``.
        rmodel: restoration network, called as
            ``rmodel(inputs, flag=..., r_fb1=..., r_fb2=...)``.
        model: segmentation network, called as
            ``model(x, only_encode=enc)``.
        enc: forwarded to ``model`` as ``only_encode``.

    Returns:
        ``model``.
    """
    # -inf so that the first epoch is always recorded as the best, including
    # when the score is a negated loss (IoU evaluation disabled).
    best_acc = float('-inf')
    weight = classWeights(NUM_CLASSES)
    assert os.path.exists(
        args.datadir), "Error: datadir (dataset directory) could not be loaded"

    co_transform = MyCoTransform(augment=True, height=args.height)
    co_transform_val = MyCoTransform(augment=False, height=args.height)

    dataset_train = cityscapes(args.datadir, co_transform, 'train')
    dataset_val = cityscapes(args.datadir, co_transform_val, 'val')

    loader = DataLoader(dataset_train,
                        num_workers=args.num_workers,
                        batch_size=args.batch_size,
                        shuffle=True)
    loader_val = DataLoader(dataset_val,
                            num_workers=args.num_workers,
                            batch_size=args.batch_size,
                            shuffle=False)

    if args.cuda:
        weight = weight.cuda()
    rcriterion = torch.nn.L1Loss()

    savedir = '/home/shyam.nandan/NewExp/final_code/save/' + args.savedir
    automated_log_path = savedir + "/automated_log.txt"
    modeltxtpath = savedir + "/model.txt"

    # Write the header only when the log does not exist yet.
    if not os.path.exists(automated_log_path):
        with open(automated_log_path, "a") as myfile:
            myfile.write(
                "Epoch\t\tTrain-loss\t\tTest-loss\t\tTrain-IoU\t\tTest-IoU\t\tlearningRate"
            )

    with open(modeltxtpath, "w") as myfile:
        myfile.write(str(model))

    optimizer = Adam(model.parameters(),
                     5e-4, (0.9, 0.999),
                     eps=1e-08,
                     weight_decay=2e-4)
    roptimizer = Adam(rmodel.parameters(), 2e-4, (0.9, 0.999))

    start_epoch = 1
    scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.99)
    rscheduler = lr_scheduler.StepLR(roptimizer, step_size=30, gamma=0.5)

    # Per-feedback-round loss hyper-parameters.  Defined once up front so
    # that validation does not depend on the training loop having run.
    Gamma = [0, 0.1, 0.2]
    Alpha = [1, 1, 1]

    for epoch in range(start_epoch, args.num_epochs + 1):
        print("----- TRAINING - EPOCH", epoch, "-----")

        scheduler.step()
        rscheduler.step()

        epoch_loss = []
        time_train = []

        doIouTrain = args.iouTrain
        doIouVal = args.iouVal

        if doIouTrain:
            iouEvalTrain = iouEval(NUM_CLASSES)

        usedLr = 0
        for param_group in optimizer.param_groups:
            print("Segmentation LEARNING RATE: ", param_group['lr'])
            usedLr = float(param_group['lr'])
        for param_group in roptimizer.param_groups:
            print("Restoration LEARNING RATE: ", param_group['lr'])

        model.train()
        for step, (timages, images, labels) in enumerate(loader):
            start_time = time.time()
            if args.cuda:
                images = images.cuda()
                labels = labels.cuda()
                timages = timages.cuda()

            inputs = timages
            itargets = images
            targets = labels

            # Initial pass without feedback.
            ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
            outs = model(ss_inputs, only_encode=enc)

            tminus_outs = outs.detach()
            tplus_outs = outs.detach()

            outputs = []
            for num_feedback in range(3):
                optimizer.zero_grad()
                roptimizer.zero_grad()

                ss_inputs = rmodel(inputs,
                                   flag=1,
                                   r_fb1=(tplus_outs - tminus_outs),
                                   r_fb2=ss_inputs.detach())

                # Restoration update for this feedback round.
                loss = rcriterion(ss_inputs, itargets)
                loss.backward()
                roptimizer.step()

                optimizer.zero_grad()
                roptimizer.zero_grad()

                # The restored image is detached so the segmentation loss
                # does not back-propagate into the restoration network.
                outs = model(ss_inputs.detach(), only_encode=enc)
                outputs.append(outs)

                tminus_outs = tplus_outs
                tplus_outs = outs.detach()

            del outs, tminus_outs, tplus_outs
            gc.collect()

            # Segmentation loss summed over the three feedback outputs.
            loss = 0.0
            for i, o in enumerate(outputs):
                loss += CB_iFl(o,
                               targets[:, 0],
                               weight,
                               gamma=Gamma[i],
                               alpha=Alpha[i])

            loss.backward()
            optimizer.step()

            # .item() instead of .data[0], which fails on 0-dim tensors.
            epoch_loss.append(loss.item())
            time_train.append(time.time() - start_time)

            if doIouTrain:
                # `outputs` is a list; score the final feedback round.
                iouEvalTrain.addBatch(
                    outputs[-1].max(1)[1].unsqueeze(1).data, targets.data)

            if args.steps_loss > 0 and step % args.steps_loss == 0:
                average = sum(epoch_loss) / len(epoch_loss)
                print('loss:  ', average, 'Epoch:  ', epoch, 'Step:  ', step)

        average_epoch_loss_train = sum(epoch_loss) / len(epoch_loss)
        iouTrain = 0
        if doIouTrain:
            iouTrain, iou_classes = iouEvalTrain.getIoU()
            iouStr = getColorEntry(iouTrain) + '{:0.2f}'.format(
                iouTrain * 100) + '\033[0m'
            print("EPOCH IoU on TRAIN set: ", iouStr, "%")

        print("----- VALIDATING - EPOCH", epoch, "-----")
        model.eval()
        epoch_loss_val = []
        time_val = []

        if doIouVal:
            iouEvalVal = iouEval(NUM_CLASSES)

        # no_grad replaces the removed `volatile=True` flag.
        with torch.no_grad():
            for step, (timages, images, labels) in enumerate(loader_val):
                start_time = time.time()
                if args.cuda:
                    images = images.cuda()
                    labels = labels.cuda()
                    timages = timages.cuda()

                inputs = timages
                targets = labels

                ss_inputs = rmodel(inputs, flag=0, r_fb1=0, r_fb2=0)
                outs = model(ss_inputs, only_encode=enc)
                tminus_outs = outs.detach()
                tplus_outs = outs.detach()

                for num_feedback in range(3):
                    ss_inputs = rmodel(inputs,
                                       flag=1,
                                       r_fb1=(tplus_outs - tminus_outs),
                                       r_fb2=ss_inputs.detach())

                    outs = model(ss_inputs.detach(), only_encode=enc)

                    tminus_outs = tplus_outs
                    tplus_outs = outs.detach()

                del ss_inputs, tplus_outs, tminus_outs
                outputs = outs
                loss = CB_iFl(outputs,
                              targets[:, 0],
                              weight,
                              gamma=Gamma[0],
                              alpha=Alpha[0])
                epoch_loss_val.append(loss.item())
                time_val.append(time.time() - start_time)

                if doIouVal:
                    iouEvalVal.addBatch(
                        outputs.max(1)[1].unsqueeze(1).data, targets.data)

                if args.steps_loss > 0 and step % args.steps_loss == 0:
                    average = sum(epoch_loss_val) / len(epoch_loss_val)
                    print('Val loss:  ', average, 'Epoch:  ', epoch,
                          'Step:  ', step)

        average_epoch_loss_val = sum(epoch_loss_val) / len(epoch_loss_val)

        iouVal = 0
        if doIouVal:
            iouVal, iou_classes = iouEvalVal.getIoU()
            iouStr = getColorEntry(iouVal) + '{:0.2f}'.format(
                iouVal * 100) + '\033[0m'
            print("EPOCH IoU on VAL set: ", iouStr, "%")

        # Remember the best score; without an IoU fall back to the negated
        # validation loss so that "larger is better" still holds.
        if iouVal == 0:
            current_acc = -average_epoch_loss_val
        else:
            current_acc = iouVal

        is_best = current_acc > best_acc
        best_acc = max(current_acc, best_acc)

        filenameCheckpoint = savedir + '/checkpoint.pth.tar'
        filenameBest = savedir + '/model_best.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'arch': str(model),
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            }, is_best, filenameCheckpoint, filenameBest)

        # SAVE MODEL AFTER EPOCH
        filename = f'{savedir}/model-{epoch:03}.pth'
        filenamebest = savedir + '/model_best.pth'

        # Keyed on the epoch number, not on the last validation batch index.
        if args.epochs_save > 0 and epoch % args.epochs_save == 0:
            torch.save(model.state_dict(), filename)
            print(filename, epoch)
        if is_best:
            torch.save(model.state_dict(), filenamebest)
            torch.save(rmodel.state_dict(), savedir + '/rmodel_best.pth')
            print(filenamebest, epoch)
            with open(savedir + "/best.txt", "w") as myfile:
                myfile.write("Best epoch is %d, with Val-IoU= %.4f" %
                             (epoch, iouVal))

        # Append one row per epoch:
        # Epoch, Train-loss, Test-loss, Train-IoU, Test-IoU, learningRate
        with open(automated_log_path, "a") as myfile:
            myfile.write("\n%d\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.4f\t\t%.8f" %
                         (epoch, average_epoch_loss_train,
                          average_epoch_loss_val, iouTrain, iouVal, usedLr))

    return (model)