def train_model(args):
    """
    Build, train and periodically validate a segmentation network.

    args:
       args: global arguments (argparse-style namespace). The function reads
             input_size ("H,W" string), cuda, gpus (comma-separated id
             string), model, classes, dataset, batch_size, train_type,
             num_workers, max_epochs, ignore_label, the use_ohem /
             use_label_smoothing / use_lovaszsoftmax / use_focal flags,
             savedir, resume, logFile, optim and lr.
             It writes per_iter, max_iter, gpu_nums and savedir back onto it.

    raises:
       Exception: --cuda was requested but no CUDA device is available.
       ValueError: args.optim is not one of the supported optimizers.
    """
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)
    print("=====> input size:{}".format(input_size))

    print(args)

    if args.cuda:
        print("=====> use gpu id: '{}'".format(args.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        if not torch.cuda.is_available():
            raise Exception(
                "No GPU found or Wrong gpu id, please run without --cuda")

    # set the seed
    setup_seed(GLOBAL_SEED)
    print("=====> set Global Seed: ", GLOBAL_SEED)

    cudnn.enabled = True
    print("=====> building network")

    # build the model and initialization
    model = build_model(args.model, num_classes=args.classes)
    init_weight(model,
                nn.init.kaiming_normal_,
                nn.BatchNorm2d,
                1e-3,
                0.1,
                mode='fan_in')

    print("=====> computing network parameters and FLOPs")
    total_parameters = netParams(model)
    print("the number of parameters: %d ==> %.2f M" %
          (total_parameters, (total_parameters / 1e6)))

    # load data and data augmentation
    datas, trainLoader, valLoader = build_dataset_train(
        args.dataset, args.classes, input_size, args.batch_size,
        args.train_type, False, False, args.num_workers)

    args.per_iter = len(trainLoader)
    args.max_iter = args.max_epochs * args.per_iter

    print('=====> Dataset statistics')
    print("data['classWeights']: ", datas['classWeights'])
    print('mean and std: ', datas['mean'], datas['std'])

    # define loss function, respectively
    weight = torch.from_numpy(datas['classWeights'])
    if args.dataset == 'pollen':
        weight = torch.tensor([1., 1.])

    # The flag-specific losses must be tested before the plain cross-entropy
    # fallback; otherwise the dataset-only test wins and the flags are ignored.
    # Precedence among flags: OHEM > label smoothing > Lovasz > focal.
    flag_losses = args.dataset in ('cityscapes', 'remote')
    smoothing_ok = args.dataset in ('camvid', 'cityscapes', 'remote')
    if flag_losses and args.use_ohem:
        # args.gpus is a comma-separated id string, so count the ids rather
        # than the characters ("0,1" is two GPUs, not three).
        num_gpus = max(
            1, len([g for g in args.gpus.split(',') if g.strip()]))
        min_kept = int(args.batch_size // num_gpus * h * w // 16)
        criteria = ProbOhemCrossEntropy2d(use_weight=True,
                                          ignore_label=args.ignore_label,
                                          thresh=0.7,
                                          min_kept=min_kept)
    elif smoothing_ok and args.use_label_smoothing:
        criteria = CrossEntropyLoss2dLabelSmooth(
            weight=weight, ignore_label=args.ignore_label)
    elif flag_losses and args.use_lovaszsoftmax:
        criteria = LovaszSoftmax(ignore_index=args.ignore_label)
    elif flag_losses and args.use_focal:
        criteria = FocalLoss2d(weight=weight, ignore_index=args.ignore_label)
    else:
        criteria = CrossEntropyLoss2d(weight=weight,
                                      ignore_label=args.ignore_label)

    if args.cuda:
        criteria = criteria.cuda()
        if torch.cuda.device_count() > 1:
            print("torch.cuda.device_count()=", torch.cuda.device_count())
            args.gpu_nums = torch.cuda.device_count()
            model = nn.DataParallel(model).cuda()  # multi-card data parallel
        else:
            args.gpu_nums = 1
            print("single GPU for training")
            model = model.cuda()  # 1-card data parallel
    elif not hasattr(args, 'gpu_nums'):
        # savedir below embeds gpu_nums; make sure it exists on CPU runs too
        args.gpu_nums = 0

    args.savedir = (args.savedir + args.dataset + '/' + args.model + 'bs' +
                    str(args.batch_size) + 'gpu' + str(args.gpu_nums) + "_" +
                    str(args.train_type) + '/')
    os.makedirs(args.savedir, exist_ok=True)

    start_epoch = 0

    # continue training
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            print("=====> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=====> no checkpoint found at '{}'".format(args.resume))

    model.train()
    cudnn.benchmark = True

    # define optimization strategy
    trainable_params = filter(lambda p: p.requires_grad, model.parameters())
    if args.optim == 'sgd':
        optimizer = torch.optim.SGD(trainable_params,
                                    lr=args.lr,
                                    momentum=0.9,
                                    weight_decay=1e-4)
    elif args.optim == 'adam':
        optimizer = torch.optim.Adam(trainable_params,
                                     lr=args.lr,
                                     betas=(0.9, 0.999),
                                     eps=1e-08,
                                     weight_decay=1e-4)
    elif args.optim == 'radam':
        optimizer = RAdam(trainable_params,
                          lr=args.lr,
                          betas=(0.90, 0.999),
                          eps=1e-08,
                          weight_decay=1e-4)
    elif args.optim == 'ranger':
        optimizer = Ranger(trainable_params,
                           lr=args.lr,
                           betas=(0.95, 0.999),
                           eps=1e-08,
                           weight_decay=1e-4)
    elif args.optim == 'adamw':
        optimizer = AdamW(trainable_params,
                          lr=args.lr,
                          betas=(0.9, 0.999),
                          eps=1e-08,
                          weight_decay=1e-4)
    else:
        # fail early with a clear message instead of a NameError in train()
        raise ValueError("unsupported optimizer: %s" % args.optim)

    lossTr_list = []
    epoches = []
    mIOU_val_list = []

    # Append to an existing log; a fresh log gets a header first. The context
    # manager closes the file even when training is interrupted by an error.
    logFileLoc = args.savedir + args.logFile
    log_exists = os.path.isfile(logFileLoc)
    with open(logFileLoc, 'a' if log_exists else 'w') as logger:
        if not log_exists:
            logger.write("Parameters: %s Seed: %s" %
                         (str(total_parameters), GLOBAL_SEED))
            logger.write("\n%s\t\t%s\t%s\t%s" %
                         ('Epoch', 'Loss(Tr)', 'mIOU (val)', 'lr'))
        logger.flush()

        print('=====> beginning training')
        for epoch in range(start_epoch, args.max_epochs):
            is_last_epoch = epoch == (args.max_epochs - 1)

            # training
            lossTr, lr = train(args, trainLoader, model, criteria, optimizer,
                               epoch)
            lossTr_list.append(lossTr)

            # validation (every second epoch and after the last one)
            if epoch % 2 == 0 or is_last_epoch:
                epoches.append(epoch)
                mIOU_val, _ = val(args, valLoader, model)
                mIOU_val_list.append(mIOU_val)
                # record train information
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.7f" %
                             (epoch, lossTr, mIOU_val, lr))
                logger.flush()
                print("Epoch : " + str(epoch) + ' Details')
                print(
                    "Epoch No.: %d\tTrain Loss = %.4f\t mIOU(val) = %.4f\t lr= %.6f\n"
                    % (epoch, lossTr, mIOU_val, lr))
            else:
                # record train information
                logger.write("\n%d\t\t%.4f\t\t\t\t%.7f" % (epoch, lossTr, lr))
                logger.flush()
                print("Epoch : " + str(epoch) + ' Details')
                print("Epoch No.: %d\tTrain Loss = %.4f\t lr= %.6f\n" %
                      (epoch, lossTr, lr))

            # save the model
            model_file_name = (args.savedir + '/model_' + str(epoch + 1) +
                               '.pth')
            state = {"epoch": epoch + 1, "model": model.state_dict()}

            # cityscapes keeps only the last 10 epochs plus every 50th one;
            # every other dataset is checkpointed each epoch
            if args.dataset == 'cityscapes':
                keep_checkpoint = (epoch >= args.max_epochs - 10
                                   or epoch % 50 == 0)
            else:
                keep_checkpoint = True
            if keep_checkpoint:
                torch.save(state, model_file_name)

            # draw plots for visualization
            if epoch % 5 == 0 or is_last_epoch:
                # refresh both figures every 5 epochs and after the last one
                fig1, ax1 = plt.subplots(figsize=(11, 8))
                ax1.plot(range(start_epoch, epoch + 1), lossTr_list)
                ax1.set_title("Average training loss vs epochs")
                ax1.set_xlabel("Epochs")
                ax1.set_ylabel("Current loss")
                fig1.savefig(args.savedir + "loss_vs_epochs.png")

                fig2, ax2 = plt.subplots(figsize=(11, 8))
                ax2.plot(epoches, mIOU_val_list, label="Val IoU")
                ax2.set_title("Average IoU vs epochs")
                ax2.set_xlabel("Epochs")
                ax2.set_ylabel("Current IoU")
                ax2.legend(loc='lower right')
                fig2.savefig(args.savedir + "iou_vs_epochs.png")

                plt.close('all')
Beispiel #2
0
def train_model(args):
    """
    Build, train and periodically validate a segmentation network on
    camvid or cityscapes.

    args:
       args: global arguments (argparse-style namespace). The function reads
             input_size ("H,W" string), cuda, gpus (comma-separated id
             string), model, classes, dataset, batch_size, train_type,
             random_scale, random_mirror, num_workers, max_epochs, savedir,
             resume, logFile and lr. It writes gpu_nums (CUDA runs only) and
             savedir back onto it.

    raises:
       Exception: --cuda was requested but no CUDA device is available.
       NotImplementedError: args.dataset is neither 'camvid' nor 'cityscapes'.
    """
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)
    print("=====> input size:{}".format(input_size))

    print(args)

    if args.cuda:
        print("=====> use gpu id: '{}'".format(args.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        if not torch.cuda.is_available():
            raise Exception(
                "No GPU found or Wrong gpu id, please run without --cuda")

    # set the seed
    setup_seed(GLOBAL_SEED)
    print("=====> set Global Seed: ", GLOBAL_SEED)

    cudnn.enabled = True
    print("=====> building network")

    # build the model and initialization
    model = build_model(args.model, num_classes=args.classes)
    init_weight(model,
                nn.init.kaiming_normal_,
                nn.BatchNorm2d,
                1e-3,
                0.1,
                mode='fan_in')

    print("=====> computing network parameters and FLOPs")
    total_parameters = netParams(model)
    print("the number of parameters: %d ==> %.2f M" %
          (total_parameters, (total_parameters / 1e6)))

    # load data and data augmentation
    datas, trainLoader, valLoader = build_dataset_train(
        args.dataset, input_size, args.batch_size, args.train_type,
        args.random_scale, args.random_mirror, args.num_workers)

    print('=====> Dataset statistics')
    print("data['classWeights']: ", datas['classWeights'])
    print('mean and std: ', datas['mean'], datas['std'])

    # define loss function, respectively
    weight = torch.from_numpy(datas['classWeights'])

    if args.dataset == 'camvid':
        criteria = CrossEntropyLoss2d(weight=weight, ignore_label=ignore_label)
        weight_decay = 2e-4
    elif args.dataset == 'cityscapes':
        # args.gpus is a comma-separated id string, so count the ids rather
        # than the characters ("0,1" is two GPUs, not three).
        num_gpus = max(
            1, len([g for g in args.gpus.split(',') if g.strip()]))
        min_kept = int(args.batch_size // num_gpus * h * w // 16)
        criteria = ProbOhemCrossEntropy2d(use_weight=True,
                                          ignore_label=ignore_label,
                                          thresh=0.7,
                                          min_kept=min_kept)
        weight_decay = 1e-5
    else:
        raise NotImplementedError(
            "This repository now supports two datasets: cityscapes and camvid, %s is not included"
            % args.dataset)

    if args.cuda:
        criteria = criteria.cuda()
        if torch.cuda.device_count() > 1:
            print("torch.cuda.device_count()=", torch.cuda.device_count())
            args.gpu_nums = torch.cuda.device_count()
            model = nn.DataParallel(model).cuda()  # multi-card data parallel
        else:
            args.gpu_nums = 1
            print("single GPU for training")
            model = model.cuda()  # 1-card data parallel

    args.savedir = (args.dataset + '/' + args.savedir + args.model + 'bs' +
                    str(args.batch_size) + "_" + str(args.train_type) + '/')
    os.makedirs(args.savedir, exist_ok=True)

    start_epoch = 0

    # continue training
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            print("=====> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=====> no checkpoint found at '{}'".format(args.resume))

    model.train()
    cudnn.benchmark = True

    # define optimization criteria: both supported datasets use Adam and
    # differ only in the weight decay chosen above
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=args.lr,
                                 betas=(0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=weight_decay)

    lossTr_list = []
    epoches = []
    mIOU_val_list = []

    # Append to an existing log; a fresh log gets a header first. The context
    # manager closes the file even when training is interrupted by an error.
    logFileLoc = args.savedir + args.logFile
    log_exists = os.path.isfile(logFileLoc)
    with open(logFileLoc, 'a' if log_exists else 'w') as logger:
        if not log_exists:
            logger.write("Parameters: %s Seed: %s" %
                         (str(total_parameters), GLOBAL_SEED))
            logger.write("\n%s\t\t%s\t%s\t%s" %
                         ('Epoch', 'Loss(Tr)', 'mIOU (val)', 'lr'))
        logger.flush()

        print('=====> beginning training')
        for epoch in range(start_epoch, args.max_epochs):
            # validation and plotting share the same schedule:
            # every 30 epochs and after the last epoch
            report_epoch = (epoch % 30 == 0
                            or epoch == (args.max_epochs - 1))

            # training
            lossTr, lr = train(args, trainLoader, model, criteria, optimizer,
                               epoch)
            lossTr_list.append(lossTr)

            # validation
            if report_epoch:
                epoches.append(epoch)
                mIOU_val, _ = val(args, valLoader, model)
                mIOU_val_list.append(mIOU_val)
                # record train information
                logger.write("\n%d\t\t%.4f\t\t%.4f\t\t%.7f" %
                             (epoch, lossTr, mIOU_val, lr))
                logger.flush()
                print("Epoch : " + str(epoch) + ' Details')
                print(
                    "Epoch No.: %d\tTrain Loss = %.4f\t mIOU(val) = %.4f\t lr= %.6f\n"
                    % (epoch, lossTr, mIOU_val, lr))
            else:
                # record train information
                logger.write("\n%d\t\t%.4f\t\t\t\t%.7f" % (epoch, lossTr, lr))
                logger.flush()
                print("Epoch : " + str(epoch) + ' Details')
                print("Epoch No.: %d\tTrain Loss = %.4f\t lr= %.6f\n" %
                      (epoch, lossTr, lr))

            # save the model: keep the last 10 epochs plus every 100th one
            if epoch >= args.max_epochs - 10 or epoch % 100 == 0:
                model_file_name = (args.savedir + '/model_' +
                                   str(epoch + 1) + '.pth')
                state = {"epoch": epoch + 1, "model": model.state_dict()}
                torch.save(state, model_file_name)

            # draw plots for visualization
            if report_epoch:
                fig1, ax1 = plt.subplots(figsize=(11, 8))
                ax1.plot(range(start_epoch, epoch + 1), lossTr_list)
                ax1.set_title("Average training loss vs epochs")
                ax1.set_xlabel("Epochs")
                ax1.set_ylabel("Current loss")
                fig1.savefig(args.savedir + "loss_vs_epochs.png")

                fig2, ax2 = plt.subplots(figsize=(11, 8))
                ax2.plot(epoches, mIOU_val_list, label="Val IoU")
                ax2.set_title("Average IoU vs epochs")
                ax2.set_xlabel("Epochs")
                ax2.set_ylabel("Current IoU")
                ax2.legend(loc='lower right')
                fig2.savefig(args.savedir + "iou_vs_epochs.png")

                plt.close('all')
Beispiel #3
0
def main(args):
    """
    Distributed training entry point: build model, loss and datasets, train
    for up to args.max_epochs epochs with periodic sliding-window validation,
    best-model checkpointing and early stopping.

    args:
       args: global arguments (argparse-style namespace). The function reads
             model, classes, backbone, pretrained, out_stride, mult_grid,
             root, dataset, base_size, crop_size, predict_mode, predict_type,
             cuda, gpus_id (comma-separated id string), batch_size, optim, lr,
             savedir, resume, max_epochs, val_epochs and local_rank.
             It writes local_rank (CUDA runs), savedir and save_seg_dir back.

    raises:
       Exception: --cuda was requested but no CUDA device is available.
       ValueError: args.optim is not 'sgd', 'adam' or 'adamw'.
    """
    # set the seed
    setup_seed(GLOBAL_SEED)

    # build the model and initialization weights
    model = build_model(args.model, args.classes, args.backbone, args.pretrained, args.out_stride, args.mult_grid)

    # define loss function, respectively
    criterion = build_loss(args, None, ignore_label)

    # load train set and data augmentation
    datas, traindataset = build_dataset_train(args.root, args.dataset, args.base_size, args.crop_size)
    # load the test set (called with gt=True here)
    testdataset, class_dict_df = build_dataset_test(args.root, args.dataset, args.crop_size,
                                                    mode=args.predict_mode, gt=True)

    # move model and criterion on cuda
    # NOTE(review): device, trainLoader and testLoader are only created inside
    # this branch, so the training loop below cannot run without --cuda.
    if args.cuda:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus_id
        # verify that a GPU is usable before touching NCCL / CUDA devices
        if not torch.cuda.is_available():
            raise Exception("No GPU found or Wrong gpu id, please run without --cuda")

        dist.init_process_group(backend="nccl", init_method='env://')
        args.local_rank = torch.distributed.get_rank()
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # count the comma-separated ids (also correct for ids >= 10)
        gpus = len([g for g in args.gpus_id.split(',') if g.strip()])

        trainLoader, model, criterion = Distribute(args, traindataset, model, criterion, device, gpus)
        # test with single card
        # NOTE(review): num_workers is set to args.batch_size here — confirm
        # this is intended rather than a dedicated worker-count argument.
        testLoader = data.DataLoader(testdataset, batch_size=args.batch_size,
                                     shuffle=True, num_workers=args.batch_size, pin_memory=True, drop_last=False)

    # define optimization strategy
    parameters = model.parameters()

    if args.optim == 'sgd':
        optimizer = torch.optim.SGD(parameters, lr=args.lr, momentum=0.9, weight_decay=5e-4, nesterov=False)
    elif args.optim == 'adam':
        optimizer = torch.optim.Adam(parameters, weight_decay=5e-4)
    elif args.optim == 'adamw':
        optimizer = torch.optim.AdamW(parameters, weight_decay=5e-4)
    else:
        # fail early with a clear message instead of a NameError later on
        raise ValueError("unsupported optimizer: %s" % args.optim)

    # initial log file val output save
    args.savedir = (args.savedir + args.dataset + '/' + args.model + '/')
    if not os.path.exists(args.savedir) and args.local_rank == 0:
        os.makedirs(args.savedir)

    # save_seg_dir
    args.save_seg_dir = os.path.join(args.savedir, args.predict_mode)
    if not os.path.exists(args.save_seg_dir) and args.local_rank == 0:
        os.makedirs(args.save_seg_dir)

    recorder = record_log(args)
    if args.resume is None and args.local_rank == 0:
        recorder.record_args(datas, str(netParams(model) / 1e6) + ' M', GLOBAL_SEED)

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=300)
    start_epoch = 1
    if args.local_rank == 0:
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n"
              ">>>>>>>>>>>  beginning training   >>>>>>>>>>>\n"
              ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")

    epoch_list = []
    lossTr_list = []
    Miou_list = []
    lossVal_list = []
    Miou = 0
    Best_Miou = 0
    # continue training
    if args.resume:
        # rebuild the curve histories from the existing log file
        logger, lines = recorder.resume_logfile()
        for line in lines:
            fields = line.strip().split()
            lossTr_list.append(float(fields[2]))
            # lines with more than three fields also carry validation results
            if len(fields) != 3:
                epoch_list.append(int(fields[0]))
                lossVal_list.append(float(fields[3]))
                Miou_list.append(float(fields[5]))

        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch'] + 1
            optimizer.load_state_dict(checkpoint['optimizer'])
            check_list = [i for i in checkpoint['model'].items()]
            # Checkpoint saved from a wrapped (multi-card) model
            # NOTE(review): keys are copied unchanged (k[:]), i.e. the
            # 'module.' prefix is kept — confirm this matches the model
            # returned by Distribute().
            if 'module.' in check_list[0][0]:
                new_stat_dict = {}
                for k, v in checkpoint['model'].items():
                    new_stat_dict[k[:]] = v
                model.load_state_dict(new_stat_dict, strict=True)
            # Checkpoint saved from an unwrapped (single-card) model
            else:
                model.load_state_dict(checkpoint['model'])
            if args.local_rank == 0:
                print("loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            if args.local_rank == 0:
                print("no checkpoint found at '{}'".format(args.resume))
    else:
        logger = recorder.initial_logfile()
        logger.flush()

    for epoch in range(start_epoch, args.max_epochs + 1):
        start_time = time.time()
        # training
        train_start = time.time()

        lossTr, lr = train(args, trainLoader, model, criterion, optimizer, epoch, device)
        if args.local_rank == 0:
            lossTr_list.append(lossTr)

        train_end = time.time()
        train_per_epoch_seconds = train_end - train_start
        validation_per_epoch_seconds = 60  # init validation time

        # validate on the first epoch, every val_epochs epochs and on each of
        # the last 10 epochs
        if epoch % args.val_epochs == 0 or epoch == 1 or args.max_epochs - 10 < epoch <= args.max_epochs:
            validation_start = time.time()

            loss, FWIoU, Miou, MIoU, PerCiou_set, Pa, PerCpa_set, Mpa, MF, F_set, F1_avg = \
                predict_multiscale_sliding(args=args, model=model,
                                           testLoader=testLoader,
                                           class_dict_df=class_dict_df,
                                           scales=[1.0],
                                           overlap=0.3,
                                           criterion=criterion,
                                           mode=args.predict_type,
                                           save_result=True)
            torch.cuda.empty_cache()

            if args.local_rank == 0:
                epoch_list.append(epoch)
                Miou_list.append(Miou)
                lossVal_list.append(loss.item())
                # record trainVal information
                recorder.record_trainVal_log(logger, epoch, lr, lossTr, loss,
                                             FWIoU, Miou, MIoU, PerCiou_set, Pa, Mpa,
                                             PerCpa_set, MF, F_set, F1_avg,
                                             class_dict_df)

                torch.cuda.empty_cache()
                validation_end = time.time()
                validation_per_epoch_seconds = validation_end - validation_start
        else:
            if args.local_rank == 0:
                # record train information
                recorder.record_train_log(logger, epoch, lr, lossTr)

        if args.local_rank == 0:
            # draw log fig
            draw_log(args, epoch, epoch_list, lossTr_list, Miou_list, lossVal_list)

            # save the model
            model_file_name = args.savedir + '/best_model.pth'
            last_model_file_name = args.savedir + '/last_model.pth'
            state = {
                "epoch": epoch,
                "model": model.state_dict(),
                'optimizer': optimizer.state_dict()
            }
            # Miou keeps its value from the most recent validation run
            if Miou > Best_Miou:
                Best_Miou = Miou
                torch.save(state, model_file_name)
                recorder.record_best_epoch(epoch, Best_Miou, Pa)

            # early_stopping monitor
            # NOTE(review): the break below only runs on rank 0 — confirm the
            # other ranks are stopped as well in multi-process runs.
            early_stopping.monitor(monitor=Miou)
            if early_stopping.early_stop:
                print("Early stopping and Save checkpoint")
                if not os.path.exists(last_model_file_name):
                    torch.save(state, last_model_file_name)
                    torch.cuda.empty_cache()  # empty_cache

                    # NOTE(review): unlike the per-epoch validation call, this
                    # call does not pass class_dict_df — confirm the helper
                    # accepts that.
                    loss, FWIoU, Miou, Miou_Noback, PerCiou_set, Pa, PerCpa_set, Mpa, MF, F_set, F1_Noback = \
                        predict_multiscale_sliding(args=args, model=model,
                                                   testLoader=testLoader,
                                                   scales=[1.0],
                                                   overlap=0.3,
                                                   criterion=criterion,
                                                   mode=args.predict_type,
                                                   save_result=False)
                    print("Epoch {}  lr= {:.6f}  Train Loss={:.4f}  Val Loss={:.4f}  Miou={:.4f}  PerCiou_set={}\n"
                          .format(epoch, lr, lossTr, loss, Miou, str(PerCiou_set)))
                break

            # estimated finish time: remaining training epochs plus remaining
            # validation runs
            # NOTE(review): the extra 43200 s (12 h) is presumably a time-zone
            # offset — confirm.
            total_second = start_time + (args.max_epochs - epoch) * train_per_epoch_seconds + \
                           ((args.max_epochs - epoch) / args.val_epochs + 10) * validation_per_epoch_seconds + 43200
            print('Best Validation MIoU:{}'.format(Best_Miou))
            print('Training deadline is: {}\n'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(total_second))))
Beispiel #4
0
def _select_criterion(args, weight, input_size):
    """Return the loss module matching ``args.dataset`` and the loss flags.

    args:
       args: global arguments (reads dataset, gpus, batch_size and use_* flags)
       weight: class-weight tensor handed to the weighted losses
       input_size: (h, w) tuple, only used to size ``min_kept`` for OHEM

    raises:
       NotImplementedError: unknown dataset, or cityscapes without a loss flag

    ``ignore_label`` is not defined in this block; it is read from module scope.
    """
    h, w = input_size
    dataset = args.dataset

    if dataset == 'camvid':
        # The smoothing variant must be tested first: a plain
        # ``dataset == 'camvid'`` test placed before it always wins.
        if args.use_label_smoothing:
            return CrossEntropyLoss2dLabelSmooth(weight=weight,
                                                 ignore_label=ignore_label)
        return CrossEntropyLoss2d(weight=weight, ignore_label=ignore_label)

    if dataset == 'cityscapes':
        if args.use_ohem:
            # args.gpus is the comma-separated id string exported as
            # CUDA_VISIBLE_DEVICES, so count ids rather than characters.
            gpu_ids = [g for g in args.gpus.split(',') if g.strip()]
            num_gpus = max(1, len(gpu_ids))
            min_kept = int(args.batch_size // num_gpus * h * w // 16)
            return ProbOhemCrossEntropy2d(use_weight=True,
                                          ignore_label=ignore_label,
                                          thresh=0.7,
                                          min_kept=min_kept)
        if args.use_label_smoothing:
            return CrossEntropyLoss2dLabelSmooth(weight=weight,
                                                 ignore_label=ignore_label)
        if args.use_lovaszsoftmax:
            return LovaszSoftmax(ignore_index=ignore_label)
        if args.use_focal:
            return FocalLoss2d(weight=weight, ignore_index=ignore_label)
        raise NotImplementedError(
            "no loss selected for dataset 'cityscapes': enable one of "
            "use_ohem, use_label_smoothing, use_lovaszsoftmax or use_focal")

    if dataset == 'paris':
        return CrossEntropyLoss2d(weight=weight, ignore_label=ignore_label)

    raise NotImplementedError(
        "This repository now supports three datasets: cityscapes, camvid and paris, %s is not included"
        % args.dataset)


def _select_optimizer(args, model):
    """Return the optimizer named by ``args.optim`` over trainable parameters.

    raises:
       NotImplementedError: ``args.optim`` is not one of the known names
    """
    params = [p for p in model.parameters() if p.requires_grad]
    name = args.optim

    if name == 'sgd':
        return torch.optim.SGD(params,
                               lr=args.lr,
                               momentum=0.9,
                               weight_decay=1e-4)
    if name == 'adam':
        return torch.optim.Adam(params,
                                lr=args.lr,
                                betas=(0.9, 0.999),
                                eps=1e-08,
                                weight_decay=1e-4)
    if name == 'radam':
        return RAdam(params,
                     lr=args.lr,
                     betas=(0.90, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)
    if name == 'ranger':
        return Ranger(params,
                      lr=args.lr,
                      betas=(0.95, 0.999),
                      eps=1e-08,
                      weight_decay=1e-4)
    if name == 'adamw':
        return AdamW(params,
                     lr=args.lr,
                     betas=(0.9, 0.999),
                     eps=1e-08,
                     weight_decay=1e-4)

    # Fail here instead of with a NameError on first use of ``optimizer``.
    raise NotImplementedError("unsupported optimizer: %s" % name)


def _load_history(log_path):
    """Parse the training log into ``{epoch: (train_loss, val_loss, miou)}``.

    ``miou`` is None for records without a fifth numeric column. Lines that
    do not look like a record are skipped: the header, blank lines, and any
    continuation lines (the per-class IoU string may span several lines,
    e.g. if it is a NumPy array -- not confirmed from this block). When an
    epoch appears more than once, the latest record wins.
    """
    history = {}
    with open(log_path, 'r') as log_file:
        for line in log_file:
            fields = line.split()
            try:
                epoch = int(fields[0])
                train_loss = float(fields[2])
                val_loss = float(fields[3])
            except (IndexError, ValueError):
                continue
            try:
                miou = float(fields[4])
            except (IndexError, ValueError):
                miou = None
            history[epoch] = (train_loss, val_loss, miou)
    return history


def _plot_history(history, savedir):
    """Write ``loss.png`` and ``mIou.png`` into ``savedir`` from ``history``."""
    epochs = sorted(history)

    fig1, ax1 = plt.subplots(figsize=(11, 8))
    ax1.plot(epochs, [history[e][0] for e in epochs], label='Train_loss')
    ax1.plot(epochs, [history[e][1] for e in epochs], label='Val_loss')
    ax1.set_title("Average training loss vs epochs")
    ax1.set_xlabel("Epochs")
    ax1.set_ylabel("Current loss")
    ax1.legend()
    fig1.savefig(savedir + "loss.png")
    plt.close(fig1)  # one figure per epoch would otherwise accumulate

    miou_epochs = [e for e in epochs if history[e][2] is not None]
    fig2, ax2 = plt.subplots(figsize=(11, 8))
    ax2.plot(miou_epochs, [history[e][2] for e in miou_epochs],
             label="Val IoU")
    ax2.set_title("Average IoU vs epochs")
    ax2.set_xlabel("Epochs")
    ax2.set_ylabel("Current IoU")
    ax2.legend()
    fig2.savefig(savedir + "mIou.png")
    plt.close(fig2)


def train_model(args):
    """
    Build the model, data loaders, loss and optimizer, then run the
    train/validate loop with periodic checkpoints, loss/mIoU plots and
    early stopping on the validation loss.

    args:
       args: global arguments. Besides being read, it is updated in place:
             per_iter, max_iter, gpu_nums and savedir are (re)assigned here.
    """
    h, w = map(int, args.input_size.split(','))
    input_size = (h, w)
    print("input size:{}".format(input_size))

    print(args)

    if args.cuda:
        print("use gpu id: '{}'".format(args.gpus))
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
        if not torch.cuda.is_available():
            raise Exception(
                "No GPU found or Wrong gpu id, please run without --cuda")

    # set the seed
    setup_seed(GLOBAL_SEED)
    print("set Global Seed: ", GLOBAL_SEED)
    cudnn.enabled = True
    print("building network")

    # build the model and initialization
    model = build_model(args.model, num_classes=args.classes)
    init_weight(model,
                nn.init.kaiming_normal_,
                nn.BatchNorm2d,
                1e-3,
                0.1,
                mode='fan_in')

    print("computing network parameters and FLOPs")
    total_parameters = netParams(model)
    print("the number of parameters: %d ==> %.2f M" %
          (total_parameters, (total_parameters / 1e6)))

    # load data and data augmentation
    datas, trainLoader, valLoader = build_dataset_train(
        args.dataset, input_size, args.batch_size, args.train_type,
        args.random_scale, args.random_mirror, args.num_workers)

    args.per_iter = len(trainLoader)
    args.max_iter = args.max_epochs * args.per_iter

    print('Dataset statistics')
    print("data['classWeights']: ", datas['classWeights'])
    print('mean and std: ', datas['mean'], datas['std'])

    # define loss function, respectively
    weight = torch.from_numpy(datas['classWeights'])
    criteria = _select_criterion(args, weight, input_size)

    if args.cuda:
        criteria = criteria.cuda()
        if torch.cuda.device_count() > 1:
            print("torch.cuda.device_count()=", torch.cuda.device_count())
            args.gpu_nums = torch.cuda.device_count()
            model = nn.DataParallel(model).cuda()  # multi-card data parallel
        else:
            args.gpu_nums = 1
            print("single GPU for training")
            model = model.cuda()  # 1-card data parallel
    elif not hasattr(args, 'gpu_nums'):
        # CPU run: the save directory name below still needs a GPU count.
        args.gpu_nums = 0

    args.savedir = (args.savedir + args.dataset + '/' + args.model + 'bs' +
                    str(args.batch_size) + 'gpu' + str(args.gpu_nums) + "_" +
                    str(args.train_type) + '/')

    os.makedirs(args.savedir, exist_ok=True)

    with open(args.savedir + 'args.txt', 'w') as f:
        f.write('mean:{}\nstd:{}\n'.format(datas['mean'], datas['std']))
        f.write("Parameters: {} Seed: {}\n".format(str(total_parameters),
                                                   GLOBAL_SEED))
        f.write(str(args))

    start_epoch = 0
    # continue training
    if args.resume:
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume)
            start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['model'])
            print("loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("no checkpoint found at '{}'".format(args.resume))

    model.train()
    cudnn.benchmark = True

    # initialize the early_stopping object
    early_stopping = EarlyStopping(patience=50)

    # define optimization strategy (before opening the log, so an invalid
    # optimizer name fails without touching the log file)
    optimizer = _select_optimizer(args, model)

    logFileLoc = args.savedir + args.logFile
    is_new_log = not os.path.isfile(logFileLoc)

    # The context manager closes the log even when training raises.
    with open(logFileLoc, 'a') as logger:
        if is_new_log:
            logger.write("%s\t%s\t\t%s\t%s\t%s" %
                         ('Epoch', '   lr', 'Loss(Tr)', 'Loss(Val)',
                          'mIOU(Val)'))
        logger.flush()

        print('>>>>>>>>>>>beginning training>>>>>>>>>>>')
        for epoch in range(start_epoch, args.max_epochs):
            # training
            lossTr, lr = train(args, trainLoader, model, criteria, optimizer,
                               epoch)

            # validation: val() is unpacked as (loss, mIoU, per-class IoU) on
            # mIoU epochs and used as a bare loss value otherwise
            if epoch % args.val_miou_epochs == 0:
                val_loss, mIOU_val, per_class_iu = val(args, valLoader,
                                                       criteria, model, epoch)
                # record train information
                logger.write(
                    "\n%d\t%.6f\t%.4f\t\t%.4f\t%0.4f\t %s" %
                    (epoch, lr, lossTr, val_loss, mIOU_val,
                     str(per_class_iu)))
                logger.flush()
                print(
                    "Epoch  %d\tlr= %.6f\tTrain Loss = %.4f\tVal Loss = %.4f\tmIOU(val) = %.4f\tper_class_iu= %s\n"
                    % (epoch, lr, lossTr, val_loss, mIOU_val,
                       str(per_class_iu)))
            else:
                # record train information
                val_loss = val(args, valLoader, criteria, model, epoch)
                logger.write("\n%d\t%.6f\t%.4f\t\t%.4f" %
                             (epoch, lr, lossTr, val_loss))
                logger.flush()
                print(
                    "Epoch  %d\tlr= %.6f\tTrain Loss = %.4f\tVal Loss = %.4f\n"
                    % (epoch, lr, lossTr, val_loss))

            # save the model: every 10th epoch and each of the last 10 epochs
            model_file_name = os.path.join(args.savedir,
                                           'model_' + str(epoch) + '.pth')
            state = {"epoch": epoch, "model": model.state_dict()}
            if epoch >= args.max_epochs - 10 or epoch % 10 == 0:
                torch.save(state, model_file_name)

            # draw plots for visualization; the flushed log is the single
            # source of truth so that resumed runs keep their full history
            _plot_history(_load_history(logFileLoc), args.savedir)

            early_stopping.monitor(monitor=val_loss)
            if early_stopping.early_stop:
                print("Early stopping and Save checkpoint")
                if not os.path.exists(model_file_name):
                    torch.save(state, model_file_name)
                break