Example #1
0
def main(cfgs):
    """Distributed training entry point driven by a config dict.

    ``cfgs`` is expected to carry sub-configs for logger, network, criterion,
    optimizer, scheduler, dataset, transforms, sampler and loader, plus the
    scalars ``local_rank``, ``log_dir``, ``pth_dir`` and ``frequency``.
    Launched once per process (rank) under an ``env://`` NCCL setup.
    """
    Logger.init(**cfgs['logger'])

    local_rank = cfgs['local_rank']
    world_size = int(os.environ['WORLD_SIZE'])
    Log.info('rank: {}, world_size: {}'.format(local_rank, world_size))

    log_dir = cfgs['log_dir']
    pth_dir = cfgs['pth_dir']
    # Only rank 0 creates output dirs, avoiding a multi-process mkdir race.
    if local_rank == 0:
        assure_dir(log_dir)
        assure_dir(pth_dir)

    # FIX: bind this process to its own GPU and join the process group
    # *before* any .cuda() call — previously set_device/init_process_group
    # ran after network/criterion were moved to CUDA, so every rank
    # allocated its model on device 0.
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')

    cudnn.benchmark = True
    torch.manual_seed(666)
    torch.cuda.manual_seed(666)

    aux_config = cfgs.get('auxiliary', None)
    network = ModuleBuilder(cfgs['network'], aux_config).cuda()
    criterion = build_criterion(cfgs['criterion'], aux_config).cuda()
    optimizer = optim.SGD(network.parameters(), **cfgs['optimizer'])
    scheduler = PolyLRScheduler(optimizer, **cfgs['scheduler'])

    dataset = build_dataset(**cfgs['dataset'], **cfgs['transforms'])
    sampler = DistributedSampler4Iter(dataset,
                                      world_size=world_size,
                                      rank=local_rank,
                                      **cfgs['sampler'])
    train_loader = DataLoader(dataset, sampler=sampler, **cfgs['loader'])

    # FIX: convert BatchNorm -> SyncBN *before* wrapping in DDP. The apex
    # utility must see the raw model; converting the DDP wrapper afterwards
    # replaces modules that DDP has not registered for gradient bucketing.
    network = apex.parallel.convert_syncbn_model(network)
    model = DistributedDataParallel(network)

    torch.cuda.empty_cache()
    train(local_rank, world_size, pth_dir, cfgs['frequency'], criterion,
          train_loader, model, optimizer, scheduler)
Example #2
0
def main():
    """Distributed Cityscapes training loop configured via the global ``args``.

    Builds the model (optionally restoring a pretrained backbone), joins an
    ``env://`` NCCL process group, trains for one pass over ``trainloader``
    and checkpoints from rank 0.
    """
    # make save dir (rank 0 only, to avoid a multi-process mkdir race)
    if args.local_rank == 0:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
    # launch the logger
    Log.init(
        log_level=args.log_level,
        log_file=osp.join(args.save_dir, args.log_file),
        log_format=args.log_format,
        rewrite=args.rewrite,
        stdout_level=args.stdout_level
    )
    # RGB or BGR input (RGB stats for ImageNet-pretrained models; BGR
    # mean-only stats for caffe-pretrained models).
    if args.rgb:
        IMG_MEAN = np.array((0.485, 0.456, 0.406), dtype=np.float32)
        IMG_VARS = np.array((0.229, 0.224, 0.225), dtype=np.float32)
    else:
        IMG_MEAN = np.array((104.00698793, 116.66876762, 122.67891434), dtype=np.float32)
        IMG_VARS = np.array((1, 1, 1), dtype=np.float32)

    # set models
    import libs.models as models
    deeplab = models.__dict__[args.arch](num_classes=args.num_classes, data_set=args.data_set)
    if args.restore_from is not None:
        saved_state_dict = torch.load(args.restore_from, map_location=torch.device('cpu'))
        new_params = deeplab.state_dict().copy()
        # Copy everything except the classifier head ('fc.*') so the head
        # can be re-initialized for the current number of classes.
        for i in saved_state_dict:
            i_parts = i.split('.')
            if not i_parts[0] == 'fc':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
        Log.info("load pretrained models")  # FIX: typo "pretrined"
        if deeplab.backbone is not None:
            deeplab.backbone.load_state_dict(new_params, strict=False)
        else:
            deeplab.load_state_dict(new_params, strict=False)
    else:
        Log.info("train from scratch")  # FIX: typo "stracth"

    args.world_size = 1

    if 'WORLD_SIZE' in os.environ and args.apex:
        args.apex = int(os.environ['WORLD_SIZE']) > 1
        args.world_size = int(os.environ['WORLD_SIZE'])
        print("Total world size: ", int(os.environ['WORLD_SIZE']))

    # FIX: identity comparison for None ("is not None"), not "not x == None".
    # NOTE(review): setting CUDA_VISIBLE_DEVICES this late only works if no
    # CUDA context exists yet — confirm torch has not touched the GPU before
    # this point.
    if args.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    h, w = args.input_size, args.input_size
    input_size = (h, w)

    # Set the device according to local_rank before any .cuda() call.
    torch.cuda.set_device(args.local_rank)
    Log.info("Local Rank: {}".format(args.local_rank))
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')
    # set optimizer
    optimizer = optim.SGD(
        [{'params': filter(lambda p: p.requires_grad, deeplab.parameters()), 'lr': args.learning_rate}],
        lr=args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay)
    optimizer.zero_grad()

    # set on cuda
    deeplab.cuda()

    # models transformation
    # FIX: convert BatchNorm -> SyncBN *before* wrapping in DDP; converting
    # the DDP wrapper afterwards replaces modules DDP has not registered.
    deeplab_sync = apex.parallel.convert_syncbn_model(deeplab)
    model = DistributedDataParallel(deeplab_sync)
    model.train()
    model.float()
    model.cuda()

    # set loss function
    if args.ohem:
        criterion = CriterionOhemDSN(thresh=args.ohem_thres, min_kept=args.ohem_keep)  # OHEM CrossEntrop
        if "ic" in args.arch:
            criterion = CriterionICNet(thresh=args.ohem_thres, min_kept=args.ohem_keep)
        if "dfa" in args.arch:
            criterion = CriterionDFANet(thresh=args.ohem_thres, min_kept=args.ohem_keep)
    else:
        criterion = CriterionDSN()  # CrossEntropy
    criterion.cuda()

    cudnn.benchmark = True

    if args.world_size == 1:
        print(model)

    # this is a little different from mul-gpu training setting in distributed
    # training because each trainloader is a process that samples from the
    # dataset class.
    batch_size = args.gpu_num * args.batch_size_per_gpu
    max_iters = args.num_steps * batch_size / args.gpu_num
    # set data loader
    data_set = Cityscapes(args.data_dir, args.data_list, max_iters=max_iters, crop_size=input_size,
                  scale=args.random_scale, mirror=args.random_mirror, mean=IMG_MEAN,vars=IMG_VARS, RGB= args.rgb)

    # NOTE(review): shuffle=True with no DistributedSampler means every rank
    # iterates the full dataset independently — consider a DistributedSampler
    # if per-rank sharding is intended. Kept as-is to preserve behavior.
    trainloader = data.DataLoader(
        data_set,
        batch_size=args.batch_size_per_gpu, shuffle=True, num_workers=args.num_workers, pin_memory=True)

    print("trainloader", len(trainloader))

    torch.cuda.empty_cache()

    # start training:
    for i_iter, batch in enumerate(trainloader):
        images, labels = batch
        images = images.cuda()
        labels = labels.long().cuda()
        optimizer.zero_grad()
        lr = adjust_learning_rate(optimizer, args, i_iter, len(trainloader))
        preds = model(images)

        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()
        # Average the loss across ranks so rank-0 logging reflects all GPUs.
        reduce_loss = all_reduce_tensor(loss,
                                        world_size=args.gpu_num)
        if args.local_rank == 0:
            Log.info('iter = {} of {} completed, lr={}, loss = {}'.format(i_iter,
                                                                      len(trainloader), lr, reduce_loss.data.cpu().numpy()))
            if i_iter % args.save_pred_every == 0 and i_iter > args.save_start:
                print('save models ...')
                # Save the raw (unwrapped) model so keys have no DDP prefix.
                torch.save(deeplab.state_dict(), osp.join(args.save_dir, str(args.arch) + str(i_iter) + '.pth'))

    end = timeit.default_timer()

    if args.local_rank == 0:
        # NOTE(review): `start` is not defined in this function — presumably a
        # module-level timestamp set at script startup; verify it exists.
        Log.info("Training cost: " + str(end - start) + 'seconds')
        Log.info("Save final models")
        torch.save(deeplab.state_dict(), osp.join(args.save_dir, str(args.arch) + '_final' + '.pth'))
Example #3
0
def main():
    """Single-GPU epoch-based training loop configured via the global ``args``.

    Builds the model (optionally restoring pretrained weights), trains for
    ``args.num_steps`` epochs over VOC or Cityscapes, logs losses to
    TensorBoard, and checkpoints periodically and at the end.
    """
    # make save dir (rank 0 only)
    if args.local_rank == 0:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

    # for tensorboard logs
    tb_path = osp.join(args.save_dir, "runs")
    writer = SummaryWriter(tb_path)

    # launch the logger
    Log.init(log_level=args.log_level,
             log_file=osp.join(args.save_dir, args.log_file),
             log_format=args.log_format,
             rewrite=args.rewrite,
             stdout_level=args.stdout_level)
    # RGB or BGR input (RGB stats for ImageNet-pretrained models; BGR
    # mean-only stats for caffe-pretrained models).
    if args.rgb:
        IMG_MEAN = np.array((0.485, 0.456, 0.406), dtype=np.float32)
        IMG_VARS = np.array((0.229, 0.224, 0.225), dtype=np.float32)
    else:
        IMG_MEAN = np.array((104.00698793, 116.66876762, 122.67891434),
                            dtype=np.float32)
        IMG_VARS = np.array((1, 1, 1), dtype=np.float32)

    # set models
    import libs.models as models
    deeplab = models.__dict__[args.arch](num_classes=args.num_classes)
    if args.restore_from is not None:
        print("LOADING FROM PRETRAINED MODEL")
        saved_state_dict = torch.load(args.restore_from,
                                      map_location=torch.device('cpu'))
        new_params = deeplab.state_dict().copy()
        # Copy everything except the classifier head ('fc.*') so the head
        # can be re-initialized for the current number of classes.
        for i in saved_state_dict:
            i_parts = i.split('.')
            if not i_parts[0] == 'fc':
                new_params['.'.join(i_parts[0:])] = saved_state_dict[i]
        Log.info("load pretrained models")
        deeplab.load_state_dict(new_params, strict=False)
    else:
        Log.info("train from scratch")

    args.world_size = 1

    if 'WORLD_SIZE' in os.environ and args.apex:
        args.apex = int(os.environ['WORLD_SIZE']) > 1
        args.world_size = int(os.environ['WORLD_SIZE'])
        print("Total world size: ", int(os.environ['WORLD_SIZE']))

    # FIX: identity comparison for None ("is not None"), not "not x == None".
    if args.gpu is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    h, w = args.input_size, args.input_size
    input_size = (h, w)

    # Distributed setup intentionally disabled in this single-GPU variant:
    #    torch.cuda.set_device(args.local_rank)
    #    Log.info("Local Rank: {}".format(args.local_rank))
    #    torch.distributed.init_process_group(backend='nccl',
    #                                         init_method='env://')
    # set optimizer
    optimizer = optim.SGD(
        [{
            'params': filter(lambda p: p.requires_grad, deeplab.parameters()),
            'lr': args.learning_rate
        }],
        lr=args.learning_rate,
        momentum=args.momentum,
        weight_decay=args.weight_decay)
    optimizer.zero_grad()
    deeplab.cuda()
    # models transformation (DDP/SyncBN disabled in this variant)
    model = deeplab
    model.train()
    model.float()
    model.cuda()

    # set loss function
    if args.ohem:
        criterion = CriterionOhemDSN(
            thresh=args.ohem_thres,
            min_kept=args.ohem_keep)  # OHEM CrossEntrop
    else:
        criterion = CriterionDSN()  # CrossEntropy
    criterion.cuda()

    cudnn.benchmark = True

    # this is a little different from mul-gpu training setting in distributed
    # training because each trainloader is a process that samples from the
    # dataset class.
    batch_size = args.batch_size_per_gpu
    max_iters = args.num_steps * batch_size
    # set data loader

    # PASCAL-VOC augmentations -----------------
    from torchvision import transforms
    augs = transforms.Compose([
        transforms.RandomResizedCrop(300),
        transforms.RandomRotation(20),
        transforms.ToTensor(),
        transforms.Normalize([0.4589, 0.4355, 0.4032],
                             [0.2239, 0.2186, 0.2206])
    ])
    if args.data_set == 'pascalvoc':
        data_set = VOCSegmentation(args.data_dir,
                                   image_set='val',
                                   scale=args.random_scale,
                                   mean=IMG_MEAN,
                                   vars=IMG_VARS,
                                   transforms=augs)
    elif args.data_set == 'cityscapes':
        data_set = Cityscapes(args.data_dir,
                              args.data_list,
                              crop_size=input_size,
                              scale=args.random_scale,
                              mirror=args.random_mirror,
                              mean=IMG_MEAN,
                              vars=IMG_VARS,
                              RGB=args.rgb)
    else:
        # FIX: previously an unknown dataset left `data_set` unbound and
        # crashed later with an unrelated NameError; fail fast instead.
        raise ValueError("unsupported data_set: {}".format(args.data_set))

    trainloader = data.DataLoader(data_set,
                                  batch_size=args.batch_size_per_gpu,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  pin_memory=True)

    print("trainloader", len(trainloader))

    torch.cuda.empty_cache()

    # start training:
    iter_no = 0
    for epoch in range(args.num_steps):
        print("epoch " + str(epoch + 1))
        total_loss = 0

        for i_iter, batch in enumerate(trainloader):
            if i_iter % 100 == 0:
                print("iteration " + str(i_iter + 1))
            images, labels = batch
            images = images.cuda()
            labels = labels.long().cuda()

            optimizer.zero_grad()
            lr = adjust_learning_rate(optimizer, args, i_iter,
                                      len(trainloader))
            preds = model(images)

            loss = criterion(preds, labels)
            total_loss += loss.item()
            writer.add_scalar("Loss_vs_Iteration", loss.item(), iter_no)
            iter_no += 1
            loss.backward()
            optimizer.step()

        writer.add_scalar("Loss_vs_Epoch", total_loss / len(trainloader),
                          epoch)

        # Checkpoint every 9th epoch (includes epoch 0).
        if args.local_rank == 0:
            if epoch % 9 == 0:
                print('save models ...')
                torch.save(
                    deeplab.state_dict(),
                    osp.join(args.save_dir,
                             str(args.arch) + str(i_iter) + '.pth'))

    writer.close()

    end = timeit.default_timer()

    if args.local_rank == 0:
        # NOTE(review): `start` is not defined in this function — presumably a
        # module-level timestamp set at script startup; verify it exists.
        Log.info("Training cost: " + str(end - start) + 'seconds')
        Log.info("Save final models")
        torch.save(
            deeplab.state_dict(),
            osp.join(
                args.save_dir,
                str(args.arch) + '_' + str(args.num_steps) + 'epoch_' +
                str(args.batch_size_per_gpu) + '.pth'))