Beispiel #1
0
def main():
    global best_top1, best_top5

    args.world_size = 1

    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    crop_size = 224
    val_size = 256

    pipe = HybridTrainPipe(batch_size=args.train_batch,
                           num_threads=args.workers,
                           device_id=args.local_rank,
                           data_dir=traindir,
                           crop=crop_size,
                           dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    pipe = HybridValPipe(batch_size=args.test_batch,
                         num_threads=args.workers,
                         device_id=args.local_rank,
                         data_dir=valdir,
                         crop=crop_size,
                         size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            baseWidth=args.base_width,
            cardinality=args.cardinality,
        )
    else:
        model = models.__dict__[args.arch]()

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adamw':
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay,
                          warmup=0)
    elif args.optimizer.lower() == 'radam':
        optimizer = RAdam(model.parameters(),
                          lr=args.lr,
                          betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lsadam':
        optimizer = LSAdamW(model.parameters(),
                            lr=args.lr * ((1. + 4. * args.sigma)**(0.25)),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay,
                            sigma=args.sigma)
    elif args.optimizer.lower() == 'lsradam':
        sigma = 0.1
        optimizer = LSRAdam(model.parameters(),
                            lr=args.lr * ((1. + 4. * args.sigma)**(0.25)),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay,
                            sigma=args.sigma)
    elif args.optimizer.lower() == 'srsgd':
        iter_count = 1
        optimizer = SGD_Adaptive(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay,
                                 iter_count=iter_count,
                                 restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradam':
        iter_count = 1
        optimizer = SRNAdam(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradamw':
        iter_count = 1
        optimizer = SRAdamW(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            warmup=0,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'srradam':
        #NOTE: need to double-check this
        iter_count = 1
        optimizer = SRRAdam(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            warmup=0,
                            restarting_iter=args.restart_schedule[0])

    schedule_index = 1
    # Resume
    title = 'ImageNet-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        # args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_top1 = checkpoint['best_top1']
        best_top5 = checkpoint['best_top5']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            iter_count = optimizer.param_groups[0]['iter_count']
        schedule_index = checkpoint['schedule_index']
        state['lr'] = optimizer.param_groups[0]['lr']
        if args.checkpoint == args.resume:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title,
                            resume=True)
        else:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                            title=title)
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Top1',
                'Valid Top1', 'Train Top5', 'Valid Top5'
            ])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Top1',
            'Valid Top1', 'Train Top5', 'Valid Top5'
        ])

    logger.file.write('    Total params: %.2fM' %
                      (sum(p.numel() for p in model.parameters()) / 1000000.0))

    if args.evaluate:
        logger.file.write('\nEvaluation only')
        test_loss, test_top1, test_top5 = test(val_loader, model, criterion,
                                               start_epoch, use_cuda, logger)
        logger.file.write(
            ' Test Loss:  %.8f, Test Top1:  %.2f, Test Top5: %.2f' %
            (test_loss, test_top1, test_top5))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        if args.optimizer.lower() == 'srsgd':
            if epoch in args.schedule:
                optimizer = SGD_Adaptive(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    weight_decay=args.weight_decay,
                    iter_count=iter_count,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        elif args.optimizer.lower() == 'sradam':
            if epoch in args.schedule:
                optimizer = SRNAdam(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    betas=(args.beta1, args.beta2),
                    iter_count=iter_count,
                    weight_decay=args.weight_decay,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        elif args.optimizer.lower() == 'sradamw':
            if epoch in args.schedule:
                optimizer = SRAdamW(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    betas=(args.beta1, args.beta2),
                    iter_count=iter_count,
                    weight_decay=args.weight_decay,
                    warmup=0,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        elif args.optimizer.lower() == 'srradam':
            if epoch in args.schedule:
                optimizer = SRRAdam(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    betas=(args.beta1, args.beta2),
                    iter_count=iter_count,
                    weight_decay=args.weight_decay,
                    warmup=0,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        else:
            adjust_learning_rate(optimizer, epoch)

        logger.file.write('\nEpoch: [%d | %d] LR: %f' %
                          (epoch + 1, args.epochs, state['lr']))

        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            train_loss, train_top1, train_top5, iter_count = train(
                train_loader, model, criterion, optimizer, epoch, use_cuda,
                logger)
        else:
            train_loss, train_top1, train_top5 = train(train_loader, model,
                                                       criterion, optimizer,
                                                       epoch, use_cuda, logger)

        test_loss, test_top1, test_top5 = test(val_loader, model, criterion,
                                               epoch, use_cuda, logger)

        # append logger file
        logger.append([
            state['lr'], train_loss, test_loss, train_top1, test_top1,
            train_top5, test_top5
        ])

        writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch)
        writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch)
        writer.add_scalars('train_top1', {args.model_name: train_top1}, epoch)
        writer.add_scalars('test_top1', {args.model_name: test_top1}, epoch)
        writer.add_scalars('train_top5', {args.model_name: train_top5}, epoch)
        writer.add_scalars('test_top5', {args.model_name: test_top5}, epoch)

        # save model
        is_best = test_top1 > best_top1
        best_top1 = max(test_top1, best_top1)
        best_top5 = max(test_top5, best_top5)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'schedule_index': schedule_index,
                'state_dict': model.state_dict(),
                'top1': test_top1,
                'top5': test_top5,
                'best_top1': best_top1,
                'best_top5': best_top5,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            epoch,
            checkpoint=args.checkpoint)

        # reset DALI iterators
        train_loader.reset()
        val_loader.reset()

    logger.file.write('Best top1: %f' % best_top1)
    logger.file.write('Best top5: %f' % best_top5)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best top1: %f' % best_top1)
    print('Best top5: %f' % best_top5)

    with open("./all_results_imagenet.txt", "a") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write("%s\n" % args.checkpoint)
        f.write("best_top1 %f, best_top5 %f\n\n" % (best_top1, best_top5))
        fcntl.flock(f, fcntl.LOCK_UN)
def main():
    global best_ppl
    
    print('loading dataset')
    dataset = pickle.load(open(args.dataset_folder + 'test.pk', 'rb'))
    w_map, test_data, range_idx = dataset['w_map'], dataset['test_data'], dataset['range']

    cut_off = args.cut_off + [len(w_map) + 1]

    train_loader = LargeDataset(args.dataset_folder, range_idx, args.batch_size, args.sequence_length)
    test_loader = EvalDataset(test_data, args.batch_size)

    print('building model')

    rnn_map = {'Basic': BasicRNN, 'DDNet': DDRNN, 'DenseNet': DenseRNN, 'LDNet': functools.partial(LDRNN, layer_drop = args.layer_drop)}
    rnn_layer = rnn_map[args.rnn_layer](args.layer_num, args.rnn_unit, args.word_dim, args.hid_dim, args.droprate)

    if args.label_dim > 0:
        soft_max = AdaptiveSoftmax(args.label_dim, cut_off)
    else:
        soft_max = AdaptiveSoftmax(rnn_layer.output_dim, cut_off)

    lm_model = LM(rnn_layer, soft_max, len(w_map), args.word_dim, args.droprate, label_dim = args.label_dim, add_relu=args.add_relu)
    lm_model.rand_ini()
    # lm_model.cuda()
    
    # set up optimizers
    optim_map = {'Adam' : optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta, 'SGD': functools.partial(optim.SGD, momentum=0.9), 'LSRAdam':LSRAdam, 'LSAdam': LSAdam, 'AdamW': AdamW, 'RAdam': RAdam, 'SRAdamW': SRAdamW, 'SRRAdam': SRRAdam}
    if args.update.lower() == 'lsradam' or args.update.lower == 'lsadam':
            optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr*((1.+4.*args.sigma)**(0.25)), 
                           betas=(args.beta1, args.beta2),
                           weight_decay=args.weight_decay, 
                           sigma=args.sigma) 
    elif args.update.lower() == 'radam':
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay)
    elif args.update.lower() == 'adamw':
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay, warmup=args.warmup)
    elif args.update.lower() == 'sradamw':
        iter_count = 1
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = args.warmup, restarting_iter=args.restart_schedule[0]) 
    elif args.update.lower() == 'srradam':
        #NOTE: need to double-check this
        iter_count = 1
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = args.warmup, restarting_iter=args.restart_schedule[0]) 
    else:
        if args.lr > 0:
            optimizer=optim_map[args.update](lm_model.parameters(), lr=args.lr)
        else:
            optimizer=optim_map[args.update](lm_model.parameters())
            
    # Resume
    title = 'onebillionword-' + args.rnn_layer
    logger = Logger(os.path.join(args.checkpath, 'log.txt'), title=title)
    logger.set_names(['Learning Rate', 'Train Loss', 'Train PPL', 'Valid PPL'])
    
    if args.load_checkpoint:
        if os.path.isfile(args.load_checkpoint):
            print("loading checkpoint: '{}'".format(args.load_checkpoint))
            checkpoint_file = torch.load(args.load_checkpoint, map_location=lambda storage, loc: storage)
            lm_model.load_state_dict(checkpoint_file['lm_model'], False)
            optimizer.load_state_dict(checkpoint_file['opt'], False)
        else:
            print("no checkpoint found at: '{}'".format(args.load_checkpoint))

    test_lm = nn.NLLLoss()
    
    test_lm.cuda()
    lm_model.cuda()
    
    batch_index = 0
    epoch_loss = 0
    full_epoch_loss = 0
    best_train_ppl = float('inf')
    cur_lr = args.lr
    
    schedule_index = 1

    try:
        for indexs in range(args.epoch):

            print('#' * 89)
            print('Start: {}'.format(indexs))

            if args.optimizer.lower() == 'sradamw':
                if indexs in args.schedule:
                    optimizer = SRAdamW(lm_model.parameters(), lr=args.lr * (args.gamma**schedule_index), betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = 0, restarting_iter=args.restart_schedule[schedule_index])
                    schedule_index += 1

            elif args.optimizer.lower() == 'srradam':
                if indexs in args.schedule:
                    optimizer = SRRAdam(lm_model.parameters(), lr=args.lr * (args.gamma**schedule_index), betas=(args.beta1, args.beta2), iter_count=iter_count, weight_decay=args.weight_decay, warmup = 0, restarting_iter=args.restart_schedule[schedule_index])
                    schedule_index += 1
            
            else:
                adjust_learning_rate(optimizer, indexs)
                
            logger.file.write('\nEpoch: [%d | %d] LR: %f' % (indexs + 1, args.epoch, state['lr']))
            
            iterator = train_loader.get_tqdm()
            full_epoch_loss = 0

            lm_model.train()

            for word_t, label_t in iterator:

                if 1 == train_loader.cur_idx:
                    lm_model.init_hidden()

                label_t = label_t.view(-1)

                lm_model.zero_grad()
                loss = lm_model(word_t, label_t)
                
                loss.backward()
                torch.nn.utils.clip_grad_norm(lm_model.parameters(), args.clip)
                optimizer.step()
                
                if args.optimizer.lower() == 'sradamw' or args.optimizer.lower() == 'srradam'
                    iter_count, iter_total = optimizer.update_iter()

                batch_index += 1 

                if 0 == batch_index % args.interval:
                    s_loss = utils.to_scalar(loss)
                    writer.add_scalars('loss_tracking/train_loss', {args.model_name:s_loss}, batch_index)
                
                epoch_loss += utils.to_scalar(loss)
                full_epoch_loss += utils.to_scalar(loss)
                if 0 == batch_index % args.check_interval:
                    epoch_ppl = math.exp(epoch_loss / args.check_interval)
                    writer.add_scalars('loss_tracking/train_ppl', {args.model_name: epoch_ppl}, batch_index)
                    print('epoch_ppl: {} lr: {} @ batch_index: {}'.format(epoch_ppl, cur_lr, batch_index))
                    logger.file.write('epoch_ppl: {} lr: {} @ batch_index: {}'.format(epoch_ppl, cur_lr, batch_index))
                    epoch_loss = 0
    
            test_ppl = evaluate(test_loader, lm_model, test_lm, -1)
        
            is_best = test_ppl < best_ppl
            best_ppl = min(test_ppl, best_ppl)

            writer.add_scalars('loss_tracking/test_ppl', {args.model_name: test_ppl}, indexs)
            print('test_ppl: {} @ index: {}'.format(test_ppl, indexs))
            logger.file.write('test_ppl: {} @ index: {}'.format(test_ppl, indexs))
            
            save_checkpoint({
                'epoch': epoch + 1,
                'schedule_index': schedule_index,
                'lm_model': lm_model.state_dict(),
                'ppl': test_ppl,
                'best_ppl': best_ppl,
                'opt':optimizer.state_dict(),
            }, is_best, indexs, checkpoint=args.checkpath)

    except KeyboardInterrupt:

        print('Exiting from training early')
        logger.file.write('Exiting from training early')
        test_ppl = evaluate(test_loader, lm_model, test_lm, -1)
        writer.add_scalars('loss_tracking/test_ppl', {args.model_name: test_ppl}, args.epoch)
        
        is_best=False
        save_checkpoint({
                'epoch': epoch + 1,
                'schedule_index': schedule_index,
                'lm_model': lm_model.state_dict(),
                'ppl': test_ppl,
                'best_ppl': best_ppl,
                'opt':optimizer.state_dict(),
            }, is_best, indexs, checkpoint=args.checkpath)
    
    print('Best PPL:%f'%best_ppl)
    
    logger.file.write('Best PPL:%f'%best_ppl)  
    logger.close()
    
    with open("./all_results.txt", "a") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write("%s\n"%args.checkpath)
        f.write("best_ppl %f\n\n"%best_ppl)
        fcntl.flock(f, fcntl.LOCK_UN)
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root='./data',
                          train=True,
                          download=True,
                          transform=transform_train)
    trainloader = data.DataLoader(trainset,
                                  batch_size=args.train_batch,
                                  shuffle=True,
                                  num_workers=args.workers)

    testset = dataloader(root='./data',
                         train=False,
                         download=False,
                         transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=args.test_batch,
                                 shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format(args.arch))
    if args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            cardinality=args.cardinality,
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.startswith('densenet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            growthRate=args.growthRate,
            compressionRate=args.compressionRate,
            dropRate=args.drop,
        )
    elif args.arch.startswith('wrn'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            widen_factor=args.widen_factor,
            dropRate=args.drop,
        )
    elif args.arch.startswith('resnet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            block_name=args.block_name,
        )
    elif args.arch.startswith('preresnet'):
        model = models.__dict__[args.arch](
            num_classes=num_classes,
            depth=args.depth,
            block_name=args.block_name,
        )
    elif args.arch.startswith('horesnet'):
        model = models.__dict__[args.arch](num_classes=num_classes,
                                           depth=args.depth,
                                           eta=args.eta,
                                           block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('hopreresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes,
                                           depth=args.depth,
                                           eta=args.eta,
                                           block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('nagpreresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes,
                                           depth=args.depth,
                                           eta=args.eta,
                                           block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('mompreresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes,
                                           depth=args.depth,
                                           eta=args.eta,
                                           block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('v2_preresnet'):
        if args.depth == 18:
            block_name = 'basicblock'
            num_blocks = [2, 2, 2, 2]
        elif args.depth == 34:
            block_name = 'basicblock'
            num_blocks = [3, 4, 6, 3]
        elif args.depth == 50:
            block_name = 'bottleneck'
            num_blocks = [3, 4, 6, 3]
        elif args.depth == 101:
            block_name = 'bottleneck'
            num_blocks = [3, 4, 23, 3]
        elif args.depth == 152:
            block_name = 'bottleneck'
            num_blocks = [3, 8, 36, 3]

        model = models.__dict__[args.arch](block_name=block_name,
                                           num_blocks=num_blocks,
                                           num_classes=num_classes)
    else:
        print('Model is specified wrongly - Use standard model')
        model = models.__dict__[args.arch](num_classes=num_classes)

    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print('    Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))
    criterion = nn.CrossEntropyLoss()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    # elif args.optimizer.lower() == 'adam':
    #     optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2), weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        optimizer = RAdam(model.parameters(),
                          lr=args.lr,
                          betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adamw':
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay,
                          warmup=args.warmup)
    elif args.optimizer.lower() == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               betas=(args.beta1, args.beta2),
                               weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'srsgd':
        iter_count = 1
        optimizer = SGD_Adaptive(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay,
                                 iter_count=iter_count,
                                 restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradam':
        iter_count = 1
        optimizer = SRNAdam(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradamw':
        iter_count = 1
        optimizer = SRAdamW(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            warmup=args.warmup,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'srradam':
        #NOTE: need to double-check this
        iter_count = 1
        optimizer = SRRAdam(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            warmup=args.warmup,
                            restarting_iter=args.restart_schedule[0])

    # Resume
    title = 'cifar-10-' + args.arch
    logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
    logger.set_names([
        'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.'
    ])

    schedule_index = 1
    # Resume
    title = '%s-' % args.dataset + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        # args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            iter_count = optimizer.param_groups[0]['iter_count']
        # schedule_index = checkpoint['schedule_index']
        schedule_index = 3
        state['lr'] = optimizer.param_groups[0]['lr']
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.',
            'Valid Acc.'
        ])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch,
                                   use_cuda)
        print(' Test Loss:  %.8f, Test Acc:  %.2f' % (test_loss, test_acc))
        return

    # Train and val

    for epoch in range(start_epoch, args.epochs):
        if args.optimizer.lower() == 'srsgd':
            if epoch == 161:
                start_decay_restarting_iter = args.restart_schedule[
                    schedule_index] - 1
                current_lr = args.lr * (args.gamma**schedule_index)

            if epoch in args.schedule:
                current_lr = args.lr * (args.gamma**schedule_index)
                current_restarting_iter = args.restart_schedule[schedule_index]
                optimizer = SGD_Adaptive(
                    model.parameters(),
                    lr=current_lr,
                    weight_decay=args.weight_decay,
                    iter_count=iter_count,
                    restarting_iter=current_restarting_iter)
                schedule_index += 1

            if epoch >= 161:
                current_restarting_iter = start_decay_restarting_iter * (
                    args.epochs - epoch - 1) / (args.epochs - 162) + 1
                optimizer = SGD_Adaptive(
                    model.parameters(),
                    lr=current_lr,
                    weight_decay=args.weight_decay,
                    iter_count=iter_count,
                    restarting_iter=current_restarting_iter)

        else:
            adjust_learning_rate(optimizer, epoch)

        logger.file.write('\nEpoch: [%d | %d] LR: %f' %
                          (epoch + 1, args.epochs, state['lr']))

        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            train_loss, train_acc, iter_count = train(trainloader, model,
                                                      criterion, optimizer,
                                                      epoch, use_cuda, logger)
        else:
            train_loss, train_acc = train(trainloader, model, criterion,
                                          optimizer, epoch, use_cuda, logger)

        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   use_cuda, logger)

        # append logger file
        logger.append(
            [state['lr'], train_loss, test_loss, train_acc, test_acc])

        writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch)
        writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch)
        writer.add_scalars('train_acc', {args.model_name: train_acc}, epoch)
        writer.add_scalars('test_acc', {args.model_name: test_acc}, epoch)

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'schedule_index': schedule_index,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_acc': best_acc,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            epoch,
            checkpoint=args.checkpoint)

    logger.file.write('Best acc:%f' % best_acc)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)

    with open("./all_results.txt", "a") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write("%s\n" % args.checkpoint)
        f.write("best_acc %f\n\n" % best_acc)
        fcntl.flock(f, fcntl.LOCK_UN)
Beispiel #4
0
def main():
    global best_top1, best_top5

    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    train_data = imagenet_lmdb_dataset(traindir, transform=train_transform)
    valid_data = imagenet_lmdb_dataset(validdir, transform=val_transform)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.train_batch,
                                               shuffle=(train_sampler is None),
                                               pin_memory=True,
                                               num_workers=8,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(valid_data,
                                             batch_size=args.test_batch,
                                             shuffle=False,
                                             pin_memory=True,
                                             num_workers=8)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](
            baseWidth=args.base_width,
            cardinality=args.cardinality,
        )
    elif args.arch == 'densenet264':
        model = DenseNet(growth_rate=32,
                         block_config=(6, 12, 64, 48),
                         num_init_features=64,
                         bn_size=4,
                         drop_rate=0,
                         num_classes=1000,
                         memory_efficient=False)
    elif args.arch == 'resnet200':
        model = ResNet(block=Bottleneck,
                       layers=[3, 24, 36, 3],
                       num_classes=1000,
                       zero_init_residual=False,
                       groups=1,
                       width_per_group=64,
                       replace_stride_with_dilation=None,
                       norm_layer=None)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = DDP(model.features)
        model.cuda()
    else:
        model = model.cuda()
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adamw':
        optimizer = AdamW(model.parameters(),
                          lr=args.lr,
                          betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay,
                          warmup=0)
    elif args.optimizer.lower() == 'radam':
        optimizer = RAdam(model.parameters(),
                          lr=args.lr,
                          betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lsadam':
        optimizer = LSAdamW(model.parameters(),
                            lr=args.lr * ((1. + 4. * args.sigma)**(0.25)),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay,
                            sigma=args.sigma)
    elif args.optimizer.lower() == 'lsradam':
        sigma = 0.1
        optimizer = LSRAdam(model.parameters(),
                            lr=args.lr * ((1. + 4. * args.sigma)**(0.25)),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay,
                            sigma=args.sigma)
    elif args.optimizer.lower() == 'srsgd':
        iter_count = 1
        optimizer = SGD_Adaptive(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay,
                                 iter_count=iter_count,
                                 restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradam':
        iter_count = 1
        optimizer = SRNAdam(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradamw':
        iter_count = 1
        optimizer = SRAdamW(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            warmup=0,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'srradam':
        #NOTE: need to double-check this
        iter_count = 1
        optimizer = SRRAdam(model.parameters(),
                            lr=args.lr,
                            betas=(args.beta1, args.beta2),
                            iter_count=iter_count,
                            weight_decay=args.weight_decay,
                            warmup=0,
                            restarting_iter=args.restart_schedule[0])

    schedule_index = 1
    # Resume
    title = 'ImageNet-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        # args.checkpoint = os.path.dirname(args.resume)
        # checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.local_rank))
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        best_top1 = checkpoint['best_top1']
        best_top5 = checkpoint['best_top5']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            iter_count = optimizer.param_groups[0]['iter_count']
        schedule_index = checkpoint['schedule_index']
        state['lr'] = optimizer.param_groups[0]['lr']
        if args.checkpoint == args.resume:
            logger = LoggerDistributed(os.path.join(args.checkpoint,
                                                    'log.txt'),
                                       rank=args.local_rank,
                                       title=title,
                                       resume=True)
        else:
            logger = LoggerDistributed(os.path.join(args.checkpoint,
                                                    'log.txt'),
                                       rank=args.local_rank,
                                       title=title)
            if args.local_rank == 0:
                logger.set_names([
                    'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Top1',
                    'Valid Top1', 'Train Top5', 'Valid Top5'
                ])
    else:
        logger = LoggerDistributed(os.path.join(args.checkpoint, 'log.txt'),
                                   rank=args.local_rank,
                                   title=title)
        if args.local_rank == 0:
            logger.set_names([
                'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Top1',
                'Valid Top1', 'Train Top5', 'Valid Top5'
            ])

    if args.local_rank == 0:
        logger.file.write('    Total params: %.2fM' %
                          (sum(p.numel()
                               for p in model.parameters()) / 1000000.0))

    if args.evaluate:
        if args.local_rank == 0:
            logger.file.write('\nEvaluation only')
        test_loss, test_top1, test_top5 = test(val_loader, model, criterion,
                                               start_epoch, use_cuda, logger)
        if args.local_rank == 0:
            logger.file.write(
                ' Test Loss:  %.8f, Test Top1:  %.2f, Test Top5: %.2f' %
                (test_loss, test_top1, test_top5))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        # Shuffle the sampler.
        train_loader.sampler.set_epoch(epoch + args.manualSeed)

        if args.optimizer.lower() == 'srsgd':
            if epoch in args.schedule:
                optimizer = SGD_Adaptive(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    weight_decay=args.weight_decay,
                    iter_count=iter_count,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        elif args.optimizer.lower() == 'sradam':
            if epoch in args.schedule:
                optimizer = SRNAdam(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    betas=(args.beta1, args.beta2),
                    iter_count=iter_count,
                    weight_decay=args.weight_decay,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        elif args.optimizer.lower() == 'sradamw':
            if epoch in args.schedule:
                optimizer = SRAdamW(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    betas=(args.beta1, args.beta2),
                    iter_count=iter_count,
                    weight_decay=args.weight_decay,
                    warmup=0,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        elif args.optimizer.lower() == 'srradam':
            if epoch in args.schedule:
                optimizer = SRRAdam(
                    model.parameters(),
                    lr=args.lr * (args.gamma**schedule_index),
                    betas=(args.beta1, args.beta2),
                    iter_count=iter_count,
                    weight_decay=args.weight_decay,
                    warmup=0,
                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1

        else:
            adjust_learning_rate(optimizer, epoch)

        if args.local_rank == 0:
            logger.file.write('\nEpoch: [%d | %d] LR: %f' %
                              (epoch + 1, args.epochs, state['lr']))

        if args.optimizer.lower() == 'srsgd' or args.optimizer.lower(
        ) == 'sradam' or args.optimizer.lower(
        ) == 'sradamw' or args.optimizer.lower() == 'srradam':
            train_loss, train_top1, train_top5, iter_count = train(
                train_loader, model, criterion, optimizer, epoch, use_cuda,
                logger)
        else:
            train_loss, train_top1, train_top5 = train(train_loader, model,
                                                       criterion, optimizer,
                                                       epoch, use_cuda, logger)

        test_loss, test_top1, test_top5 = test(val_loader, model, criterion,
                                               epoch, use_cuda, logger)

        # append logger file
        if args.local_rank == 0:
            logger.append([
                state['lr'], train_loss, test_loss, train_top1, test_top1,
                train_top5, test_top5
            ])
            writer.add_scalars('train_loss', {args.model_name: train_loss},
                               epoch)
            writer.add_scalars('test_loss', {args.model_name: test_loss},
                               epoch)
            writer.add_scalars('train_top1', {args.model_name: train_top1},
                               epoch)
            writer.add_scalars('test_top1', {args.model_name: test_top1},
                               epoch)
            writer.add_scalars('train_top5', {args.model_name: train_top5},
                               epoch)
            writer.add_scalars('test_top5', {args.model_name: test_top5},
                               epoch)

        # save model
        is_best = test_top1 > best_top1
        best_top1 = max(test_top1, best_top1)
        best_top5 = max(test_top5, best_top5)
        if args.local_rank == 0:
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'schedule_index': schedule_index,
                    'state_dict': model.state_dict(),
                    'top1': test_top1,
                    'top5': test_top5,
                    'best_top1': best_top1,
                    'best_top5': best_top5,
                    'optimizer': optimizer.state_dict(),
                },
                is_best,
                epoch,
                checkpoint=args.checkpoint)

            if epoch == args.schedule[-1]:
                logger.file.write('Best top1: %f at epoch %i' %
                                  (best_top1, epoch))
                logger.file.write('Best top5: %f at epoch %i' %
                                  (best_top5, epoch))
                print('Best top1: %f at epoch %i' % (best_top1, epoch))
                print('Best top5: %f at epoch %i' % (best_top5, epoch))
                with open("./all_results_imagenet.txt", "a") as f:
                    fcntl.flock(f, fcntl.LOCK_EX)
                    f.write("%s\n" % args.checkpoint)
                    f.write("best_top1 %f, best_top5 %f at epoch %i\n\n" %
                            (best_top1, best_top5, epoch))
                    fcntl.flock(f, fcntl.LOCK_UN)

    if args.local_rank == 0:
        logger.file.write('Best top1: %f' % best_top1)
        logger.file.write('Best top5: %f' % best_top5)
        logger.close()
        logger.plot()
        savefig(os.path.join(args.checkpoint, 'log.eps'))
        print('Best top1: %f' % best_top1)
        print('Best top5: %f' % best_top5)
        with open("./all_results_imagenet.txt", "a") as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            f.write("%s\n" % args.checkpoint)
            f.write("best_top1 %f, best_top5 %f\n\n" % (best_top1, best_top5))
            fcntl.flock(f, fcntl.LOCK_UN)