def main():
    global best_top1, best_top5
    args.world_size = 1
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data loading code (DALI pipelines)
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    crop_size = 224
    val_size = 256

    pipe = HybridTrainPipe(batch_size=args.train_batch, num_threads=args.workers,
                           device_id=args.local_rank, data_dir=traindir,
                           crop=crop_size, dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    pipe = HybridValPipe(batch_size=args.test_batch, num_threads=args.workers,
                         device_id=args.local_rank, data_dir=valdir,
                         crop=crop_size, size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size("Reader") / args.world_size))

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](baseWidth=args.base_width,
                                           cardinality=args.cardinality)
    else:
        model = models.__dict__[args.arch]()

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = torch.nn.DataParallel(model.features)
        model.cuda()
    else:
        model = torch.nn.DataParallel(model).cuda()

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adamw':
        optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay, warmup=0)
    elif args.optimizer.lower() == 'radam':
        optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lsadam':
        optimizer = LSAdamW(model.parameters(), lr=args.lr * ((1. + 4. * args.sigma) ** 0.25),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay, sigma=args.sigma)
    elif args.optimizer.lower() == 'lsradam':
        sigma = 0.1
        optimizer = LSRAdam(model.parameters(), lr=args.lr * ((1. + 4. * args.sigma) ** 0.25),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay, sigma=args.sigma)
    elif args.optimizer.lower() == 'srsgd':
        iter_count = 1
        optimizer = SGD_Adaptive(model.parameters(), lr=args.lr, weight_decay=args.weight_decay,
                                 iter_count=iter_count, restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradam':
        iter_count = 1
        optimizer = SRNAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradamw':
        iter_count = 1
        optimizer = SRAdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            warmup=0, restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'srradam':
        # NOTE: need to double-check this
        iter_count = 1
        optimizer = SRRAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            warmup=0, restarting_iter=args.restart_schedule[0])

    schedule_index = 1

    # Resume
    title = 'ImageNet-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        # args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_top1 = checkpoint['best_top1']
        best_top5 = checkpoint['best_top5']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            iter_count = optimizer.param_groups[0]['iter_count']
            schedule_index = checkpoint['schedule_index']
        state['lr'] = optimizer.param_groups[0]['lr']
        if args.checkpoint == args.resume:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
        else:
            logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                              'Train Top1', 'Valid Top1', 'Train Top5', 'Valid Top5'])
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                          'Train Top1', 'Valid Top1', 'Train Top5', 'Valid Top5'])

    logger.file.write(' Total params: %.2fM' %
                      (sum(p.numel() for p in model.parameters()) / 1000000.0))

    if args.evaluate:
        logger.file.write('\nEvaluation only')
        test_loss, test_top1, test_top5 = test(val_loader, model, criterion, start_epoch,
                                               use_cuda, logger)
        logger.file.write(' Test Loss: %.8f, Test Top1: %.2f, Test Top5: %.2f' %
                          (test_loss, test_top1, test_top5))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        # For the restart-based optimizers, the optimizer is rebuilt at every scheduled
        # epoch with a decayed learning rate and the next restart interval.
        if args.optimizer.lower() == 'srsgd':
            if epoch in args.schedule:
                optimizer = SGD_Adaptive(model.parameters(),
                                         lr=args.lr * (args.gamma ** schedule_index),
                                         weight_decay=args.weight_decay, iter_count=iter_count,
                                         restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        elif args.optimizer.lower() == 'sradam':
            if epoch in args.schedule:
                optimizer = SRNAdam(model.parameters(),
                                    lr=args.lr * (args.gamma ** schedule_index),
                                    betas=(args.beta1, args.beta2), iter_count=iter_count,
                                    weight_decay=args.weight_decay,
                                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        elif args.optimizer.lower() == 'sradamw':
            if epoch in args.schedule:
                optimizer = SRAdamW(model.parameters(),
                                    lr=args.lr * (args.gamma ** schedule_index),
                                    betas=(args.beta1, args.beta2), iter_count=iter_count,
                                    weight_decay=args.weight_decay, warmup=0,
                                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        elif args.optimizer.lower() == 'srradam':
            if epoch in args.schedule:
                optimizer = SRRAdam(model.parameters(),
                                    lr=args.lr * (args.gamma ** schedule_index),
                                    betas=(args.beta1, args.beta2), iter_count=iter_count,
                                    weight_decay=args.weight_decay, warmup=0,
                                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        else:
            adjust_learning_rate(optimizer, epoch)

        logger.file.write('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            train_loss, train_top1, train_top5, iter_count = train(
                train_loader, model, criterion, optimizer, epoch, use_cuda, logger)
        else:
            train_loss, train_top1, train_top5 = train(train_loader, model, criterion,
                                                       optimizer, epoch, use_cuda, logger)
        test_loss, test_top1, test_top5 = test(val_loader, model, criterion, epoch,
                                               use_cuda, logger)

        # append logger file
        logger.append([state['lr'], train_loss, test_loss, train_top1, test_top1,
                       train_top5, test_top5])

        writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch)
        writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch)
        writer.add_scalars('train_top1', {args.model_name: train_top1}, epoch)
        writer.add_scalars('test_top1', {args.model_name: test_top1}, epoch)
        writer.add_scalars('train_top5', {args.model_name: train_top5}, epoch)
        writer.add_scalars('test_top5', {args.model_name: test_top5}, epoch)

        # save model
        is_best = test_top1 > best_top1
        best_top1 = max(test_top1, best_top1)
        best_top5 = max(test_top5, best_top5)
        save_checkpoint({
            'epoch': epoch + 1,
            'schedule_index': schedule_index,
            'state_dict': model.state_dict(),
            'top1': test_top1,
            'top5': test_top5,
            'best_top1': best_top1,
            'best_top5': best_top5,
            'optimizer': optimizer.state_dict(),
        }, is_best, epoch, checkpoint=args.checkpoint)

        # reset DALI iterators
        train_loader.reset()
        val_loader.reset()

    logger.file.write('Best top1: %f' % best_top1)
    logger.file.write('Best top5: %f' % best_top5)

    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best top1: %f' % best_top1)
    print('Best top5: %f' % best_top5)

    with open("./all_results_imagenet.txt", "a") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write("%s\n" % args.checkpoint)
        f.write("best_top1 %f, best_top5 %f\n\n" % (best_top1, best_top5))
        fcntl.flock(f, fcntl.LOCK_UN)
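

# --- Illustration only (not part of the training script above) ---
# The SRSGD branch in the epoch loop rebuilds the optimizer whenever `epoch` hits an
# entry of args.schedule: the learning rate is scaled by args.gamma**schedule_index and
# the restart interval is taken from args.restart_schedule[schedule_index]. A minimal
# sketch of the resulting settings, assuming hypothetical values lr0=0.1, gamma=0.1,
# schedule=(30, 60, 80), restarts=(40, 80, 160, 320):
def srsgd_schedule_sketch(lr0=0.1, gamma=0.1, schedule=(30, 60, 80),
                          restarts=(40, 80, 160, 320)):
    """Yield (epoch, lr, restarting_iter) used from each scheduled decay epoch onward."""
    for k, epoch in enumerate(schedule, start=1):  # schedule_index starts at 1
        yield epoch, lr0 * (gamma ** k), restarts[k]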
def main():
    global best_ppl

    print('loading dataset')
    dataset = pickle.load(open(args.dataset_folder + 'test.pk', 'rb'))
    w_map, test_data, range_idx = dataset['w_map'], dataset['test_data'], dataset['range']
    cut_off = args.cut_off + [len(w_map) + 1]

    train_loader = LargeDataset(args.dataset_folder, range_idx, args.batch_size,
                                args.sequence_length)
    test_loader = EvalDataset(test_data, args.batch_size)

    print('building model')
    rnn_map = {'Basic': BasicRNN, 'DDNet': DDRNN, 'DenseNet': DenseRNN,
               'LDNet': functools.partial(LDRNN, layer_drop=args.layer_drop)}
    rnn_layer = rnn_map[args.rnn_layer](args.layer_num, args.rnn_unit, args.word_dim,
                                        args.hid_dim, args.droprate)

    if args.label_dim > 0:
        soft_max = AdaptiveSoftmax(args.label_dim, cut_off)
    else:
        soft_max = AdaptiveSoftmax(rnn_layer.output_dim, cut_off)

    lm_model = LM(rnn_layer, soft_max, len(w_map), args.word_dim, args.droprate,
                  label_dim=args.label_dim, add_relu=args.add_relu)
    lm_model.rand_ini()
    # lm_model.cuda()

    # set up optimizers
    optim_map = {'Adam': optim.Adam, 'Adagrad': optim.Adagrad, 'Adadelta': optim.Adadelta,
                 'SGD': functools.partial(optim.SGD, momentum=0.9),
                 'LSRAdam': LSRAdam, 'LSAdam': LSAdam, 'AdamW': AdamW, 'RAdam': RAdam,
                 'SRAdamW': SRAdamW, 'SRRAdam': SRRAdam}

    if args.update.lower() in ('lsradam', 'lsadam'):
        optimizer = optim_map[args.update](lm_model.parameters(),
                                           lr=args.lr * ((1. + 4. * args.sigma) ** 0.25),
                                           betas=(args.beta1, args.beta2),
                                           weight_decay=args.weight_decay, sigma=args.sigma)
    elif args.update.lower() == 'radam':
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr,
                                           betas=(args.beta1, args.beta2),
                                           weight_decay=args.weight_decay)
    elif args.update.lower() == 'adamw':
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr,
                                           betas=(args.beta1, args.beta2),
                                           weight_decay=args.weight_decay, warmup=args.warmup)
    elif args.update.lower() == 'sradamw':
        iter_count = 1
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr,
                                           betas=(args.beta1, args.beta2),
                                           iter_count=iter_count,
                                           weight_decay=args.weight_decay,
                                           warmup=args.warmup,
                                           restarting_iter=args.restart_schedule[0])
    elif args.update.lower() == 'srradam':
        # NOTE: need to double-check this
        iter_count = 1
        optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr,
                                           betas=(args.beta1, args.beta2),
                                           iter_count=iter_count,
                                           weight_decay=args.weight_decay,
                                           warmup=args.warmup,
                                           restarting_iter=args.restart_schedule[0])
    else:
        if args.lr > 0:
            optimizer = optim_map[args.update](lm_model.parameters(), lr=args.lr)
        else:
            optimizer = optim_map[args.update](lm_model.parameters())

    # Resume
    title = 'onebillionword-' + args.rnn_layer
    logger = Logger(os.path.join(args.checkpath, 'log.txt'), title=title)
    logger.set_names(['Learning Rate', 'Train Loss', 'Train PPL', 'Valid PPL'])

    if args.load_checkpoint:
        if os.path.isfile(args.load_checkpoint):
            print("loading checkpoint: '{}'".format(args.load_checkpoint))
            checkpoint_file = torch.load(args.load_checkpoint,
                                         map_location=lambda storage, loc: storage)
            lm_model.load_state_dict(checkpoint_file['lm_model'], False)
            optimizer.load_state_dict(checkpoint_file['opt'])
        else:
            print("no checkpoint found at: '{}'".format(args.load_checkpoint))

    test_lm = nn.NLLLoss()
    test_lm.cuda()
    lm_model.cuda()

    batch_index = 0
    epoch_loss = 0
    full_epoch_loss = 0
    best_train_ppl = float('inf')
    cur_lr = args.lr
    schedule_index = 1

    try:
        for indexs in range(args.epoch):
            print('#' * 89)
            print('Start: {}'.format(indexs))

            if args.update.lower() == 'sradamw':
                if indexs in args.schedule:
                    optimizer = SRAdamW(lm_model.parameters(),
                                        lr=args.lr * (args.gamma ** schedule_index),
                                        betas=(args.beta1, args.beta2),
                                        iter_count=iter_count,
                                        weight_decay=args.weight_decay, warmup=0,
                                        restarting_iter=args.restart_schedule[schedule_index])
                    schedule_index += 1
            elif args.update.lower() == 'srradam':
                if indexs in args.schedule:
                    optimizer = SRRAdam(lm_model.parameters(),
                                        lr=args.lr * (args.gamma ** schedule_index),
                                        betas=(args.beta1, args.beta2),
                                        iter_count=iter_count,
                                        weight_decay=args.weight_decay, warmup=0,
                                        restarting_iter=args.restart_schedule[schedule_index])
                    schedule_index += 1
            else:
                adjust_learning_rate(optimizer, indexs)

            logger.file.write('\nEpoch: [%d | %d] LR: %f' % (indexs + 1, args.epoch, state['lr']))

            iterator = train_loader.get_tqdm()
            full_epoch_loss = 0
            lm_model.train()

            for word_t, label_t in iterator:
                if 1 == train_loader.cur_idx:
                    lm_model.init_hidden()
                label_t = label_t.view(-1)

                lm_model.zero_grad()
                loss = lm_model(word_t, label_t)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(lm_model.parameters(), args.clip)
                optimizer.step()

                if args.update.lower() in ('sradamw', 'srradam'):
                    iter_count, iter_total = optimizer.update_iter()

                batch_index += 1
                if 0 == batch_index % args.interval:
                    s_loss = utils.to_scalar(loss)
                    writer.add_scalars('loss_tracking/train_loss',
                                       {args.model_name: s_loss}, batch_index)

                epoch_loss += utils.to_scalar(loss)
                full_epoch_loss += utils.to_scalar(loss)
                if 0 == batch_index % args.check_interval:
                    epoch_ppl = math.exp(epoch_loss / args.check_interval)
                    writer.add_scalars('loss_tracking/train_ppl',
                                       {args.model_name: epoch_ppl}, batch_index)
                    print('epoch_ppl: {} lr: {} @ batch_index: {}'.format(
                        epoch_ppl, cur_lr, batch_index))
                    logger.file.write('epoch_ppl: {} lr: {} @ batch_index: {}'.format(
                        epoch_ppl, cur_lr, batch_index))
                    epoch_loss = 0

            test_ppl = evaluate(test_loader, lm_model, test_lm, -1)
            is_best = test_ppl < best_ppl
            best_ppl = min(test_ppl, best_ppl)

            writer.add_scalars('loss_tracking/test_ppl', {args.model_name: test_ppl}, indexs)
            print('test_ppl: {} @ index: {}'.format(test_ppl, indexs))
            logger.file.write('test_ppl: {} @ index: {}'.format(test_ppl, indexs))

            save_checkpoint({
                'epoch': indexs + 1,
                'schedule_index': schedule_index,
                'lm_model': lm_model.state_dict(),
                'ppl': test_ppl,
                'best_ppl': best_ppl,
                'opt': optimizer.state_dict(),
            }, is_best, indexs, checkpoint=args.checkpath)

    except KeyboardInterrupt:
        print('Exiting from training early')
        logger.file.write('Exiting from training early')

        test_ppl = evaluate(test_loader, lm_model, test_lm, -1)
        writer.add_scalars('loss_tracking/test_ppl', {args.model_name: test_ppl}, args.epoch)

        is_best = False
        save_checkpoint({
            'epoch': indexs + 1,
            'schedule_index': schedule_index,
            'lm_model': lm_model.state_dict(),
            'ppl': test_ppl,
            'best_ppl': best_ppl,
            'opt': optimizer.state_dict(),
        }, is_best, indexs, checkpoint=args.checkpath)

    print('Best PPL: %f' % best_ppl)
    logger.file.write('Best PPL: %f' % best_ppl)
    logger.close()

    with open("./all_results.txt", "a") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write("%s\n" % args.checkpath)
        f.write("best_ppl %f\n\n" % best_ppl)
        fcntl.flock(f, fcntl.LOCK_UN)
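

# --- Illustration only (not part of the training script above) ---
# `adjust_learning_rate` is defined elsewhere in the repository; the call sites above only
# assume that it applies the step-decay schedule and keeps the module-level state['lr'] in
# sync, since the epoch logging reads state['lr']. A minimal sketch under that assumption
# (the exact decay rule used by the repo is not shown in this section):
def adjust_learning_rate_sketch(optimizer, epoch):
    if epoch in args.schedule:          # decay only at the scheduled epochs
        state['lr'] *= args.gamma       # multiplicative step decay
        for param_group in optimizer.param_groups:
            param_group['lr'] = state['lr']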
def main():
    global best_acc
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])

    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root='./data', train=True, download=True, transform=transform_train)
    trainloader = data.DataLoader(trainset, batch_size=args.train_batch, shuffle=True,
                                  num_workers=args.workers)
    testset = dataloader(root='./data', train=False, download=False, transform=transform_test)
    testloader = data.DataLoader(testset, batch_size=args.test_batch, shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format(args.arch))
    if args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](cardinality=args.cardinality,
                                           num_classes=num_classes, depth=args.depth,
                                           widen_factor=args.widen_factor, dropRate=args.drop)
    elif args.arch.startswith('densenet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           growthRate=args.growthRate,
                                           compressionRate=args.compressionRate,
                                           dropRate=args.drop)
    elif args.arch.startswith('wrn'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           widen_factor=args.widen_factor, dropRate=args.drop)
    elif args.arch.startswith('resnet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           block_name=args.block_name)
    elif args.arch.startswith('preresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           block_name=args.block_name)
    elif args.arch.startswith('horesnet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           eta=args.eta, block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('hopreresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           eta=args.eta, block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('nagpreresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           eta=args.eta, block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('mompreresnet'):
        model = models.__dict__[args.arch](num_classes=num_classes, depth=args.depth,
                                           eta=args.eta, block_name=args.block_name,
                                           feature_vec=args.feature_vec)
    elif args.arch.startswith('v2_preresnet'):
        if args.depth == 18:
            block_name = 'basicblock'
            num_blocks = [2, 2, 2, 2]
        elif args.depth == 34:
            block_name = 'basicblock'
            num_blocks = [3, 4, 6, 3]
        elif args.depth == 50:
            block_name = 'bottleneck'
            num_blocks = [3, 4, 6, 3]
        elif args.depth == 101:
            block_name = 'bottleneck'
            num_blocks = [3, 4, 23, 3]
        elif args.depth == 152:
            block_name = 'bottleneck'
            num_blocks = [3, 8, 36, 3]
        model = models.__dict__[args.arch](block_name=block_name, num_blocks=num_blocks,
                                           num_classes=num_classes)
    else:
        print('Model is specified wrongly - Use standard model')
        model = models.__dict__[args.arch](num_classes=num_classes)

    model = torch.nn.DataParallel(model).cuda()
    cudnn.benchmark = True
    print(' Total params: %.2fM' % (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()

    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'radam':
        optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adamw':
        optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay, warmup=args.warmup)
    elif args.optimizer.lower() == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                               weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'srsgd':
        iter_count = 1
        optimizer = SGD_Adaptive(model.parameters(), lr=args.lr, weight_decay=args.weight_decay,
                                 iter_count=iter_count, restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradam':
        iter_count = 1
        optimizer = SRNAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradamw':
        iter_count = 1
        optimizer = SRAdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            warmup=args.warmup, restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'srradam':
        # NOTE: need to double-check this
        iter_count = 1
        optimizer = SRRAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            warmup=args.warmup, restarting_iter=args.restart_schedule[0])

    schedule_index = 1

    # Resume
    title = '%s-' % args.dataset + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        # args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_acc = checkpoint['best_acc']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            iter_count = optimizer.param_groups[0]['iter_count']
            # schedule_index = checkpoint['schedule_index']
            schedule_index = 3
        state['lr'] = optimizer.param_groups[0]['lr']
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title)
        logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                          'Train Acc.', 'Valid Acc.'])

    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(testloader, model, criterion, start_epoch, use_cuda)
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        if args.optimizer.lower() == 'srsgd':
            if epoch == 161:
                start_decay_restarting_iter = args.restart_schedule[schedule_index] - 1
                current_lr = args.lr * (args.gamma ** schedule_index)

            if epoch in args.schedule:
                current_lr = args.lr * (args.gamma ** schedule_index)
                current_restarting_iter = args.restart_schedule[schedule_index]
                optimizer = SGD_Adaptive(model.parameters(), lr=current_lr,
                                         weight_decay=args.weight_decay,
                                         iter_count=iter_count,
                                         restarting_iter=current_restarting_iter)
                schedule_index += 1

            if epoch >= 161:
                # linearly shrink the restart interval toward 1 over the remaining epochs
                current_restarting_iter = start_decay_restarting_iter * (
                    args.epochs - epoch - 1) / (args.epochs - 162) + 1
                optimizer = SGD_Adaptive(model.parameters(), lr=current_lr,
                                         weight_decay=args.weight_decay,
                                         iter_count=iter_count,
                                         restarting_iter=current_restarting_iter)
        else:
            adjust_learning_rate(optimizer, epoch)

        logger.file.write('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))

        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            train_loss, train_acc, iter_count = train(trainloader, model, criterion,
                                                      optimizer, epoch, use_cuda, logger)
        else:
            train_loss, train_acc = train(trainloader, model, criterion, optimizer, epoch,
                                          use_cuda, logger)
        test_loss, test_acc = test(testloader, model, criterion, epoch, use_cuda, logger)

        # append logger file
        logger.append([state['lr'], train_loss, test_loss, train_acc, test_acc])

        writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch)
        writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch)
        writer.add_scalars('train_acc', {args.model_name: train_acc}, epoch)
        writer.add_scalars('test_acc', {args.model_name: test_acc}, epoch)

        # save model
        is_best = test_acc > best_acc
        best_acc = max(test_acc, best_acc)
        save_checkpoint({
            'epoch': epoch + 1,
            'schedule_index': schedule_index,
            'state_dict': model.state_dict(),
            'acc': test_acc,
            'best_acc': best_acc,
            'optimizer': optimizer.state_dict(),
        }, is_best, epoch, checkpoint=args.checkpoint)

    logger.file.write('Best acc:%f' % best_acc)
    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))

    print('Best acc:')
    print(best_acc)

    with open("./all_results.txt", "a") as f:
        fcntl.flock(f, fcntl.LOCK_EX)
        f.write("%s\n" % args.checkpoint)
        f.write("best_acc %f\n\n" % best_acc)
        fcntl.flock(f, fcntl.LOCK_UN)
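

# --- Illustration only (not part of the training script above) ---
# `save_checkpoint` is provided by the repo's utilities; the call sites above only rely on
# it writing the latest state and keeping a copy of the best model so far. A minimal sketch
# with the same call signature (the file names here are assumptions, not the repo's):
import os
import shutil
import torch

def save_checkpoint_sketch(state, is_best, epoch, checkpoint='checkpoint',
                           filename='checkpoint.pth.tar'):
    filepath = os.path.join(checkpoint, filename)
    torch.save(state, filepath)  # always overwrite the latest checkpoint
    if is_best:
        # keep a separate copy of the best-performing model
        shutil.copyfile(filepath, os.path.join(checkpoint, 'model_best.pth.tar'))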
def main():
    global best_top1, best_top5
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    if not os.path.isdir(args.checkpoint):
        mkdir_p(args.checkpoint)

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    train_data = imagenet_lmdb_dataset(traindir, transform=train_transform)
    valid_data = imagenet_lmdb_dataset(validdir, transform=val_transform)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.train_batch,
                                               shuffle=(train_sampler is None),
                                               pin_memory=True, num_workers=8,
                                               sampler=train_sampler)
    val_loader = torch.utils.data.DataLoader(valid_data, batch_size=args.test_batch,
                                             shuffle=False, pin_memory=True, num_workers=8)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    elif args.arch.startswith('resnext'):
        model = models.__dict__[args.arch](baseWidth=args.base_width,
                                           cardinality=args.cardinality)
    elif args.arch == 'densenet264':
        model = DenseNet(growth_rate=32, block_config=(6, 12, 64, 48), num_init_features=64,
                         bn_size=4, drop_rate=0, num_classes=1000, memory_efficient=False)
    elif args.arch == 'resnet200':
        model = ResNet(block=Bottleneck, layers=[3, 24, 36, 3], num_classes=1000,
                       zero_init_residual=False, groups=1, width_per_group=64,
                       replace_stride_with_dilation=None, norm_layer=None)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
        model.features = DDP(model.features)
        model.cuda()
    else:
        model = model.cuda()
        model = DDP(model, delay_allreduce=True)

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.optimizer.lower() == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'adamw':
        optimizer = AdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay, warmup=0)
    elif args.optimizer.lower() == 'radam':
        optimizer = RAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                          weight_decay=args.weight_decay)
    elif args.optimizer.lower() == 'lsadam':
        optimizer = LSAdamW(model.parameters(), lr=args.lr * ((1. + 4. * args.sigma) ** 0.25),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay, sigma=args.sigma)
    elif args.optimizer.lower() == 'lsradam':
        sigma = 0.1
        optimizer = LSRAdam(model.parameters(), lr=args.lr * ((1. + 4. * args.sigma) ** 0.25),
                            betas=(args.beta1, args.beta2),
                            weight_decay=args.weight_decay, sigma=args.sigma)
    elif args.optimizer.lower() == 'srsgd':
        iter_count = 1
        optimizer = SGD_Adaptive(model.parameters(), lr=args.lr, weight_decay=args.weight_decay,
                                 iter_count=iter_count, restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradam':
        iter_count = 1
        optimizer = SRNAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'sradamw':
        iter_count = 1
        optimizer = SRAdamW(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            warmup=0, restarting_iter=args.restart_schedule[0])
    elif args.optimizer.lower() == 'srradam':
        # NOTE: need to double-check this
        iter_count = 1
        optimizer = SRRAdam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2),
                            iter_count=iter_count, weight_decay=args.weight_decay,
                            warmup=0, restarting_iter=args.restart_schedule[0])

    schedule_index = 1

    # Resume
    title = 'ImageNet-' + args.arch
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(args.resume), 'Error: no checkpoint directory found!'
        # args.checkpoint = os.path.dirname(args.resume)
        # checkpoint = torch.load(args.resume, map_location=lambda storage, loc: storage.cuda(args.local_rank))
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        best_top1 = checkpoint['best_top1']
        best_top5 = checkpoint['best_top5']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            iter_count = optimizer.param_groups[0]['iter_count']
            schedule_index = checkpoint['schedule_index']
        state['lr'] = optimizer.param_groups[0]['lr']
        if args.checkpoint == args.resume:
            logger = LoggerDistributed(os.path.join(args.checkpoint, 'log.txt'),
                                       rank=args.local_rank, title=title, resume=True)
        else:
            logger = LoggerDistributed(os.path.join(args.checkpoint, 'log.txt'),
                                       rank=args.local_rank, title=title)
            if args.local_rank == 0:
                logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                                  'Train Top1', 'Valid Top1', 'Train Top5', 'Valid Top5'])
    else:
        logger = LoggerDistributed(os.path.join(args.checkpoint, 'log.txt'),
                                   rank=args.local_rank, title=title)
        if args.local_rank == 0:
            logger.set_names(['Learning Rate', 'Train Loss', 'Valid Loss',
                              'Train Top1', 'Valid Top1', 'Train Top5', 'Valid Top5'])

    if args.local_rank == 0:
        logger.file.write(' Total params: %.2fM' %
                          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    if args.evaluate:
        if args.local_rank == 0:
            logger.file.write('\nEvaluation only')
        test_loss, test_top1, test_top5 = test(val_loader, model, criterion, start_epoch,
                                               use_cuda, logger)
        if args.local_rank == 0:
            logger.file.write(' Test Loss: %.8f, Test Top1: %.2f, Test Top5: %.2f' %
                              (test_loss, test_top1, test_top5))
        return

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        # Shuffle the sampler.
        train_loader.sampler.set_epoch(epoch + args.manualSeed)

        if args.optimizer.lower() == 'srsgd':
            if epoch in args.schedule:
                optimizer = SGD_Adaptive(model.parameters(),
                                         lr=args.lr * (args.gamma ** schedule_index),
                                         weight_decay=args.weight_decay, iter_count=iter_count,
                                         restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        elif args.optimizer.lower() == 'sradam':
            if epoch in args.schedule:
                optimizer = SRNAdam(model.parameters(),
                                    lr=args.lr * (args.gamma ** schedule_index),
                                    betas=(args.beta1, args.beta2), iter_count=iter_count,
                                    weight_decay=args.weight_decay,
                                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        elif args.optimizer.lower() == 'sradamw':
            if epoch in args.schedule:
                optimizer = SRAdamW(model.parameters(),
                                    lr=args.lr * (args.gamma ** schedule_index),
                                    betas=(args.beta1, args.beta2), iter_count=iter_count,
                                    weight_decay=args.weight_decay, warmup=0,
                                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        elif args.optimizer.lower() == 'srradam':
            if epoch in args.schedule:
                optimizer = SRRAdam(model.parameters(),
                                    lr=args.lr * (args.gamma ** schedule_index),
                                    betas=(args.beta1, args.beta2), iter_count=iter_count,
                                    weight_decay=args.weight_decay, warmup=0,
                                    restarting_iter=args.restart_schedule[schedule_index])
                schedule_index += 1
        else:
            adjust_learning_rate(optimizer, epoch)

        if args.local_rank == 0:
            logger.file.write('\nEpoch: [%d | %d] LR: %f' %
                              (epoch + 1, args.epochs, state['lr']))

        if args.optimizer.lower() in ('srsgd', 'sradam', 'sradamw', 'srradam'):
            train_loss, train_top1, train_top5, iter_count = train(
                train_loader, model, criterion, optimizer, epoch, use_cuda, logger)
        else:
            train_loss, train_top1, train_top5 = train(train_loader, model, criterion,
                                                       optimizer, epoch, use_cuda, logger)
        test_loss, test_top1, test_top5 = test(val_loader, model, criterion, epoch,
                                               use_cuda, logger)

        # append logger file
        if args.local_rank == 0:
            logger.append([state['lr'], train_loss, test_loss, train_top1, test_top1,
                           train_top5, test_top5])
            writer.add_scalars('train_loss', {args.model_name: train_loss}, epoch)
            writer.add_scalars('test_loss', {args.model_name: test_loss}, epoch)
            writer.add_scalars('train_top1', {args.model_name: train_top1}, epoch)
            writer.add_scalars('test_top1', {args.model_name: test_top1}, epoch)
            writer.add_scalars('train_top5', {args.model_name: train_top5}, epoch)
            writer.add_scalars('test_top5', {args.model_name: test_top5}, epoch)

        # save model
        is_best = test_top1 > best_top1
        best_top1 = max(test_top1, best_top1)
        best_top5 = max(test_top5, best_top5)
        if args.local_rank == 0:
            save_checkpoint({
                'epoch': epoch + 1,
                'schedule_index': schedule_index,
                'state_dict': model.state_dict(),
                'top1': test_top1,
                'top5': test_top5,
                'best_top1': best_top1,
                'best_top5': best_top5,
                'optimizer': optimizer.state_dict(),
            }, is_best, epoch, checkpoint=args.checkpoint)

            if epoch == args.schedule[-1]:
                logger.file.write('Best top1: %f at epoch %i' % (best_top1, epoch))
                logger.file.write('Best top5: %f at epoch %i' % (best_top5, epoch))
                print('Best top1: %f at epoch %i' % (best_top1, epoch))
                print('Best top5: %f at epoch %i' % (best_top5, epoch))
                with open("./all_results_imagenet.txt", "a") as f:
                    fcntl.flock(f, fcntl.LOCK_EX)
                    f.write("%s\n" % args.checkpoint)
                    f.write("best_top1 %f, best_top5 %f at epoch %i\n\n" %
                            (best_top1, best_top5, epoch))
                    fcntl.flock(f, fcntl.LOCK_UN)

    if args.local_rank == 0:
        logger.file.write('Best top1: %f' % best_top1)
        logger.file.write('Best top5: %f' % best_top5)
        logger.close()
        logger.plot()
        savefig(os.path.join(args.checkpoint, 'log.eps'))

        print('Best top1: %f' % best_top1)
        print('Best top5: %f' % best_top5)

        with open("./all_results_imagenet.txt", "a") as f:
            fcntl.flock(f, fcntl.LOCK_EX)
            f.write("%s\n" % args.checkpoint)
            f.write("best_top1 %f, best_top5 %f\n\n" % (best_top1, best_top5))
            fcntl.flock(f, fcntl.LOCK_UN)
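

# --- Illustration only (not part of the training scripts above) ---
# The last main() wraps the model in DDP (apex-style DistributedDataParallel with
# delay_allreduce=True) and indexes the GPU by args.local_rank, so it is intended to run
# with one process per GPU. A minimal sketch of the process-group setup such a launcher
# would perform before calling main(); the backend and init_method choices are assumptions:
import torch
import torch.distributed as dist

def init_distributed_sketch(local_rank):
    torch.cuda.set_device(local_rank)  # bind this process to its own GPU
    dist.init_process_group(backend='nccl', init_method='env://')
    return dist.get_rank(), dist.get_world_size()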