    return factor * base_lr  # return base_lr


criterion = F.cross_entropy
regularizer = None if args.curve is None else curves.l2_regularizer(args.wd)
optimizer = torch.optim.SGD(
    filter(lambda param: param.requires_grad, model.parameters()),
    lr=args.lr,
    momentum=args.momentum,
    weight_decay=args.wd if args.curve is None else 0.0)
if args.grda:
    optimizer = gRDA(filter(lambda param: param.requires_grad, model.parameters()),
                     lr=args.lr, c=args.c, mu=args.mu)

start_epoch = 1
if args.resume is not None:
    print('Resume training from %s' % args.resume)
    checkpoint = torch.load(args.resume)
    start_epoch = checkpoint['epoch'] + 1
    model.load_state_dict(checkpoint['model_state'])
    if start_epoch != 1:
        optimizer.load_state_dict(checkpoint['optimizer_state'])
    if start_epoch == 1:
        if args.grda:
            optimizer = gRDA(filter(lambda param: param.requires_grad, model.parameters()),
                             lr=args.lr, c=args.c, mu=args.mu)
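# A minimal sketch of how gRDA is used as a drop-in replacement for SGD in a
# training loop (assumption: gRDA follows the standard torch.optim.Optimizer
# interface, as the calls above suggest; `model`, `loader`, `criterion`, and
# `device` are placeholders, not names taken from this repo):
def _grda_step_sketch(model, loader, criterion, optimizer, device):
    model.train()
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()                     # clear accumulated gradients
        loss = criterion(model(inputs), targets)  # forward pass + loss
        loss.backward()                           # backward pass
        optimizer.step()                          # gRDA parameter update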
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.save is not None:
        args.save = args.save + "/"

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            print("args.workers", args.workers)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()
        else:
            model = torch.nn.DataParallel(model).cuda()

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    # optimizer = torch.optim.SGD(model.parameters(), args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    # model_setting = "SGD_lr" + args.lr + "_c" + args.c + "_mu" + args.mu
    optimizer = gRDA(model.parameters(), lr=args.lr, c=args.c, mu=args.mu)
    model_setting = "gRDA_lr" + str(args.lr) + "_c" + str(args.c) + "_mu" + str(args.mu)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])

            #### set new lr and l1_accumulation
            # set_l1_accumulation = False
            # for param_group in optimizer.param_groups:
            #     for p in param_group['params']:
            #         param_state = optimizer.state[p]
            #         if 'l1_accumulation' not in param_state or torch.equal(param_state['l1_accumulation'], torch.zeros(1)):
            #             set_l1_accumulation = True
            #             param_state['l1_accumulation'] = param_group['c'] * torch.pow(param_group['lr'], torch.tensor(param_group['mu'] + 0.5)) * torch.pow(param_state['iter_num'], param_group['mu'])
            # if set_l1_accumulation:
            #     print("l1_accumulation needs to be set")
            #     print("set l1_accumulation '{}'".format(param_state['l1_accumulation']))
            for param_group in optimizer.param_groups:
                print("=> loaded old_base_lr '{}'".format(param_group['lr']))
                param_group['lr'] = args.lr
                print("=> set new_base_lr '{}'".format(param_group['lr']))
            #### -----set new lr and l1_accumulation

            if args.start_epoch == 0:
                # load initialization from other optimizer models
                optimizer = gRDA(model.parameters(), lr=args.lr, c=args.c, mu=args.mu)
                model_setting = "gRDA_lr" + str(args.lr) + "_c" + str(args.c) + "_mu" + str(args.mu)
                print("*** loading initialization from '{}'".format(args.resume))

            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    else:
        print("=> saving initialized model '{}'".format(args.arch))
        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': 0,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, True, model_setting, "I", save_path=args.save)

    cudnn.benchmark = True

    # Data loading code
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)

    if args.evaluate:
        validate(val_loader, model, criterion, args)
        return

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        current_lr = adjust_learning_rate(optimizer, epoch, args)
        print("=> current learning rate:", current_lr)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # evaluate on validation set
        acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.arch,
                    'state_dict': model.state_dict(),
                    'best_acc1': best_acc1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, model_setting, epoch, save_path=args.save)
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=64, metavar='N',
                        help='input batch size for testing (default: 64)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--c', type=float, default=0.1, metavar='M',
                        help='gRDA c (default: 0.1)')
    parser.add_argument('--mu', type=float, default=0.5, metavar='M',
                        help='gRDA mu (default: 0.5)')
    # parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
    #                     help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)

    model = Net().to(device)
    optimizer = gRDA(model.parameters(), lr=args.lr, c=args.c, mu=args.mu)
    # optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    for epoch in range(1, args.epochs + 1):
        current_lr = adjust_learning_rate(optimizer, epoch, args)
        print("=> current learning rate:", current_lr)
        train(args, model, device, train_loader, optimizer, epoch)
        test(args, model, device, test_loader)
        print("non-zero weight number in each layer:")
        for name, param in model.named_parameters():
            if param.requires_grad:
                print(name, torch.norm(param.data, p=0), "/", param.shape)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
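# `adjust_learning_rate` is called in both training loops above; judging from the
# `return factor * base_lr` tail visible at the top of this listing, it rescales the
# base learning rate and returns the new value. A minimal sketch, assuming a step
# decay every 30 epochs (the actual schedule used in the repo may differ):
def _adjust_learning_rate_sketch(optimizer, epoch, args):
    base_lr = args.lr
    factor = 0.1 ** (epoch // 30)  # decay by 10x every 30 epochs (assumption)
    for param_group in optimizer.param_groups:
        param_group['lr'] = factor * base_lr
    return factor * base_lr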