def infer(valid_queue, model, alpha, criterion):
    """Run model in eval only mode."""
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    # model.eval()

    with torch.no_grad():
        for step, (data, target) in enumerate(valid_queue):
            n = data.size(0)
            data = data.cuda()
            target = target.cuda()
            weights = alpha(data.size(0))
            logits = model(data, weights)
            loss = criterion(logits, target)

            # Calculate the accuracy.
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
            objs.update(loss.data, n)
            top1.update(prec1.data, n)
            top5.update(prec5.data, n)

            if step % args.report_freq == 0 or step == len(valid_queue) - 1:
                objs_avg = utils.reduce_tensor(objs.avg, args.world_size)
                top1_avg = utils.reduce_tensor(top1.avg, args.world_size)
                top5_avg = utils.reduce_tensor(top5.avg, args.world_size)
                logging.info('valid %03d %e %f %f', step, objs_avg, top1_avg,
                             top5_avg)

    return top1_avg, objs_avg
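# utils.reduce_tensor is not shown in this file. From its use above (averaging
# metrics across args.world_size workers), it is assumed to be the standard
# all-reduce-then-divide helper. A minimal sketch under that assumption; the
# repo's actual implementation may differ:
def reduce_tensor(tensor, world_size):
    """Sketch: sum a tensor across all workers and average by world_size."""
    import torch.distributed as dist
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt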
def train(train_queue, model, criterion, optimizer, epoch, init_lr,
          warmup_epochs, global_step):
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    model.train()

    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Change lr: linear warmup from 0 to init_lr over the warmup epochs.
        if epoch < warmup_epochs:
            len_epoch = len(train_queue)
            scale = float(1 + step + epoch * len_epoch) / \
                (warmup_epochs * len_epoch)
            lr = init_lr * scale
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # Forward.
        optimizer.zero_grad()
        logits, logits_aux = model(data)
        loss = criterion(logits, target)
        if args.auxiliary:
            loss_aux = criterion(logits_aux, target)
            loss += args.auxiliary_weight * loss_aux

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        ############# APEX #############
        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)
        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)
        ################################

        if step % args.report_freq == 0:
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
            writer.add_scalar('train/loss', objs.avg, global_step)
            writer.add_scalar('train/acc_top1', top1.avg, global_step)
            writer.add_scalar('train/acc_top5', top5.avg, global_step)
            writer.add_scalar('train/lr',
                              optimizer.state_dict()['param_groups'][0]['lr'],
                              global_step)
        global_step += 1

    return top1.avg, objs.avg, global_step
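# to_python_float comes from NVIDIA Apex (apex.fp16_utils); it converts a
# scalar tensor to a Python float. A guarded stand-in for environments without
# Apex (a sketch, assuming scalar or single-element inputs as used above):
try:
    from apex.fp16_utils import to_python_float
except ImportError:
    def to_python_float(t):
        """Sketch: return the Python float held by a scalar tensor."""
        if hasattr(t, 'item'):
            return t.item()
        return t[0]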
def train_init(train_queue, model, alpha, criterion, optimizer, weight_params):
    """Update network weights on train set and architecture on val set."""
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    for step, (data, target) in enumerate(train_queue):
        model.train()
        n = data.size(0)

        # Update network weights using the train set.
        data = data.cuda()
        target = target.cuda()
        weights = alpha(data.size(0))
        weights_no_grad = alpha.module.clone_weights(weights)

        optimizer.zero_grad()
        logits = model(data, weights_no_grad)
        loss = criterion(logits, target)
        # Add a zero-valued term over all parameters so that every parameter
        # participates in backward; this keeps the DDP gradient all-reduce in
        # sync even when the sampled architecture leaves some ops unused.
        dummy = sum([torch.sum(param) for param in model.parameters()])
        loss += dummy * 0.
        loss.backward()
        nn.utils.clip_grad_norm_(weight_params, args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.data, n)
        top1.update(prec1.data, n)
        top5.update(prec5.data, n)

        if (step + 1) % args.report_freq == 0 or step == len(train_queue) - 1:
            objs_avg = utils.reduce_tensor(objs.avg, args.world_size)
            top1_avg = utils.reduce_tensor(top1.avg, args.world_size)
            top5_avg = utils.reduce_tensor(top5.avg, args.world_size)
            logging.info('train_init %03d %e %f %f', step, objs_avg, top1_avg,
                         top5_avg)

    return top1_avg, objs_avg
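# utils.accuracy is assumed to be the standard top-k precision helper used in
# DARTS-style repos, returning percentages for each requested k. A sketch
# under that assumption:
def accuracy(output, target, topk=(1,)):
    """Sketch: compute top-k precision (in percent) for each k in topk."""
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res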
def train(train_queue, model, criterion, optimizer, global_step):
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    model.train()

    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Forward.
        optimizer.zero_grad()
        logits = model(data)
        loss = criterion(logits, target)

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)
        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)

        if (step + 1) % args.report_freq == 0:
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
        global_step += 1

    return top1.avg, top5.avg, objs.avg, global_step
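# The meters above track running weighted averages via update(val, n). A
# minimal sketch of utils.AverageMeter consistent with that usage (the repo's
# own class may carry extra state):
class AverageMeter(object):
    """Sketch: running weighted average over update(val, n) calls."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0.
        self.sum = 0.
        self.cnt = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt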
def main():
    # Scale learning rate based on global batch size.
    if not args.no_scale_lr:
        scale = float(args.batch_size * args.world_size) / 128.0
        args.learning_rate = scale * args.learning_rate

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d', args.gpu)
    logging.info('args = %s', args)

    # Get data loaders.
    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])
    if 'lmdb' in args.data:
        train_data = imagenet_lmdb_dataset(traindir, transform=train_transform)
        valid_data = imagenet_lmdb_dataset(validdir, transform=val_transform)
    else:
        train_data = dset.ImageFolder(traindir, transform=train_transform)
        valid_data = dset.ImageFolder(validdir, transform=val_transform)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=args.batch_size, shuffle=False,
        pin_memory=True, num_workers=8, sampler=train_sampler)
    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=args.batch_size, shuffle=False,
        pin_memory=True, num_workers=8)

    # Set up the network.
    if os.path.isfile(args.genotype):
        logging.info('Loading genotype from: %s', args.genotype)
        genotype = torch.load(args.genotype, map_location='cpu')
    else:
        logging.info('Loading genotype: %s', args.genotype)
        genotype = eval('genotypes.%s' % args.genotype)
    if not isinstance(genotype, list):
        genotype = [genotype]

    # If num channels not provided, find the max under 600M MAdds.
    if args.init_channels < 0:
        if args.local_rank == 0:
            flops, num_params, init_channels = find_max_channels(
                genotype, args.layers, args.max_M_flops * 1e6)
            logging.info('Num flops = %.2fM', flops / 1e6)
            logging.info('Num params = %.2fM', num_params / 1e6)
        else:
            init_channels = 0
        # All reduce with world_size 1 is sum.
        init_channels = torch.Tensor([init_channels]).cuda()
        init_channels = utils.reduce_tensor(init_channels, 1)
        args.init_channels = int(init_channels.item())
    logging.info('Num channels = %d', args.init_channels)

    # Create model and loss.
    model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary,
                    genotype)
    model = model.cuda()
    model = DDP(model, delay_allreduce=True)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()
    logging.info('param size = %fM', utils.count_parameters_in_M(model))

    # Set up network weights optimizer.
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.min_learning_rate)

    # Train.
    global_step = 0
    best_acc_top1 = 0
    for epoch in range(args.epochs):
        # Shuffle the sampler, update lrs.
        train_queue.sampler.set_epoch(epoch + args.seed)

        # Change lr.
        if epoch >= args.warmup_epochs:
            scheduler.step()
        model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        # Training.
        train_acc, train_obj, global_step = train(
            train_queue, model, criterion_smooth, optimizer, epoch,
            args.learning_rate, args.warmup_epochs, global_step)
        logging.info('train_acc %f', train_acc)
        writer.add_scalar('train/acc', train_acc, global_step)

        # Validation.
        valid_acc_top1, valid_acc_top5, valid_obj = infer(
            valid_queue, model, criterion)
        logging.info('valid_acc_top1 %f', valid_acc_top1)
        logging.info('valid_acc_top5 %f', valid_acc_top5)
        writer.add_scalar('val/acc_top1', valid_acc_top1, global_step)
        writer.add_scalar('val/acc_top5', valid_acc_top5, global_step)
        writer.add_scalar('val/loss', valid_obj, global_step)

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True
        if args.local_rank == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.save)
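# CrossEntropyLabelSmooth mixes the one-hot target with a uniform distribution
# before taking the cross entropy. The sketch below follows the implementation
# commonly used in DARTS-style ImageNet training scripts; the repo's class is
# assumed to match it:
class CrossEntropyLabelSmooth(nn.Module):
    """Sketch: cross entropy with label smoothing of strength epsilon."""

    def __init__(self, num_classes, epsilon):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        log_probs = self.logsoftmax(inputs)
        # One-hot targets, then blend with the uniform distribution.
        targets = torch.zeros_like(log_probs).scatter_(
            1, targets.unsqueeze(1), 1)
        targets = (1 - self.epsilon) * targets + \
            self.epsilon / self.num_classes
        loss = (-targets * log_probs).mean(0).sum()
        return loss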
def train(train_queue, valid_queue, model, alpha, nas, criterion, optimizer,
          global_step, weight_params, seed):
    """Update network weights on train set and architecture on val set."""
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    nas.reset_counter()

    # Init meta queue iterator.
    valid_queue.sampler.set_epoch(global_step + seed)
    valid_queue_iterator = iter(valid_queue)

    for step, (data, target) in enumerate(train_queue):
        model.train()
        n = data.size(0)

        # Use the same minibatch for updating weights as well as arch params.
        data = data.cuda()
        target = target.cuda()

        # Architecture update.
        weights = alpha(data.size(0))
        weights_no_grad = alpha.module.clone_weights(weights)

        # Get a random minibatch from the valid queue with replacement.
        # Use this to update the architecture.
        if args.val_arch_update or args.gen_error_alpha:
            try:
                input_valid, target_valid = next(valid_queue_iterator)
            except StopIteration:
                # The validation set is exhausted; reshuffle and restart.
                valid_queue.sampler.set_epoch(global_step + seed)
                valid_queue_iterator = iter(valid_queue)
                input_valid, target_valid = next(valid_queue_iterator)
            input_valid = input_valid.cuda()
            target_valid = target_valid.cuda()
            if args.gen_error_alpha:
                nas.step(data, target, global_step, weights, input_valid,
                         target_valid, optimizer)
            else:
                nas.step(input_valid, target_valid, global_step, weights)
        else:
            nas.step(data, target, global_step, weights)

        # Network update.
        optimizer.zero_grad()
        logits = model(data, weights_no_grad)
        loss = criterion(logits, target)
        # Add a zero-valued term over all parameters so that every parameter
        # participates in backward; this keeps the DDP gradient all-reduce in
        # sync even when the sampled architecture leaves some ops unused.
        dummy = sum([torch.sum(param) for param in model.parameters()])
        loss += dummy * 0.
        loss.backward()
        nn.utils.clip_grad_norm_(weight_params, args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.data, n)
        top1.update(prec1.data, n)
        top5.update(prec5.data, n)

        if (step + 1) % args.report_freq == 0 or step == len(train_queue) - 1:
            objs_avg = utils.reduce_tensor(objs.avg, args.world_size)
            top1_avg = utils.reduce_tensor(top1.avg, args.world_size)
            top5_avg = utils.reduce_tensor(top5.avg, args.world_size)
            logging.info('train %03d %e %f %f', step, objs_avg, top1_avg,
                         top5_avg)
            writer.add_scalar('train/loss', objs_avg, global_step)
            writer.add_scalar('train/acc1', top1_avg, global_step)
            writer.add_scalar('train/lr',
                              optimizer.state_dict()['param_groups'][0]['lr'],
                              global_step)
        global_step += 1

    return top1_avg, objs_avg, global_step
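# The try/except-StopIteration dance above is a common way to draw validation
# minibatches "with replacement" from a finite loader. A reusable sketch of
# the same pattern (hypothetical helper, not part of this repo):
def infinite_loader(queue, sampler_seed=0):
    """Sketch: yield batches forever, reshuffling whenever the loader ends."""
    epoch = 0
    while True:
        if hasattr(queue.sampler, 'set_epoch'):
            queue.sampler.set_epoch(sampler_seed + epoch)
        for batch in queue:
            yield batch
        epoch += 1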