Example #1
def infer(valid_queue, model, alpha, criterion):
    """Run model in eval only mode."""
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    # model.eval()

    with torch.no_grad():
        for step, (data, target) in enumerate(valid_queue):
            n = data.size(0)
            data = data.cuda()
            target = target.cuda()

            weights = alpha(data.size(0))
            logits = model(data, weights)
            loss = criterion(logits, target)

            # Calculate the accuracy.
            prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))

            objs.update(loss.data, n)
            top1.update(prec1.data, n)
            top5.update(prec5.data, n)

            if step % args.report_freq == 0 or step == len(valid_queue) - 1:
                objs_avg = utils.reduce_tensor(objs.avg, args.world_size)
                top1_avg = utils.reduce_tensor(top1.avg, args.world_size)
                top5_avg = utils.reduce_tensor(top5.avg, args.world_size)
                logging.info('valid %03d %e %f %f', step, objs_avg, top1_avg,
                             top5_avg)

    return top1_avg, objs_avg
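
Note: these examples rely on two small helpers from the project's utils module that are not shown above. Their exact implementations are not included here; the following is a minimal sketch, assuming utils.AverageMeter keeps a running average via update(val, n) and utils.reduce_tensor averages a tensor across args.world_size workers (with world_size 1 it reduces to a plain sum, as the comment in Example #5 indicates).

import torch
import torch.distributed as dist


class AverageMeter(object):
    """Running average of a scalar such as loss or accuracy."""

    def __init__(self):
        self.avg = 0.
        self.sum = 0.
        self.cnt = 0

    def update(self, val, n=1):
        # 'val' is the per-batch value, 'n' the batch size.
        self.sum += val * n
        self.cnt += n
        self.avg = self.sum / self.cnt


def reduce_tensor(tensor, world_size):
    # Sum the tensor over all workers, then divide by the world size.
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)
    rt /= world_size
    return rt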
Example #2
def train(train_queue, model, criterion, optimizer, epoch, init_lr,
          warmup_epochs, global_step):
    objs = utils.AvgrageMeter()
    top1 = utils.AvgrageMeter()
    top5 = utils.AvgrageMeter()

    model.train()
    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Change lr.
        if epoch < warmup_epochs:
            len_epoch = len(train_queue)
            scale = float(1 + step + epoch * len_epoch) / \
                (warmup_epochs * len_epoch)
            lr = init_lr * scale
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

        # Forward.
        optimizer.zero_grad()
        logits, logits_aux = model(data)
        loss = criterion(logits, target)
        if args.auxiliary:
            loss_aux = criterion(logits_aux, target)
            loss += args.auxiliary_weight * loss_aux

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        ############# APEX #############
        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)

        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)
        ################################

        if step % args.report_freq == 0:
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
            writer.add_scalar('train/loss', objs.avg, global_step)
            writer.add_scalar('train/acc_top1', top1.avg, global_step)
            writer.add_scalar('train/acc_top5', top5.avg, global_step)
            writer.add_scalar('train/lr',
                              optimizer.state_dict()['param_groups'][0]['lr'],
                              global_step)
        global_step += 1

    return top1.avg, objs.avg, global_step
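
Examples #2, #4, and #5 call to_python_float and wrap the model in DDP without showing the corresponding imports. Assuming these come from NVIDIA Apex (which provides apex.fp16_utils.to_python_float and apex.parallel.DistributedDataParallel with a delay_allreduce flag), a minimal one-process-per-GPU setup sketch might look like this:

import argparse

import torch
from apex.fp16_utils import to_python_float
from apex.parallel import DistributedDataParallel as DDP

parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0)
args, _ = parser.parse_known_args()

# One process per GPU: bind this process to its device and join the group.
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')
args.world_size = torch.distributed.get_world_size()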
Example #3
def train_init(train_queue, model, alpha, criterion, optimizer, weight_params):
    """Update network weights on train set and architecture on val set."""
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    for step, (data, target) in enumerate(train_queue):
        model.train()
        n = data.size(0)

        # Update network weights using the train set.
        data = data.cuda()
        target = target.cuda()

        weights = alpha(data.size(0))
        weights_no_grad = alpha.module.clone_weights(weights)

        optimizer.zero_grad()
        logits = model(data, weights_no_grad)
        loss = criterion(logits, target)
        # Add a zero-valued sum over all parameters so that every parameter
        # participates in the backward pass.
        dummy = sum([torch.sum(param) for param in model.parameters()])
        loss += dummy * 0.
        loss.backward()
        nn.utils.clip_grad_norm_(weight_params, args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))

        objs.update(loss.data, n)
        top1.update(prec1.data, n)
        top5.update(prec5.data, n)

        if (step + 1) % args.report_freq == 0 or step == len(train_queue) - 1:
            objs_avg = utils.reduce_tensor(objs.avg, args.world_size)
            top1_avg = utils.reduce_tensor(top1.avg, args.world_size)
            top5_avg = utils.reduce_tensor(top5.avg, args.world_size)
            logging.info('train_init %03d %e %f %f', step, objs_avg, top1_avg,
                         top5_avg)

    return top1_avg, objs_avg
Example #4
def train(train_queue, model, criterion, optimizer, global_step):
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()

    model.train()
    for step, (data, target) in enumerate(train_queue):
        n = data.size(0)
        data = data.cuda()
        target = target.cuda()

        # Forward.
        optimizer.zero_grad()
        logits = model(data)
        loss = criterion(logits, target)

        # Backward and step.
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        reduced_loss = utils.reduce_tensor(loss.data, args.world_size)
        prec1 = utils.reduce_tensor(prec1, args.world_size)
        prec5 = utils.reduce_tensor(prec5, args.world_size)

        objs.update(to_python_float(reduced_loss), n)
        top1.update(to_python_float(prec1), n)
        top5.update(to_python_float(prec5), n)

        if (step + 1) % args.report_freq == 0:
            current_lr = optimizer.param_groups[0]['lr']
            logging.info('train %03d %e %f %f lr: %e', step, objs.avg,
                         top1.avg, top5.avg, current_lr)
        global_step += 1

    return top1.avg, top5.avg, objs.avg, global_step
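
Every example computes top-1/top-5 precision through utils.accuracy(logits, target, topk=(1, 5)), which is not listed here. A common implementation of such a helper (a sketch, not necessarily the project's exact code) is:

def accuracy(output, target, topk=(1,)):
    """Return the top-k precision (in percent) for each k in topk."""
    maxk = max(topk)
    batch_size = target.size(0)

    # Indices of the maxk largest logits per sample, shape (maxk, batch).
    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res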
Example #5
def main():
    # Scale learning rate based on global batch size.
    if not args.no_scale_lr:
        scale = float(args.batch_size * args.world_size) / 128.0
        args.learning_rate = scale * args.learning_rate

    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)
    np.random.seed(args.seed)
    cudnn.benchmark = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info('args = %s', args)

    # Get data loaders.
    traindir = os.path.join(args.data, 'train')
    validdir = os.path.join(args.data, 'val')
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.4,
                               contrast=0.4,
                               saturation=0.4,
                               hue=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])

    if 'lmdb' in args.data:
        train_data = imagenet_lmdb_dataset(traindir, transform=train_transform)
        valid_data = imagenet_lmdb_dataset(validdir, transform=val_transform)
    else:
        train_data = dset.ImageFolder(traindir, transform=train_transform)
        valid_data = dset.ImageFolder(validdir, transform=val_transform)

    train_sampler = torch.utils.data.distributed.DistributedSampler(train_data)
    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=8,
                                              sampler=train_sampler)

    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=8)

    # Set up the network.
    if os.path.isfile(args.genotype):
        logging.info('Loading genotype from: %s' % args.genotype)
        genotype = torch.load(args.genotype, map_location='cpu')
    else:
        logging.info('Loading genotype: %s' % args.genotype)
        genotype = eval('genotypes.%s' % args.genotype)
    if not isinstance(genotype, list):
        genotype = [genotype]

    # If num channels not provided, find the max under 600M MAdds.
    if args.init_channels < 0:
        if args.local_rank == 0:
            flops, num_params, init_channels = find_max_channels(
                genotype, args.layers, args.max_M_flops * 1e6)
            logging.info('Num flops = %.2fM', flops / 1e6)
            logging.info('Num params = %.2fM', num_params / 1e6)
        else:
            init_channels = 0
        # All reduce with world_size 1 is sum.
        init_channels = torch.Tensor([init_channels]).cuda()
        init_channels = utils.reduce_tensor(init_channels, 1)
        args.init_channels = int(init_channels.item())
    logging.info('Num channels = %d', args.init_channels)

    # Create model and loss.
    model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary,
                    genotype)
    model = model.cuda()
    model = DDP(model, delay_allreduce=True)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()
    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()
    logging.info('param size = %fM', utils.count_parameters_in_M(model))

    # Set up network weights optimizer.
    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs), eta_min=args.min_learning_rate)

    # Train.
    global_step = 0
    best_acc_top1 = 0
    for epoch in range(args.epochs):
        # Shuffle the sampler, update lrs.
        train_queue.sampler.set_epoch(epoch + args.seed)
        # Change lr.
        if epoch >= args.warmup_epochs:
            scheduler.step()
        model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        # Training.
        train_acc, train_obj, global_step = train(train_queue, model,
                                                  criterion_smooth, optimizer,
                                                  epoch, args.learning_rate,
                                                  args.warmup_epochs,
                                                  global_step)
        logging.info('train_acc %f', train_acc)
        writer.add_scalar('train/acc', train_acc, global_step)

        # Validation.
        valid_acc_top1, valid_acc_top5, valid_obj = infer(
            valid_queue, model, criterion)
        logging.info('valid_acc_top1 %f', valid_acc_top1)
        logging.info('valid_acc_top5 %f', valid_acc_top5)
        writer.add_scalar('val/acc_top1', valid_acc_top1, global_step)
        writer.add_scalar('val/acc_top5', valid_acc_top5, global_step)
        writer.add_scalar('val/loss', valid_obj, global_step)

        is_best = False
        if valid_acc_top1 > best_acc_top1:
            best_acc_top1 = valid_acc_top1
            is_best = True

        if args.local_rank == 0:
            utils.save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, is_best, args.save)
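
Example #5 builds the training loss with CrossEntropyLabelSmooth(CLASSES, args.label_smooth), whose definition is not shown. A typical label-smoothing cross-entropy matching that constructor (again a sketch under that assumption) is:

import torch
import torch.nn as nn


class CrossEntropyLabelSmooth(nn.Module):
    """Cross entropy against a uniformly smoothed target distribution."""

    def __init__(self, num_classes, epsilon):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets):
        log_probs = self.logsoftmax(inputs)
        # One-hot targets, then mix with the uniform distribution.
        targets = torch.zeros_like(log_probs).scatter_(
            1, targets.unsqueeze(1), 1)
        targets = (1 - self.epsilon) * targets \
            + self.epsilon / self.num_classes
        loss = (-targets * log_probs).mean(0).sum()
        return loss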
Example #6
def train(train_queue, valid_queue, model, alpha, nas, criterion, optimizer,
          global_step, weight_params, seed):
    """Update network weights on train set and architecture on val set."""
    objs = utils.AverageMeter()
    top1 = utils.AverageMeter()
    top5 = utils.AverageMeter()
    nas.reset_counter()

    # Init meta queue iterator.
    valid_queue.sampler.set_epoch(global_step + seed)
    valid_queue_iterator = iter(valid_queue)

    for step, (data, target) in enumerate(train_queue):
        model.train()
        n = data.size(0)

        # Use the same minibatch for updating weights as well as arch params.
        data = data.cuda()
        target = target.cuda()

        # architecture update
        weights = alpha(data.size(0))
        weights_no_grad = alpha.module.clone_weights(weights)
        # Get a random minibatch from the valid queue with replacement.
        # Use this to update the architecture.
        if args.val_arch_update or args.gen_error_alpha:
            try:
                input_valid, target_valid = next(valid_queue_iterator)
            except StopIteration:
                # Validation iterator exhausted: reshuffle and restart it.
                valid_queue.sampler.set_epoch(global_step + seed)
                valid_queue_iterator = iter(valid_queue)
                input_valid, target_valid = next(valid_queue_iterator)

            input_valid = input_valid.cuda()
            target_valid = target_valid.cuda()
            if args.gen_error_alpha:
                nas.step(data, target, global_step, weights, input_valid,
                         target_valid, optimizer)
            else:
                nas.step(input_valid, target_valid, global_step, weights)
        else:
            nas.step(data, target, global_step, weights)

        # network update
        optimizer.zero_grad()
        logits = model(data, weights_no_grad)
        loss = criterion(logits, target)
        # Add a zero-valued sum over all parameters so that every parameter
        # participates in the backward pass.
        dummy = sum([torch.sum(param) for param in model.parameters()])
        loss += dummy * 0.
        loss.backward()
        nn.utils.clip_grad_norm_(weight_params, args.grad_clip)
        optimizer.step()

        # Calculate the accuracy.
        prec1, prec5 = utils.accuracy(logits, target, topk=(1, 5))
        objs.update(loss.data, n)
        top1.update(prec1.data, n)
        top5.update(prec5.data, n)

        if (step + 1) % args.report_freq == 0 or step == len(train_queue) - 1:
            objs_avg = utils.reduce_tensor(objs.avg, args.world_size)
            top1_avg = utils.reduce_tensor(top1.avg, args.world_size)
            top5_avg = utils.reduce_tensor(top5.avg, args.world_size)

            logging.info('train %03d %e %f %f', step, objs_avg, top1_avg,
                         top5_avg)
            writer.add_scalar('train/loss', objs_avg, global_step)
            writer.add_scalar('train/acc1', top1_avg, global_step)
            writer.add_scalar('train/lr',
                              optimizer.state_dict()['param_groups'][0]['lr'],
                              global_step)
        global_step += 1

    return top1_avg, objs_avg, global_step