Example #1
def main():
    print("evaluate start")

    # set default gpu device id
    # torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    if config.deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.enabled = True
    else:
        torch.backends.cudnn.benchmark = True

    # get data with meta info
    if config.data_loader_type == 'torch':
        input_size, input_channels, n_classes, train_data, valid_data = get_data.get_data(
            config.dataset,
            config.data_path,
            config.cutout_length,
            auto_augmentation=config.auto_augmentation)
        # train_loader = torch.utils.data.DataLoader(train_data,
        #                                            batch_size=config.batch_size,
        #                                            shuffle=True,
        #                                            num_workers=config.workers,
        #                                            pin_memory=True)
        valid_loader = torch.utils.data.DataLoader(
            valid_data,
            batch_size=config.batch_size,
            shuffle=False,
            num_workers=config.workers,
            pin_memory=False)
    elif config.data_loader_type == 'dali':
        input_size, input_channels, n_classes, train_data, valid_data = get_data.get_data_dali(
            config.dataset,
            config.data_path,
            batch_size=config.batch_size,
            num_threads=config.workers)
        # train_loader = train_data
        valid_loader = valid_data
    else:
        raise NotImplementedError

    use_aux = config.aux_weight > 0.
    if config.model_method == 'darts_NAS':
        if config.genotype is None:
            config.genotype = get_model.get_model(config.model_method,
                                                  config.model_name)
        if 'imagenet' in config.dataset.lower():
            model = AugmentCNN_ImageNet(input_size, input_channels,
                                        config.init_channels, n_classes,
                                        config.layers, use_aux,
                                        config.genotype)
        else:
            model = AugmentCNN(input_size, input_channels,
                               config.init_channels, n_classes, config.layers,
                               use_aux, config.genotype)
    elif config.model_method == 'my_model_collection':
        from models.my_searched_model import my_specialized
        if config.structure_path is None:
            _ = config.model_name.split(':')
            net_config_path = os.path.join(project_path, 'models',
                                           'my_model_collection', _[0],
                                           _[1] + '.json')
        else:
            net_config_path = config.structure_path
        # model = my_specialized(num_classes=n_classes, net_config=net_config_path,
        #                        dropout_rate=config.dropout_rate)
        model = my_specialized(num_classes=n_classes,
                               net_config=net_config_path,
                               dropout_rate=0)
    else:
        model_fun = get_model.get_model(config.model_method, config.model_name)
        # model = model_fun(num_classes=n_classes, dropout_rate=config.dropout_rate)
        model = model_fun(num_classes=n_classes, dropout_rate=0)
    # load model
    ckpt = torch.load(config.pretrained)
    print(ckpt.keys())
    # for k in model:
    #     print(k)
    # return
    # set bn
    # model.set_bn_param(config.bn_momentum, config.bn_eps)
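    # the saved EMA weights may still contain 'total_ops'/'total_params' buffers
    # left over from FLOPs profiling (e.g. thop-style counters); strip them so
    # load_state_dict() below does not fail on unexpected keys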
    for _key in list(ckpt['state_dict_ema'].keys()):
        if 'total_ops' in _key or 'total_params' in _key:
            del ckpt['state_dict_ema'][_key]
    model.load_state_dict(ckpt['state_dict_ema'])
    # model init
    # model.init_model(model_init=config.model_init)
    model.cuda()
    # model size
    total_ops, total_params = flops_counter.profile(
        model, [1, input_channels, input_size, input_size])
    print("Model size = {:.3f} MB".format(total_params))
    print("Model FLOPS with input {} = {:.3f} M".format(
        str([1, input_channels, input_size, input_size]), total_ops))
    total_ops, total_params = flops_counter.profile(model, [1, 3, 224, 224])
    print("Model FLOPS with input [1,3,224,224] {:.3f} M".format(total_ops))

    model = nn.DataParallel(model).to(device)
    # CRITERION
    if config.label_smoothing > 0:
        from utils import LabelSmoothLoss
        criterion = LabelSmoothLoss(
            smoothing=config.label_smoothing).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    best_top1 = validate(valid_loader, model, criterion, 0, 0)

    print("Final best Prec@1 = {:.4%}".format(best_top1))
def main():
    logger.info("Logger is set - training start")

    # set default gpu device id
    # torch.cuda.set_device(config.gpus[0])

    # set seed
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed_all(config.seed)

    if config.deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.enabled = True
    else:
        torch.backends.cudnn.benchmark = True

    # get data with meta info
    if config.data_loader_type == 'torch':
        input_size, input_channels, n_classes, train_data, valid_data = get_data.get_data(
            config.dataset, config.data_path, config.cutout_length,
            auto_augmentation=config.auto_augmentation)
        train_loader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=config.batch_size,
                                                   shuffle=True,
                                                   num_workers=config.workers,
                                                   pin_memory=True)
        valid_loader = torch.utils.data.DataLoader(valid_data,
                                                   batch_size=config.batch_size,
                                                   shuffle=False,
                                                   num_workers=config.workers,
                                                   pin_memory=True)
    elif config.data_loader_type == 'dali':
        input_size, input_channels, n_classes, train_data, valid_data = get_data.get_data_dali(
            config.dataset, config.data_path, batch_size=config.batch_size, num_threads=config.workers)
        train_loader = train_data
        valid_loader = valid_data
    else:
        raise NotImplementedError

    if config.label_smoothing > 0:
        from utils import LabelSmoothLoss
        criterion = LabelSmoothLoss(smoothing=config.label_smoothing).to(device)
    else:
        criterion = nn.CrossEntropyLoss().to(device)

    use_aux = config.aux_weight > 0.
    if config.model_method == 'darts_NAS':
        if config.genotype is None:
            config.genotype = get_model.get_model(config.model_method, config.model_name)
        if 'imagenet' in config.dataset.lower():
            model = AugmentCNN_ImageNet(input_size, input_channels, config.init_channels, n_classes, config.layers,
                                        use_aux, config.genotype)
        else:
            model = AugmentCNN(input_size, input_channels, config.init_channels, n_classes, config.layers,
                               use_aux, config.genotype)
    elif config.model_method == 'my_model_collection':
        from models.my_searched_model import my_specialized
        if config.structure_path is None:
            _ = config.model_name.split(':')
            net_config_path = os.path.join(project_path, 'models', 'my_model_collection',
                                           _[0], _[1] + '.json')
        else:
            net_config_path = config.structure_path
        model = my_specialized(num_classes=n_classes, net_config=net_config_path,
                               dropout_rate=config.dropout_rate)
    else:
        model_fun = get_model.get_model(config.model_method, config.model_name)
        model = model_fun(num_classes=n_classes, dropout_rate=config.dropout_rate)
    # set bn
    model.set_bn_param(config.bn_momentum, config.bn_eps)
    # model init
    model.init_model(model_init=config.model_init)
    model.cuda()
    # model size
    total_ops, total_params = flops_counter.profile(model, [1, input_channels, input_size, input_size])
    logger.info("Model size = {:.3f} MB".format(total_params))
    logger.info("Model FLOPS with input {} = {:.3f} M".format(str([1, input_channels, input_size, input_size]),
                                                              total_ops))
    total_ops, total_params = flops_counter.profile(model, [1, 3, 224, 224])
    logger.info("Model FLOPS with input [1,3,224,224] {:.3f} M".format(total_ops))

    model = nn.DataParallel(model).to(device)
    # weights optimizer
    if config.no_decay_keys != 'None':
        keys = config.no_decay_keys.split('#')
        optimizer = torch.optim.SGD([
            {'params': model.module.get_parameters(keys, mode='exclude'), 'weight_decay': config.weight_decay},
            {'params': model.module.get_parameters(keys, mode='include'), 'weight_decay': 0},
        ], lr=config.lr, momentum=config.momentum)
    else:
        optimizer = torch.optim.SGD(model.parameters(), config.lr, momentum=config.momentum,
                                    weight_decay=config.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, config.epochs)

    best_top1 = 0.
    # training loop
    _size = get_iterator_length(train_loader)
    for epoch in range(config.epochs):
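        # NOTE: stepping the scheduler at the top of the epoch follows the pre-1.1
        # PyTorch convention; PyTorch >= 1.1 expects scheduler.step() after
        # optimizer.step() and will warn about this ordering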
        lr_scheduler.step()
        if config.drop_path_prob > 0:
            drop_prob = config.drop_path_prob * epoch / config.epochs
            model.module.drop_path_prob(drop_prob)

        # training
        train(train_loader, model, optimizer, criterion, epoch)

        # validation
        cur_step = (epoch+1) * _size
        top1 = validate(valid_loader, model, criterion, epoch, cur_step)

        # save
        if best_top1 < top1:
            best_top1 = top1
            is_best = True
            logger.info("Current best Prec@1 = {:.4%}".format(best_top1))
        else:
            is_best = False
        utils.save_checkpoint(model, config.path, is_best)

        print("")

    logger.info("Final best Prec@1 = {:.4%}".format(best_top1))
def main():
    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    if 'WORLD_SIZE' in os.environ:
        args.distributed = int(os.environ['WORLD_SIZE']) > 1
        if args.distributed and args.num_gpu > 1:
            logger.warning(
                'Using more than one GPU per process in distributed mode is not allowed. Setting num_gpu to 1.'
            )
            args.num_gpu = 1

    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank
    if args.distributed:
        args.num_gpu = 1
        args.device = 'cuda:%d' % args.local_rank
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')
        args.world_size = torch.distributed.get_world_size()
        args.rank = torch.distributed.get_rank()
    assert args.rank >= 0

    if args.distributed:
        logger.info(
            'Training in distributed mode with multiple processes, 1 GPU per process. Process %d, total %d.'
            % (args.rank, args.world_size))
    else:
        logger.info('Training with a single process on %d GPUs.' %
                    args.num_gpu)

    torch.manual_seed(args.seed + args.rank)

    # my model
    use_aux = args.aux_weight > 0.
    if args.model_method == 'darts_NAS':
        if args.genotype is None:
            args.genotype = get_model.get_model(args.model_method,
                                                args.model_name)
        model = AugmentCNN_ImageNet(224, 3, args.init_channels,
                                    args.num_classes, args.layers, use_aux,
                                    args.genotype)
    elif args.model_method == 'my_model_collection':
        from models.my_searched_model import my_specialized
        if args.structure_path is None:
            _ = args.model_name.split(':')
            net_config_path = os.path.join(project_path, 'models',
                                           'my_model_collection', _[0],
                                           _[1] + '.json')
        else:
            net_config_path = args.structure_path
        model = my_specialized(num_classes=args.num_classes,
                               net_config=net_config_path,
                               dropout_rate=args.drop)
    else:
        model_fun = get_model.get_model(args.model_method, args.model_name)
        model = model_fun(num_classes=args.num_classes, dropout_rate=args.drop)
    # set bn
    model.set_bn_param(args.bn_momentum, args.bn_eps)
    # model init
    model.init_model(model_init=args.model_init)
    total_ops, total_params = flops_counter.profile(model, [1, 3, 224, 224])
    logger.info("Model size = {:.3f} MB".format(total_params))
    logger.info(
        "Model FLOPS with input [1,3,224,224] = {:.3f} M".format(total_ops))
    # pdb.set_trace()
    # model = create_model(
    #     args.model,
    #     pretrained=args.pretrained,
    #     num_classes=args.num_classes,
    #     drop_rate=args.drop,
    #     drop_connect_rate=args.drop_connect,
    #     global_pool=args.gp,
    #     bn_tf=args.bn_tf,
    #     bn_momentum=args.bn_momentum,
    #     bn_eps=args.bn_eps,
    #     checkpoint_path=args.initial_checkpoint)

    if args.local_rank == 0:
        logger.info('Model %s created, param count: %d' %
                    (args.model, sum([m.numel() for m in model.parameters()])))

    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)

    num_aug_splits = 0
    if args.aug_splits > 0:
        assert args.aug_splits > 1, 'A split of 1 makes no sense'
        num_aug_splits = args.aug_splits

    if args.split_bn:
        assert num_aug_splits > 1 or args.resplit
        model = convert_splitbn_model(model, max(num_aug_splits, 2))

    if args.num_gpu > 1:
        if args.amp:
            logger.warning(
                'AMP does not work well with nn.DataParallel, disabling. Use distributed mode for multi-GPU AMP.'
            )
            args.amp = False
        model = nn.DataParallel(model,
                                device_ids=list(range(args.num_gpu))).cuda()
    else:
        model.cuda()

    optimizer = create_optimizer(args, model)

    use_amp = False
    if has_apex and args.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        use_amp = True
    if args.local_rank == 0:
        logger.info('NVIDIA APEX {}. AMP {}.'.format(
            'installed' if has_apex else 'not installed',
            'on' if use_amp else 'off'))

    # optionally resume from a checkpoint
    resume_state = {}
    resume_epoch = None
    if args.resume:
        resume_state, resume_epoch = resume_checkpoint(model, args.resume)
    if resume_state and not args.no_resume_opt:
        if 'optimizer' in resume_state:
            if args.local_rank == 0:
                logger.info('Restoring Optimizer state from checkpoint')
            optimizer.load_state_dict(resume_state['optimizer'])
        if use_amp and 'amp' in resume_state and 'load_state_dict' in amp.__dict__:
            if args.local_rank == 0:
                logger.info('Restoring NVIDIA AMP state from checkpoint')
            amp.load_state_dict(resume_state['amp'])
    del resume_state

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume=args.resume)

    if args.distributed:
        if args.sync_bn:
            assert not args.split_bn
            try:
                if has_apex:
                    model = convert_syncbn_model(model)
                else:
                    model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(
                        model)
                if args.local_rank == 0:
                    logger.info(
                        'Converted model to use Synchronized BatchNorm. WARNING: You may have issues if using '
                        'zero initialized BN layers (enabled by default for ResNets) while sync-bn enabled.'
                    )
            except Exception as e:
                logger.error(
                    'Failed to enable Synchronized BatchNorm. Install Apex or Torch >= 1.1'
                )
        if has_apex:
            model = DDP(model, delay_allreduce=True)
        else:
            if args.local_rank == 0:
                logger.info(
                    "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP."
                )
            model = DDP(model, device_ids=[args.local_rank])  # can use device str in Torch >= 1.1
        # NOTE: EMA model does not need to be wrapped by DDP

    lr_scheduler, num_epochs = create_scheduler(args, optimizer)
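    # create_scheduler() returns a timm scheduler; unlike torch's built-in schedulers
    # it is stepped with an explicit epoch index (and optionally a metric), which is
    # why step(start_epoch) and step(epoch + 1, eval_metrics[...]) are used below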
    start_epoch = 0
    if args.start_epoch is not None:
        # a specified start_epoch will always override the resume epoch
        start_epoch = args.start_epoch
    elif resume_epoch is not None:
        start_epoch = resume_epoch
    if lr_scheduler is not None and start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logger.info('Scheduled epochs: {}'.format(num_epochs))

    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        logger.error('Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    collate_fn = None
    if args.prefetcher and args.mixup > 0:
        assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
        collate_fn = FastCollateMixup(args.mixup, args.smoothing,
                                      args.num_classes)

    if num_aug_splits > 1:
        dataset_train = AugMixDataset(dataset_train, num_splits=num_aug_splits)

    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=args.train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
    )

    eval_dir = os.path.join(args.data, 'val')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            logger.error(
                'Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )

    if args.jsd:
        assert num_aug_splits > 1  # JSD only valid with aug splits set
        train_loss_fn = JsdCrossEntropy(num_splits=num_aug_splits,
                                        smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.mixup > 0.:
        # smoothing is handled with mixup label transform
        train_loss_fn = SoftTargetCrossEntropy().cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    elif args.smoothing:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=args.smoothing).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    eval_metric = args.eval_metric
    best_metric = None
    best_epoch = None
    saver = None
    output_dir = ''  # train_epoch() and update_summary() below reference output_dir, so keep it defined even while the saver block stays commented out
    # if args.local_rank == 0:
    #     output_base = args.output if args.output else './output'
    #     exp_name = '-'.join([
    #         datetime.now().strftime("%Y%m%d-%H%M%S"),
    #         args.model_method,
    #         args.model_name,
    #         str(data_config['input_size'][-1])
    #     ])
    #     output_dir = get_outdir(output_base, 'train', exp_name)
    #     decreasing = True if eval_metric == 'loss' else False
    #     saver = CheckpointSaver(checkpoint_dir=output_dir, decreasing=decreasing)
    #     with open(os.path.join(output_dir, 'args.yaml'), 'w') as f:
    #         f.write(args_text)

    try:
        for epoch in range(start_epoch, num_epochs):
            if args.distributed:
                loader_train.sampler.set_epoch(epoch)

            train_metrics = train_epoch(epoch,
                                        model,
                                        loader_train,
                                        optimizer,
                                        train_loss_fn,
                                        args,
                                        lr_scheduler=lr_scheduler,
                                        saver=saver,
                                        output_dir=output_dir,
                                        use_amp=use_amp,
                                        model_ema=model_ema)

            if args.distributed and args.dist_bn in ('broadcast', 'reduce'):
                if args.local_rank == 0:
                    logger.info(
                        "Distributing BatchNorm running means and vars")
                distribute_bn(model, args.world_size, args.dist_bn == 'reduce')

            eval_metrics = validate(model, loader_eval, validate_loss_fn, args)

            if model_ema is not None and not args.model_ema_force_cpu:
                if args.distributed and args.dist_bn in ('broadcast',
                                                         'reduce'):
                    distribute_bn(model_ema, args.world_size,
                                  args.dist_bn == 'reduce')

                ema_eval_metrics = validate(model_ema.ema,
                                            loader_eval,
                                            validate_loss_fn,
                                            args,
                                            log_suffix=' (EMA)')
                eval_metrics = ema_eval_metrics

            if lr_scheduler is not None:
                # step LR for next epoch
                lr_scheduler.step(epoch + 1, eval_metrics[eval_metric])

            update_summary(epoch,
                           train_metrics,
                           eval_metrics,
                           os.path.join(output_dir, 'summary.csv'),
                           write_header=best_metric is None)

            if saver is not None:
                # save proper checkpoint with eval metric
                save_metric = eval_metrics[eval_metric]
                best_metric, best_epoch = saver.save_checkpoint(
                    model,
                    optimizer,
                    args,
                    epoch=epoch,
                    model_ema=model_ema,
                    metric=save_metric,
                    use_amp=use_amp)

    except KeyboardInterrupt:
        pass
    if best_metric is not None:
        logger.info('*** Best metric: {0} (epoch {1})'.format(
            best_metric, best_epoch))
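
This third example follows the timm training-script pattern: one process per GPU, with WORLD_SIZE and the local rank supplied by the launcher. Under that assumption it would typically be started with something like the line below (script name and GPU count are placeholders):

    python -m torch.distributed.launch --nproc_per_node=8 train.py <further args>

torch.distributed.launch exports WORLD_SIZE (plus the rendezvous variables used by init_method='env://') and passes --local_rank to each process, which is what the distributed branch at the top of main() relies on.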