Example 1
    def train(self):
        switches = None  # candidate-operation masks carried between stages; None at the first stage
        for epoch in range(self.pdarts_epoch):

            # progressively deepen the network at each P-DARTS stage
            layers = self.layers + self.pdarts_num_layers[epoch]
            model, criterion, optim, lr_scheduler = self.model_creator(layers)
            self.mutator = PdartsMutator(model, epoch, self.pdarts_num_to_drop, switches)

            for callback in self.callbacks:
                callback.build(model, self.mutator, self)
                callback.on_epoch_begin(epoch)

            darts_callbacks = []
            if lr_scheduler is not None:
                darts_callbacks.append(LRSchedulerCallback(lr_scheduler))

            self.trainer = DartsTrainer(model, mutator=self.mutator, loss=criterion, optimizer=optim,
                                        callbacks=darts_callbacks, **self.darts_parameters)
            logger.info("start pdarts training epoch %s...", epoch)

            self.trainer.train()

            # prune the weakest candidate paths before the next, deeper stage
            switches = self.mutator.drop_paths()

            for callback in self.callbacks:
                callback.on_epoch_end(epoch)
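The four objects unpacked from self.model_creator(layers) define each stage's training setup from scratch. A minimal sketch of such a callable, assuming the CIFAR-10 CNN search space from Example 2 (the constructor arguments and hyper-parameters here are illustrative assumptions, not P-DARTS defaults):

def model_creator(layers):
    # build a fresh search space at the depth requested for this stage
    model = CNN(32, 3, 16, 10, layers)
    criterion = nn.CrossEntropyLoss()
    optim = torch.optim.SGD(model.parameters(), 0.025,
                            momentum=0.9, weight_decay=3.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optim, T_max=50, eta_min=0.001)
    return model, criterion, optim, lr_scheduler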
Example 2
def main(args):
    reset_seed(args.seed)
    prepare_logger(args)

    logger.info("These are the hyper-parameters you want to tune:\n%s",
                pprint.pformat(vars(args)))

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    train_loader, test_loader = data_preprocess(args)
    # model = models.__dict__[args.model](num_classes=10)
    model = CNN(32, 3, args.channels, 10, args.layers)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.initial_lr,
                               weight_decay=args.weight_decay)
    else:
        if args.optimizer == 'sgd':
            optimizer_cls = optim.SGD
        elif args.optimizer == 'rmsprop':
            optimizer_cls = optim.RMSprop
        else:
            raise ValueError("unsupported optimizer: %s" % args.optimizer)
        optimizer = optimizer_cls(model.parameters(),
                                  lr=args.initial_lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)

    if args.lr_scheduler == 'cosin':  # 'cosin' (sic) is the spelling this script's CLI expects
        scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.epochs, eta_min=args.ending_lr)
    elif args.lr_scheduler == 'linear':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=15,
                                              gamma=0.1)
    else:
        raise ValueError("unsupported lr_scheduler: %s" % args.lr_scheduler)

    trainer = DartsTrainer(
        model,
        loss=criterion,
        metrics=lambda output, target: accuracy(output, target),
        optimizer=optimizer,
        num_epochs=args.epochs,
        dataset_train=train_loader,
        dataset_valid=test_loader,
        batch_size=args.batch_size,
        log_frequency=args.log_frequency,
        unrolled=args.unrolled,
        callbacks=[
            LRSchedulerCallback(scheduler),
            ArchitectureCheckpoint("./checkpoints_layer5")
        ])

    if args.visualization:
        trainer.enable_visualization()
    trainer.train()
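data_preprocess is defined elsewhere in this script; a plausible sketch for the CIFAR-10 setting implied by CNN(32, 3, args.channels, 10, args.layers) (the helper's name comes from the snippet, its body is an assumption; DartsTrainer accepts the raw datasets and builds its own train/val loaders):

import torchvision
import torchvision.transforms as T

def data_preprocess(args):
    # CIFAR-10: 32x32 RGB images, 10 classes, matching the CNN arguments above
    train_transform = T.Compose([T.RandomCrop(32, padding=4),
                                 T.RandomHorizontalFlip(),
                                 T.ToTensor()])
    train_set = torchvision.datasets.CIFAR10("./data", train=True,
                                             download=True, transform=train_transform)
    test_set = torchvision.datasets.CIFAR10("./data", train=False,
                                            download=True, transform=T.ToTensor())
    return train_set, test_set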
Example 3
        mutator = enas.EnasMutator(model,
                                   tanh_constant=1.1,
                                   cell_exit_extra_step=True)
    else:
        raise AssertionError

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                0.05,
                                momentum=0.9,
                                weight_decay=1.0E-4)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                              T_max=num_epochs,
                                                              eta_min=0.001)

    trainer = enas.EnasTrainer(model,
                               loss=criterion,
                               metrics=accuracy,
                               reward_function=reward_accuracy,
                               optimizer=optimizer,
                               callbacks=[
                                   LRSchedulerCallback(lr_scheduler),
                                   ArchitectureCheckpoint("./checkpoints")
                               ],
                               batch_size=args.batch_size,
                               num_epochs=num_epochs,
                               dataset_train=dataset_train,
                               dataset_valid=dataset_valid,
                               log_frequency=args.log_frequency,
                               mutator=mutator)
    trainer.train()
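reward_accuracy supplies the scalar reward that the ENAS controller maximizes; it is not shown in this snippet. A sketch with the contract the trainer requires (the body is an assumption):

def reward_accuracy(output, target, topk=(1,)):
    # scalar top-1 accuracy of the sampled child network, used as the RL reward
    batch_size = target.size(0)
    _, predicted = torch.max(output, 1)
    return (predicted == target).sum().item() / batch_size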
Example 4
    criterion = nn.CrossEntropyLoss()

    if args.arch is not None:
        logger.info('model retraining...')
        with open(args.arch, 'r') as f:
            arch = json.load(f)
        # look up this architecture's recorded stats in the NAS-Bench-201 benchmark
        for trial in query_nb201_trial_stats(arch, 200, 'cifar100'):
            pprint.pprint(trial)
        apply_fixed_architecture(model, args.arch)
        dataloader_train = DataLoader(dataset_train, batch_size=args.batch_size, shuffle=True, num_workers=0)
        dataloader_valid = DataLoader(dataset_valid, batch_size=args.batch_size, shuffle=True, num_workers=0)
        train(args, model, dataloader_train, dataloader_valid, criterion, optim,
              torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
        exit(0)

    trainer = enas.EnasTrainer(model,
                               loss=criterion,
                               metrics=lambda output, target: accuracy(output, target, topk=(1,)),
                               reward_function=reward_accuracy,
                               optimizer=optim,
                               callbacks=[LRSchedulerCallback(lr_scheduler), ArchitectureCheckpoint("./checkpoints")],
                               batch_size=args.batch_size,
                               num_epochs=args.epochs,
                               dataset_train=dataset_train,
                               dataset_valid=dataset_valid,
                               log_frequency=args.log_frequency)

    if args.visualization:
        trainer.enable_visualization()
    trainer.train()
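The train function used in the retraining branch is defined elsewhere; a minimal sketch of a standard fixed-architecture loop with the same signature (everything in the body is an assumption):

def train(args, model, dataloader_train, dataloader_valid, criterion, optimizer, device):
    model.to(device)
    for epoch in range(args.epochs):
        model.train()
        for x, y in dataloader_train:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()
        # quick validation pass at the end of each epoch
        model.eval()
        correct = total = 0
        with torch.no_grad():
            for x, y in dataloader_valid:
                x, y = x.to(device), y.to(device)
                correct += (model(x).argmax(dim=1) == y).sum().item()
                total += y.size(0)
        print("epoch %d: validation accuracy %.4f" % (epoch, correct / total))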
Example 5
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lambda step: (1.0 - step / args.epochs) if step <= args.epochs else 0,
        last_epoch=-1)
    train_loader = get_imagenet_iter_dali(
        "train",
        args.imagenet_dir,
        args.batch_size,
        args.workers,
        spos_preprocessing=args.spos_preprocessing)
    valid_loader = get_imagenet_iter_dali(
        "val",
        args.imagenet_dir,
        args.batch_size,
        args.workers,
        spos_preprocessing=args.spos_preprocessing)
    trainer = SPOSSupernetTrainer(model,
                                  criterion,
                                  accuracy,
                                  optimizer,
                                  args.epochs,
                                  train_loader,
                                  valid_loader,
                                  mutator=mutator,
                                  batch_size=args.batch_size,
                                  log_frequency=args.log_frequency,
                                  workers=args.workers,
                                  callbacks=[
                                      LRSchedulerCallback(scheduler),
                                      ModelCheckpoint("./checkpoints")
                                  ])
    trainer.train()
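get_imagenet_iter_dali builds NVIDIA DALI pipelines for fast ImageNet loading. When DALI is not available, a plain torchvision loader can stand in for experimentation; a sketch (this helper is not part of the original script, and SPOS uses its own preprocessing, hence the spos_preprocessing flag above):

from torch.utils.data import DataLoader
from torchvision import transforms as T
from torchvision.datasets import ImageFolder

def get_imagenet_loader(split, root, batch_size, workers):
    # standard ImageNet transforms; only a stand-in for the DALI pipeline
    if split == "train":
        transform = T.Compose([T.RandomResizedCrop(224),
                               T.RandomHorizontalFlip(),
                               T.ToTensor(),
                               T.Normalize([0.485, 0.456, 0.406],
                                           [0.229, 0.224, 0.225])])
    else:
        transform = T.Compose([T.Resize(256),
                               T.CenterCrop(224),
                               T.ToTensor(),
                               T.Normalize([0.485, 0.456, 0.406],
                                           [0.229, 0.224, 0.225])])
    dataset = ImageFolder("%s/%s" % (root, split), transform)
    return DataLoader(dataset, batch_size=batch_size,
                      shuffle=(split == "train"),
                      num_workers=workers, pin_memory=True)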
Example 6
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 eps=1e-3,
                                 weight_decay=2e-6)
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=args.epochs, eta_min=1e-5)

    trainer = TextNASTrainer(
        model,
        loss=criterion,
        metrics=lambda output, target: {"acc": accuracy(output, target)},
        reward_function=accuracy,
        optimizer=optimizer,
        callbacks=[LRSchedulerCallback(lr_scheduler)],
        batch_size=args.batch_size,
        num_epochs=args.epochs,
        dataset_train=None,
        dataset_valid=None,
        train_loader=train_loader,
        valid_loader=valid_loader,
        test_loader=test_loader,
        log_frequency=args.log_frequency,
        mutator=mutator,
        mutator_lr=2e-3,
        mutator_steps=500,
        mutator_steps_aggregate=1,
        child_steps=3000,
        baseline_decay=0.99,
        test_arc_per_epoch=10)
    trainer.train()
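Two different contracts appear above: metrics must return a dict of named values for logging, while reward_function must return one scalar for the controller to maximize. A helper satisfying both at once (the name accuracy comes from the snippet; the body is an assumption):

def accuracy(output, target):
    # scalar top-1 accuracy: wrapped in a dict for metrics, raw for the reward
    return (output.argmax(dim=1) == target).float().mean().item()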
Example 7
def main(args):
    reset_seed(args.seed)
    prepare_logger(args)

    logger.info("These are the hyper-parameters you want to tune:\n%s",
                pprint.pformat(vars(args)))

    if args.model == 'nas':
        logger.info("Using NAS.\n")
        if args.fix_arch:
            if not os.path.exists(args.arc_checkpoint):
                print(args.arc_checkpoint,
                      'does not exist; not fixing the architecture')
                args.fix_arch = False

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    if args.model == 'nas':
        if not args.fix_arch:
            model = CNN(32, 3, args.channels, 10, args.layers)
            trainset, testset = data_preprocess(args)
        else:
            model = CNN(32, 3, args.channels, 10, args.layers)
            apply_fixed_architecture(model, args.arc_checkpoint)
            model.to(device)
            train_loader, test_loader = data_preprocess(args)
    else:
        train_loader, test_loader = data_preprocess(args)
        model = models.__dict__[args.model]()
        model.to(device)

    criterion = nn.CrossEntropyLoss()
    if args.optimizer == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.initial_lr,
                               weight_decay=args.weight_decay)
    else:
        if args.optimizer == 'sgd':
            optimizer_cls = optim.SGD
        elif args.optimizer == 'rmsprop':
            optimizer_cls = optim.RMSprop
        else:
            raise ValueError("unsupported optimizer: %s" % args.optimizer)
        optimizer = optimizer_cls(model.parameters(),
                                  lr=args.initial_lr,
                                  momentum=args.momentum,
                                  weight_decay=args.weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                     args.epochs,
                                                     eta_min=args.ending_lr)

    if args.model == 'nas' and not args.fix_arch:
        trainer = DartsTrainer(model,
                               loss=criterion,
                               metrics=lambda output, target: accuracyTopk(
                                   output, target, topk=(1, )),
                               optimizer=optimizer,
                               num_epochs=args.epochs,
                               dataset_train=trainset,
                               dataset_valid=testset,
                               batch_size=args.batch_size,
                               log_frequency=args.log_frequency,
                               unrolled=args.unrolled,
                               callbacks=[
                                   LRSchedulerCallback(scheduler),
                                   ArchitectureCheckpoint("./checkpoints")
                               ])
        if args.visualization:
            trainer.enable_visualization()
        trainer.train()
        trainer.export("final_arch.json")
    else:
        for epoch in range(1, args.epochs + 1):
            train(model, train_loader, criterion, optimizer, scheduler, args,
                  epoch, device)
            top1, _ = test(model, test_loader, criterion, args, epoch, device)
            nni.report_intermediate_result(top1)
        logger.info("Final accuracy is: %.6f", top1)
        nni.report_final_result(top1)
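accuracyTopk is defined elsewhere in this script; a common top-k implementation matching the call accuracyTopk(output, target, topk=(1,)) above (the dict-of-metrics return format is an assumption):

def accuracyTopk(output, target, topk=(1,)):
    # top-k classification accuracy, one entry per requested k
    maxk = max(topk)
    batch_size = target.size(0)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    return {"acc%d" % k: correct[:k].reshape(-1).float().sum().item() / batch_size
            for k in topk}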
Example 8
def main():
    args, cfg = parse_config_args('nni.cream.supernet')

    # resolve logging
    output_dir = os.path.join(
        cfg.SAVE_PATH, "{}-{}".format(datetime.date.today().strftime('%m%d'),
                                      cfg.MODEL))
    os.makedirs(output_dir, exist_ok=True)

    if args.local_rank == 0:
        logger = get_logger(os.path.join(output_dir, "train.log"))
    else:
        logger = None

    # initialize distributed parameters
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    if args.local_rank == 0:
        logger.info('Training on Process %d with %d GPUs.', args.local_rank,
                    cfg.NUM_GPU)

    # fix random seeds
    torch.manual_seed(cfg.SEED)
    torch.cuda.manual_seed_all(cfg.SEED)
    np.random.seed(cfg.SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # generate supernet
    model, sta_num, resolution = gen_supernet(
        flops_minimum=cfg.SUPERNET.FLOPS_MINIMUM,
        flops_maximum=cfg.SUPERNET.FLOPS_MAXIMUM,
        num_classes=cfg.DATASET.NUM_CLASSES,
        drop_rate=cfg.NET.DROPOUT_RATE,
        global_pool=cfg.NET.GP,
        resunit=cfg.SUPERNET.RESUNIT,
        dil_conv=cfg.SUPERNET.DIL_CONV,
        slice=cfg.SUPERNET.SLICE,
        verbose=cfg.VERBOSE,
        logger=logger)

    # number of choice blocks in supernet
    choice_num = len(model.blocks[7])
    if args.local_rank == 0:
        logger.info('Supernet created, param count: %d',
                    sum(p.numel() for p in model.parameters()))
        logger.info('resolution: %d', resolution)
        logger.info('choice number: %d', choice_num)

    # initialize flops look-up table
    model_est = FlopsEst(model)
    flops_dict, flops_fixed = model_est.flops_dict, model_est.flops_fixed

    # optionally resume from a checkpoint
    optimizer_state = None
    resume_epoch = None
    if cfg.AUTO_RESUME:
        optimizer_state, resume_epoch = resume_checkpoint(
            model, cfg.RESUME_PATH)

    # create optimizer and resume from checkpoint
    optimizer = create_optimizer_supernet(cfg, model, USE_APEX)
    if optimizer_state is not None:
        optimizer.load_state_dict(optimizer_state['optimizer'])
    model = model.cuda()

    # convert model to distributed mode
    if cfg.BATCHNORM.SYNC_BN:
        try:
            if USE_APEX:
                model = convert_syncbn_model(model)
            else:
                model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
            if args.local_rank == 0:
                logger.info('Converted model to use Synchronized BatchNorm.')
        except Exception as exception:
            logger.info(
                'Failed to enable Synchronized BatchNorm; '
                'install Apex or PyTorch >= 1.1. Exception: %s', exception)
    if USE_APEX:
        model = DDP(model, delay_allreduce=True)
    else:
        if args.local_rank == 0:
            logger.info(
                "Using torch DistributedDataParallel. Install NVIDIA Apex for Apex DDP."
            )
        # can use device str in Torch >= 1.1
        model = DDP(model, device_ids=[args.local_rank])

    # create learning rate scheduler
    lr_scheduler, num_epochs = create_supernet_scheduler(cfg, optimizer)

    start_epoch = resume_epoch if resume_epoch is not None else 0
    if start_epoch > 0:
        lr_scheduler.step(start_epoch)

    if args.local_rank == 0:
        logger.info('Scheduled epochs: %d', num_epochs)

    # imagenet train dataset
    train_dir = os.path.join(cfg.DATA_DIR, 'train')
    if not os.path.exists(train_dir):
        logger.info('Training folder does not exist at: %s', train_dir)
        sys.exit()

    dataset_train = Dataset(train_dir)
    loader_train = create_loader(dataset_train,
                                 input_size=(3, cfg.DATASET.IMAGE_SIZE,
                                             cfg.DATASET.IMAGE_SIZE),
                                 batch_size=cfg.DATASET.BATCH_SIZE,
                                 is_training=True,
                                 use_prefetcher=True,
                                 re_prob=cfg.AUGMENTATION.RE_PROB,
                                 re_mode=cfg.AUGMENTATION.RE_MODE,
                                 color_jitter=cfg.AUGMENTATION.COLOR_JITTER,
                                 interpolation='random',
                                 num_workers=cfg.WORKERS,
                                 distributed=True,
                                 collate_fn=None,
                                 crop_pct=DEFAULT_CROP_PCT,
                                 mean=IMAGENET_DEFAULT_MEAN,
                                 std=IMAGENET_DEFAULT_STD)

    # imagenet validation dataset
    eval_dir = os.path.join(cfg.DATA_DIR, 'val')
    if not os.path.isdir(eval_dir):
        logger.info('Validation folder does not exist at: %s', eval_dir)
        sys.exit()
    dataset_eval = Dataset(eval_dir)
    loader_eval = create_loader(dataset_eval,
                                input_size=(3, cfg.DATASET.IMAGE_SIZE,
                                            cfg.DATASET.IMAGE_SIZE),
                                batch_size=4 * cfg.DATASET.BATCH_SIZE,
                                is_training=False,
                                use_prefetcher=True,
                                num_workers=cfg.WORKERS,
                                distributed=True,
                                crop_pct=DEFAULT_CROP_PCT,
                                mean=IMAGENET_DEFAULT_MEAN,
                                std=IMAGENET_DEFAULT_STD,
                                interpolation=cfg.DATASET.INTERPOLATION)

    # whether to use label smoothing
    if cfg.AUGMENTATION.SMOOTHING > 0.:
        train_loss_fn = LabelSmoothingCrossEntropy(
            smoothing=cfg.AUGMENTATION.SMOOTHING).cuda()
        validate_loss_fn = nn.CrossEntropyLoss().cuda()
    else:
        train_loss_fn = nn.CrossEntropyLoss().cuda()
        validate_loss_fn = train_loss_fn

    mutator = RandomMutator(model)

    trainer = CreamSupernetTrainer(model,
                                   train_loss_fn,
                                   validate_loss_fn,
                                   optimizer,
                                   num_epochs,
                                   loader_train,
                                   loader_eval,
                                   mutator=mutator,
                                   batch_size=cfg.DATASET.BATCH_SIZE,
                                   log_frequency=cfg.LOG_INTERVAL,
                                   meta_sta_epoch=cfg.SUPERNET.META_STA_EPOCH,
                                   update_iter=cfg.SUPERNET.UPDATE_ITER,
                                   slices=cfg.SUPERNET.SLICE,
                                   pool_size=cfg.SUPERNET.POOL_SIZE,
                                   pick_method=cfg.SUPERNET.PICK_METHOD,
                                   choice_num=choice_num,
                                   sta_num=sta_num,
                                   acc_gap=cfg.ACC_GAP,
                                   flops_dict=flops_dict,
                                   flops_fixed=flops_fixed,
                                   local_rank=args.local_rank,
                                   callbacks=[
                                       LRSchedulerCallback(lr_scheduler),
                                       ModelCheckpoint(output_dir)
                                   ])

    trainer.train()
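Because the script calls torch.distributed.init_process_group with init_method='env://' and reads args.local_rank, it is intended to be launched with one process per GPU, e.g. via python -m torch.distributed.launch --nproc_per_node=<NUM_GPU>. The get_logger helper is also defined elsewhere; a minimal file-plus-console sketch (the body is an assumption):

import logging

def get_logger(log_path):
    # rank-0 logger writing to both the train.log file and the console
    logger = logging.getLogger("nni.cream.supernet")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    for handler in (logging.FileHandler(log_path), logging.StreamHandler()):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger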