Example No. 1
    def test_checkpoint_resume(self):
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            trainer = SimpleTrainer(model, dataloader, opt)
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

            trainer.register_hooks([
                hooks.LRScheduler(scheduler=scheduler),
                # checkpoint after the scheduler so the saved state reflects the scheduler step for the current iteration
                hooks.PeriodicCheckpointer(checkpointer, 10),
            ])

            trainer.train(0, 12)
            self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
            self.assertEqual(scheduler.last_epoch, 12)
            del trainer

            opt = torch.optim.SGD(model.parameters(), 999)  # lr will be loaded
            trainer = SimpleTrainer(model, dataloader, opt)
            scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
            trainer.register_hooks([
                hooks.LRScheduler(scheduler=scheduler),
            ])
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            checkpointer.resume_or_load("non_exist.pth")
            self.assertEqual(
                trainer.iter,
                11)  # last finished iter number (0-based in Trainer)
            # number of times `scheduler.step()` was called (1-based)
            self.assertEqual(scheduler.last_epoch, 12)
            self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
Example No. 2
    def test_checkpoint_resume(self):
        model = _SimpleModel()
        dataloader = self._data_loader("cpu")
        opt = torch.optim.SGD(model.parameters(), 0.1)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

        with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
            trainer = SimpleTrainer(model, dataloader, opt)
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

            trainer.register_hooks(
                [
                    hooks.PeriodicCheckpointer(checkpointer, 10),
                    hooks.LRScheduler(scheduler=scheduler),
                ]
            )

            trainer.train(0, 12)
            del trainer

            trainer = SimpleTrainer(model, dataloader, opt)
            scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
            trainer.register_hooks(
                [
                    hooks.LRScheduler(scheduler=scheduler),
                ]
            )
            checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
            checkpointer.resume_or_load("non_exist.pth")
            self.assertEqual(trainer.iter, 11)  # last finished iter
            # the final checkpoint (written at the end of training, before the
            # scheduler hook stepped for iteration 11) is what gets resumed, so
            # the restored scheduler is one step behind the variant in Example No. 1
            self.assertEqual(scheduler.last_epoch, 11)
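
Both tests above pass a SimpleTrainer instance to the Checkpointer as an extra keyword argument. What makes that work, assuming fvcore.common.checkpoint.Checkpointer (the class these detectron2 tests exercise), is that any keyword argument exposing state_dict()/load_state_dict() is treated as a checkpointable and is saved and restored together with the model. A minimal, self-contained sketch of that mechanism; the IterationCounter class and the temporary directory are illustrative only:

import tempfile

import torch
from fvcore.common.checkpoint import Checkpointer


class IterationCounter:
    """Toy checkpointable: any object with state_dict/load_state_dict works."""

    def __init__(self):
        self.iter = 0

    def state_dict(self):
        return {"iter": self.iter}

    def load_state_dict(self, state):
        self.iter = state["iter"]


model = torch.nn.Linear(4, 2)
opt = torch.optim.SGD(model.parameters(), lr=0.1)
counter = IterationCounter()

with tempfile.TemporaryDirectory() as d:
    checkpointer = Checkpointer(model, d, opt=opt, counter=counter)
    counter.iter = 9
    checkpointer.save("model_0000009")            # stores model, opt and counter state
    counter.iter = 0
    checkpointer.resume_or_load("", resume=True)  # loads the file named in d/last_checkpoint
    assert counter.iter == 9                      # restored together with the model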
Example No. 3
}  # end of the (truncated) supported_optimizers dict defined above this snippet

# search_space = SimpleCellSearchSpace()
search_space = NasBench201SeachSpace()
# search_space = HierarchicalSearchSpace()
# search_space = DartsSearchSpace()

assert search_space.QUERYABLE

optimizer = supported_optimizers[config.optimizer]

optimizer.adapt_search_space(search_space)

checkpoint_dir = '/home/moa/dev/python_projects/NASLib/naslib/benchmarks/nasbench201/run/cifar10/{}/4/search/'.format(
    config.optimizer)
checkpointables = optimizer.get_checkpointables()

checkpointer = Checkpointer(model=checkpointables.pop('model'),
                            save_dir="/tmp/",
                            **checkpointables)

for checkpoint_path in sorted(
        glob.glob(os.path.join(checkpoint_dir, 'model_0*.pth'))):

    # resume=False loads exactly this file; the returned dict carries the extra
    # values stored at save time (e.g. "iteration")
    checkpoint = checkpointer.resume_or_load(checkpoint_path, resume=False)
    epoch = checkpoint.get("iteration", -1)  # iteration recorded when the checkpoint was saved

    print(optimizer.test_statistics())

# `trainer` (a NASLib Trainer) is assumed to be constructed earlier in the full
# script; it evaluates using the last checkpoint loaded above.
trainer.evaluate(resume_from=checkpoint)
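
Inside the loop above, resume=False is what forces the checkpointer to load exactly the file it is given; with resume=True it would first consult the last_checkpoint marker in its own save_dir ("/tmp/" here) and prefer whatever file that names. A brief, hedged contrast of the two call forms, assuming the fvcore-style Checkpointer constructed above; ckpt_path is a placeholder for any saved checkpoint path:

# resume=False: always load the file at ckpt_path (recent fvcore/detectron2
# versions restore only the model weights in this mode, not the other
# registered checkpointables)
extras = checkpointer.resume_or_load(ckpt_path, resume=False)

# resume=True: if save_dir contains a last_checkpoint marker, load the file it
# names (ignoring ckpt_path) along with all registered checkpointables
extras = checkpointer.resume_or_load(ckpt_path, resume=True)

# either way, extra values passed to checkpointer.save(...) come back in the
# returned dict, e.g. extras.get("iteration", -1)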
Example No. 4
def main():
    global global_step

    config = load_config()

    set_seed(config)
    setup_cudnn(config)

    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config.scheduler.epochs)

    if config.train.distributed:
        dist.init_process_group(backend=config.train.dist.backend,
                                init_method=config.train.dist.init_method,
                                rank=config.train.dist.node_rank,
                                world_size=config.train.dist.world_size)
        torch.cuda.set_device(config.train.dist.local_rank)

    output_dir = pathlib.Path(config.train.output_dir)
    if get_rank() == 0:
        if not config.train.resume and output_dir.exists():
            raise RuntimeError(
                f'Output directory `{output_dir.as_posix()}` already exists')
        output_dir.mkdir(exist_ok=True, parents=True)
        if not config.train.resume:
            save_config(config, output_dir / 'config.yaml')
            save_config(get_env_info(config), output_dir / 'env.yaml')
            diff = find_config_diff(config)
            if diff is not None:
                save_config(diff, output_dir / 'config_min.yaml')

    logger = create_logger(name=__name__,
                           distributed_rank=get_rank(),
                           output_dir=output_dir,
                           filename='log.txt')
    logger.info(config)
    logger.info(get_env_info(config))

    train_loader, val_loader = create_dataloader(config, is_train=True)

    model = create_model(config)
    macs, n_params = count_op(config, model)
    logger.info(f'MACs   : {macs}')
    logger.info(f'#params: {n_params}')

    optimizer = create_optimizer(config, model)
    model, optimizer = apex.amp.initialize(model,
                                           optimizer,
                                           opt_level=config.train.precision)
    model = apply_data_parallel_wrapper(config, model)

    scheduler = create_scheduler(config,
                                 optimizer,
                                 steps_per_epoch=len(train_loader))
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                save_to_disk=get_rank() == 0)

    start_epoch = config.train.start_epoch
    scheduler.last_epoch = start_epoch
    if config.train.resume:
        checkpoint_config = checkpointer.resume_or_load('', resume=True)
        global_step = checkpoint_config['global_step']
        start_epoch = checkpoint_config['epoch']
        config.defrost()
        config.merge_from_other_cfg(ConfigNode(checkpoint_config['config']))
        config.freeze()
    elif config.train.checkpoint != '':
        checkpoint = torch.load(config.train.checkpoint, map_location='cpu')
        if isinstance(model,
                      (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])

    if get_rank() == 0 and config.train.use_tensorboard:
        tensorboard_writer = create_tensorboard_writer(
            config, output_dir, purge_step=config.train.start_epoch + 1)
        tensorboard_writer2 = create_tensorboard_writer(
            config, output_dir / 'running', purge_step=global_step + 1)
    else:
        tensorboard_writer = DummyWriter()
        tensorboard_writer2 = DummyWriter()

    train_loss, val_loss = create_loss(config)

    if (config.train.val_period > 0 and start_epoch == 0
            and config.train.val_first):
        validate(0, config, model, val_loss, val_loader, logger,
                 tensorboard_writer)

    for epoch, seed in enumerate(epoch_seeds[start_epoch:], start_epoch):
        epoch += 1

        np.random.seed(seed)
        train(epoch, config, model, optimizer, scheduler, train_loss,
              train_loader, logger, tensorboard_writer, tensorboard_writer2)

        if config.train.val_period > 0 and (epoch %
                                            config.train.val_period == 0):
            validate(epoch, config, model, val_loss, val_loader, logger,
                     tensorboard_writer)

        tensorboard_writer.flush()
        tensorboard_writer2.flush()

        if (epoch % config.train.checkpoint_period == 0) or (
                epoch == config.scheduler.epochs):
            checkpoint_config = {
                'epoch': epoch,
                'global_step': global_step,
                'config': config.as_dict(),
            }
            checkpointer.save(f'checkpoint_{epoch:05d}', **checkpoint_config)

    tensorboard_writer.close()
    tensorboard_writer2.close()
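
The resume path of this script relies on a simple contract: keyword arguments passed to Checkpointer.save() that are not registered checkpointables (here epoch, global_step and the config dict) are stored verbatim in the checkpoint and handed back as a plain dict by resume_or_load(). A minimal sketch of that round trip, assuming the fvcore-style Checkpointer created above; the names and values are illustrative:

checkpointer.save('checkpoint_00010', epoch=10, global_step=1234)
# writes save_dir/checkpoint_00010.pth and updates save_dir/last_checkpoint

extras = checkpointer.resume_or_load('', resume=True)
# model/optimizer/scheduler state are restored in place; the leftover keys
# come back in the returned dict
start_epoch = extras.get('epoch', 0)
global_step = extras.get('global_step', 0)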
Example No. 5
def main(cfg: DictConfig) -> None:

    if "experiments" in cfg.keys():
        cfg = OmegaConf.merge(cfg, cfg.experiments)

    if "debug" in cfg.keys():
        logger.info(f"Run script in debug")
        cfg = OmegaConf.merge(cfg, cfg.debug)

    # A logger for this file
    logger = logging.getLogger(__name__)

    # NOTE: hydra causes the python file to run in hydra.run.dir by default
    logger.info(f"Run script in {HydraConfig.get().run.dir}")

    writer = SummaryWriter(log_dir=cfg.train.tensorboard_dir)

    checkpoints_dir = Path(cfg.train.checkpoints_dir)
    if not checkpoints_dir.exists():
        checkpoints_dir.mkdir(parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    image_shape = (cfg.train.channels, cfg.train.image_height,
                   cfg.train.image_width)

    # NOTE: With hydra, the python file runs in hydra.run.dir by default, so set the dataset path to a full path or an appropriate relative path
    dataset_path = Path(cfg.dataset.root) / cfg.dataset.frames
    split_path = Path(cfg.dataset.root) / cfg.dataset.split_file
    assert dataset_path.exists(), "Video image folder not found"
    assert split_path.exists(), "Train/test split file not found"

    # Define training set
    train_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=True,
    )

    # Define train dataloader
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=True,
        num_workers=cfg.train.num_workers,
    )

    # Define test set
    test_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=False,
    )

    # Define test dataloader
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=False,
        num_workers=cfg.train.num_workers,
    )

    # Classification criterion
    criterion = nn.CrossEntropyLoss().to(device)

    # Define network
    model = CNNLSTM(
        num_classes=train_dataset.num_classes,
        latent_dim=cfg.train.latent_dim,
        lstm_layers=cfg.train.lstm_layers,
        hidden_dim=cfg.train.hidden_dim,
        bidirectional=cfg.train.bidirectional,
        attention=cfg.train.attention,
    )
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    checkpointer = Checkpointer(
        model,
        optimizer=optimizer,
        # scheduler=scheduler,
        save_dir=cfg.train.checkpoints_dir,
        save_to_disk=True,
    )

    if cfg.train.resume:
        if not checkpointer.has_checkpoint():
            start_epoch = 0
        else:
            ckpt = checkpointer.resume_or_load("", resume=True)
            start_epoch = ckpt["epoch"]
            model.to(device)
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)
    elif cfg.train.checkpoint_model != "":
        ckpt = torch.load(cfg.train.checkpoint_model, map_location="cpu")
        model.load_state_dict(ckpt["model"])
        model.to(device)
        start_epoch = 0
    else:
        start_epoch = 0

    for epoch in range(start_epoch, cfg.train.num_epochs):
        epoch += 1
        epoch_metrics = {"loss": [], "acc": []}
        timer = Timer()
        for batch_i, (X, y) in enumerate(train_dataloader):
            batch_i += 1
            if X.size(0) == 1:
                continue

            # torch.autograd.Variable is deprecated; modern PyTorch tensors can
            # be used directly here
            image_sequences = Variable(X.to(device), requires_grad=True)
            labels = Variable(y.to(device), requires_grad=False)

            optimizer.zero_grad()

            # Reset LSTM hidden state
            model.lstm.reset_hidden_state()

            # Get sequence predictions
            predictions = model(image_sequences)

            # Compute metrics
            loss = criterion(predictions, labels)
            acc = (predictions.detach().argmax(1) == labels).cpu().numpy().mean()

            loss.backward()
            optimizer.step()

            # Keep track of epoch metrics
            epoch_metrics["loss"].append(loss.item())
            epoch_metrics["acc"].append(acc)

            # Determine approximate time left
            batches_done = (epoch - 1) * len(train_dataloader) + (batch_i - 1)
            batches_left = cfg.train.num_epochs * len(
                train_dataloader) - batches_done
            time_left = datetime.timedelta(seconds=batches_left *
                                           timer.seconds())
            time_iter = round(timer.seconds(), 3)
            timer.reset()

            logger.info(
                f'Training - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(train_dataloader)}] [Loss: {np.mean(epoch_metrics["loss"]):.3f}] [Acc: {np.mean(epoch_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
            )

            # Empty cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        writer.add_scalar("train/loss", np.mean(epoch_metrics["loss"]), epoch)
        writer.add_scalar("train/acc", np.mean(epoch_metrics["acc"]), epoch)

        def test_model(epoch):
            """ Evaluate the model on the test set """
            model.eval()
            test_metrics = {"loss": [], "acc": []}
            timer = Timer()
            for batch_i, (X, y) in enumerate(test_dataloader):
                batch_i += 1
                image_sequences = Variable(X.to(device), requires_grad=False)
                labels = Variable(y, requires_grad=False).to(device)

                with torch.no_grad():
                    # Reset LSTM hidden state
                    model.lstm.reset_hidden_state()
                    # Get sequence predictions
                    predictions = model(image_sequences)

                # Compute metrics
                loss = criterion(predictions, labels)
                acc = (predictions.detach().argmax(1) == labels).cpu().numpy().mean()

                # Keep track of loss and accuracy
                test_metrics["loss"].append(loss.item())
                test_metrics["acc"].append(acc)

                # Determine approximate time left
                batches_done = batch_i - 1
                batches_left = len(test_dataloader) - batches_done
                time_left = datetime.timedelta(seconds=batches_left *
                                               timer.seconds())
                time_iter = round(timer.seconds(), 3)
                timer.reset()

                # Log test performance
                logger.info(
                    f'Testing - [Epoch: {epoch}/{cfg.train.num_epochs}] [Batch: {batch_i}/{len(test_dataloader)}] [Loss: {np.mean(test_metrics["loss"]):.3f}] [Acc: {np.mean(test_metrics["acc"]):.3f}] [ETA: {time_left}] [Iter time: {time_iter}s/it]'
                )

            writer.add_scalar("test/loss", np.mean(test_metrics["loss"]),
                              epoch)
            writer.add_scalar("test/acc", np.mean(test_metrics["acc"]), epoch)

            model.train()

        # Evaluate the model on the test set
        test_model(epoch)

        # Save model checkpoint
        if epoch % cfg.train.checkpoint_interval == 0:
            checkpointer.save(f"checkpoint_{epoch:04}", epoch=epoch)

    writer.close()
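
One detail worth isolating from the resume branch above: the Checkpointer loads checkpoint files onto the CPU (fvcore's implementation calls torch.load with map_location='cpu'), so the optimizer state has to be moved to the training device by hand, which is exactly what the nested loop over optimizer.state does. A small standalone helper capturing that pattern; the function name is ours, not part of any library:

import torch


def move_optimizer_state(optimizer: torch.optim.Optimizer, device: torch.device) -> None:
    """Move every tensor held in the optimizer state (e.g. Adam moments) to `device`."""
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)


# usage after checkpointer.resume_or_load(...):
#     move_optimizer_state(optimizer, torch.device("cuda"))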
Example No. 6
def main():
    global global_step

    config = load_config()

    set_seed(config)
    setup_cudnn(config)

    # np.iinfo(np.int32).max is the largest value representable by that dtype;
    # draw one dedicated RNG seed per epoch
    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config.scheduler.epochs)

    if config.train.distributed:
        dist.init_process_group(backend=config.train.dist.backend,
                                init_method=config.train.dist.init_method,
                                rank=config.train.dist.node_rank,
                                world_size=config.train.dist.world_size)
        torch.cuda.set_device(config.train.dist.local_rank)

    output_dir = pathlib.Path(config.train.output_dir)
    if get_rank() == 0:
        if not config.train.resume and output_dir.exists():
            raise RuntimeError(
                f'Output directory `{output_dir.as_posix()}` already exists')
        output_dir.mkdir(exist_ok=True, parents=True)
        if not config.train.resume:
            # for a fresh (non-resumed) run, save the current config, the
            # environment info, and the difference between the current and
            # default config
            save_config(config, output_dir / 'config.yaml')
            save_config(get_env_info(config), output_dir / 'env.yaml')
            diff = find_config_diff(config)
            if diff is not None:
                save_config(diff, output_dir / 'config_min.yaml')

    logger = create_logger(name=__name__,
                           distributed_rank=get_rank(),
                           output_dir=output_dir,
                           filename='log.txt')
    logger.info(config)
    logger.info(get_env_info(config))

    train_loader, val_loader = create_dataloader(config, is_train=True)

    model = create_model(config)
    # MACs: number of multiply-accumulate operations
    macs, n_params = count_op(config, model)
    logger.info(f'MACs   : {macs}')
    logger.info(f'#params: {n_params}')
    # create the optimizer: SGD with Nesterov momentum, Adam, AMSGrad, AdaBound, AdaBoundW or LARS
    optimizer = create_optimizer(config, model)
    # automatic mixed precision (AMP) setup via apex; skipped on CPU
    if config.device != 'cpu':
        model, optimizer = apex.amp.initialize(
            model, optimizer, opt_level=config.train.precision)
    # wrap the model for DataParallel or DistributedDataParallel training
    model = apply_data_parallel_wrapper(config, model)

    # set up the LR scheduler (with warmup); steps_per_epoch is the number of
    # batches in one epoch
    scheduler = create_scheduler(config,
                                 optimizer,
                                 steps_per_epoch=len(train_loader))
    # create the checkpointer; unlike a plain torch.save call it bundles model,
    # optimizer and scheduler state and tracks the latest checkpoint file
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                save_to_disk=get_rank() == 0)

    start_epoch = config.train.start_epoch
    # last_epoch is used when resuming; normally training starts from config.train.start_epoch
    scheduler.last_epoch = start_epoch
    # Resuming supports two modes:
    # 1. config.train.resume: load the latest checkpoint from output_dir and
    #    restore global_step, start_epoch and the saved config
    # 2. config.train.checkpoint: load only the model weights from the given
    #    file (mapped to CPU first)
    if config.train.resume:
        checkpoint_config = checkpointer.resume_or_load('', resume=True)
        global_step = checkpoint_config['global_step']
        start_epoch = checkpoint_config['epoch']
        config.defrost()
        config.merge_from_other_cfg(ConfigNode(checkpoint_config['config']))
        config.freeze()
    elif config.train.checkpoint != '':
        checkpoint = torch.load(config.train.checkpoint, map_location='cpu')
        if isinstance(model,
                      (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])
    # Two TensorBoard writers:
    # the first logs per-epoch metrics for this run (which may itself be a
    # resumed run), the second follows global_step and records the run end to end
    if get_rank() == 0 and config.train.use_tensorboard:
        tensorboard_writer = create_tensorboard_writer(
            config, output_dir, purge_step=config.train.start_epoch + 1)
        tensorboard_writer2 = create_tensorboard_writer(
            config, output_dir / 'running', purge_step=global_step + 1)
    else:
        tensorboard_writer = DummyWriter()
        tensorboard_writer2 = DummyWriter()

    train_loss, val_loss = create_loss(config)

    if (config.train.val_period > 0 and start_epoch == 0
            and config.train.val_first):
        # validate the model from epoch 0
        validate(0, config, model, val_loss, val_loader, logger,
                 tensorboard_writer)

    for epoch, seed in enumerate(epoch_seeds[start_epoch:], start_epoch):
        epoch += 1

        np.random.seed(seed)
        train(epoch, config, model, optimizer, scheduler, train_loss,
              train_loader, logger, tensorboard_writer, tensorboard_writer2)

        if config.train.val_period > 0 and (epoch % config.train.val_period
                                            == 0):
            validate(epoch, config, model, val_loss, val_loader, logger,
                     tensorboard_writer)

        tensorboard_writer.flush()
        tensorboard_writer2.flush()

        if (epoch % config.train.checkpoint_period
                == 0) or (epoch == config.scheduler.epochs):
            checkpoint_config = {
                'epoch': epoch,
                'global_step': global_step,
                'config': config.as_dict(),
            }
            checkpointer.save(f'checkpoint_{epoch:05d}', **checkpoint_config)

    tensorboard_writer.close()
    tensorboard_writer2.close()
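
Both variants of this script create their TensorBoard writers with purge_step so that resuming into an existing log directory does not leave overlapping curves from the interrupted run. A short sketch of what that argument does with the plain torch.utils.tensorboard API; create_tensorboard_writer is assumed to wrap SummaryWriter, and the paths and step values are illustrative:

from torch.utils.tensorboard import SummaryWriter

global_step = 1234  # e.g. the value restored from the checkpoint

# When reusing an existing log_dir, events whose step is >= purge_step are
# dropped, so data written after the last checkpoint of the previous run does
# not show up twice once training resumes from that checkpoint.
writer = SummaryWriter(log_dir='output/running', purge_step=global_step + 1)
writer.add_scalar('train/loss', 0.5, global_step + 1)
writer.close()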