def test_checkpoint_resume(self):
    model = _SimpleModel()
    dataloader = self._data_loader("cpu")
    opt = torch.optim.SGD(model.parameters(), 0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

    with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
        trainer = SimpleTrainer(model, dataloader, opt)
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

        trainer.register_hooks(
            [
                hooks.LRScheduler(scheduler=scheduler),
                # checkpoint after scheduler to properly save the state of scheduler
                hooks.PeriodicCheckpointer(checkpointer, 10),
            ]
        )
        trainer.train(0, 12)
        self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
        self.assertEqual(scheduler.last_epoch, 12)
        del trainer

        opt = torch.optim.SGD(model.parameters(), 999)  # lr will be loaded
        trainer = SimpleTrainer(model, dataloader, opt)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
        trainer.register_hooks(
            [
                hooks.LRScheduler(scheduler=scheduler),
            ]
        )
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
        checkpointer.resume_or_load("non_exist.pth")
        self.assertEqual(trainer.iter, 11)  # last finished iter number (0-based in Trainer)
        # number of times `scheduler.step()` was called (1-based)
        self.assertEqual(scheduler.last_epoch, 12)
        self.assertAlmostEqual(opt.param_groups[0]["lr"], 1e-5)
def test_checkpoint_resume(self):
    model = _SimpleModel()
    dataloader = self._data_loader("cpu")
    opt = torch.optim.SGD(model.parameters(), 0.1)
    scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)

    with tempfile.TemporaryDirectory(prefix="detectron2_test") as d:
        trainer = SimpleTrainer(model, dataloader, opt)
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)

        trainer.register_hooks(
            [
                hooks.PeriodicCheckpointer(checkpointer, 10),
                hooks.LRScheduler(scheduler=scheduler),
            ]
        )
        trainer.train(0, 12)
        del trainer

        trainer = SimpleTrainer(model, dataloader, opt)
        scheduler = torch.optim.lr_scheduler.StepLR(opt, 3)
        trainer.register_hooks(
            [
                hooks.LRScheduler(scheduler=scheduler),
            ]
        )
        checkpointer = Checkpointer(model, d, opt=opt, trainer=trainer)
        checkpointer.resume_or_load("non_exist.pth")
        self.assertEqual(trainer.iter, 11)  # last finished iter
        self.assertEqual(scheduler.last_epoch, 11)
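# The two tests above hinge on Checkpointer treating every extra keyword
# argument (opt=..., trainer=...) as a "checkpointable": any object exposing
# state_dict()/load_state_dict() is saved and restored alongside the model,
# and with resume=True (the default) resume_or_load() prefers the checkpoint
# recorded in save_dir/last_checkpoint over the path argument. A minimal
# round-trip sketch, assuming fvcore's Checkpointer (the class detectron2
# builds on); _TinyModel is a hypothetical stand-in for _SimpleModel:
import tempfile

import torch
from fvcore.common.checkpoint import Checkpointer


class _TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(2, 2)


with tempfile.TemporaryDirectory() as d:
    model = _TinyModel()
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    # `opt` is stored under the key "opt" inside step_10.pth, and the file
    # name is recorded in d/last_checkpoint.
    Checkpointer(model, d, opt=opt).save("step_10")

    # A fresh optimizer with a bogus lr; loading overwrites its param_groups.
    model2 = _TinyModel()
    opt2 = torch.optim.SGD(model2.parameters(), lr=999.0)
    # last_checkpoint exists, so the "non_exist.pth" argument is ignored
    # and step_10.pth is loaded instead.
    Checkpointer(model2, d, opt=opt2).resume_or_load("non_exist.pth")
    assert opt2.param_groups[0]["lr"] == 0.1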
}  # end of `supported_optimizers`; the head of this dict is truncated in this excerpt

# search_space = SimpleCellSearchSpace()
search_space = NasBench201SeachSpace()
# search_space = HierarchicalSearchSpace()
# search_space = DartsSearchSpace()
assert search_space.QUERYABLE

optimizer = supported_optimizers[config.optimizer]
optimizer.adapt_search_space(search_space)

checkpoint_dir = '/home/moa/dev/python_projects/NASLib/naslib/benchmarks/nasbench201/run/cifar10/{}/4/search/'.format(
    config.optimizer)

checkpointables = optimizer.get_checkpointables()
checkpointer = Checkpointer(model=checkpointables.pop('model'),
                            save_dir="/tmp/",
                            **checkpointables)

for checkpoint in sorted(
        glob.glob(os.path.join(checkpoint_dir, 'model_0*.pth'))):
    checkpoint = checkpointer.resume_or_load(checkpoint, resume=False)
    epoch = checkpoint.get("iteration", -1)
    print(optimizer.test_statistics())
    trainer.evaluate(resume_from=checkpoint)
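# The loop above works because save(name, **kwargs) stores extra keyword
# arguments verbatim in the .pth file, and load()/resume_or_load() returns
# whatever entries it did not consume, which is how checkpoint.get("iteration", -1)
# recovers the training progress. With resume=False the given path is loaded
# unconditionally, never the last_checkpoint file. A small sketch of that
# contract (iter_checkpoints is a hypothetical helper; it assumes each file
# was saved with an "iteration" entry, as a PeriodicCheckpointer would do):
import glob
import os

from fvcore.common.checkpoint import Checkpointer


def iter_checkpoints(checkpointer: Checkpointer, checkpoint_dir: str):
    """Yield (iteration, extra_data) for every saved checkpoint, oldest first."""
    for path in sorted(glob.glob(os.path.join(checkpoint_dir, "model_0*.pth"))):
        # resume=False: load exactly this file, restoring state in place and
        # returning the leftover entries as a dict.
        extra = checkpointer.resume_or_load(path, resume=False)
        yield extra.get("iteration", -1), extra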
def main():
    global global_step

    config = load_config()

    set_seed(config)
    setup_cudnn(config)

    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config.scheduler.epochs)

    if config.train.distributed:
        dist.init_process_group(backend=config.train.dist.backend,
                                init_method=config.train.dist.init_method,
                                rank=config.train.dist.node_rank,
                                world_size=config.train.dist.world_size)
        torch.cuda.set_device(config.train.dist.local_rank)

    output_dir = pathlib.Path(config.train.output_dir)
    if get_rank() == 0:
        if not config.train.resume and output_dir.exists():
            raise RuntimeError(
                f'Output directory `{output_dir.as_posix()}` already exists')
        output_dir.mkdir(exist_ok=True, parents=True)
        if not config.train.resume:
            save_config(config, output_dir / 'config.yaml')
            save_config(get_env_info(config), output_dir / 'env.yaml')
            diff = find_config_diff(config)
            if diff is not None:
                save_config(diff, output_dir / 'config_min.yaml')

    logger = create_logger(name=__name__,
                           distributed_rank=get_rank(),
                           output_dir=output_dir,
                           filename='log.txt')
    logger.info(config)
    logger.info(get_env_info(config))

    train_loader, val_loader = create_dataloader(config, is_train=True)

    model = create_model(config)
    macs, n_params = count_op(config, model)
    logger.info(f'MACs   : {macs}')
    logger.info(f'#params: {n_params}')

    optimizer = create_optimizer(config, model)
    model, optimizer = apex.amp.initialize(model,
                                           optimizer,
                                           opt_level=config.train.precision)
    model = apply_data_parallel_wrapper(config, model)

    scheduler = create_scheduler(config,
                                 optimizer,
                                 steps_per_epoch=len(train_loader))
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                save_to_disk=get_rank() == 0)

    start_epoch = config.train.start_epoch
    scheduler.last_epoch = start_epoch
    if config.train.resume:
        checkpoint_config = checkpointer.resume_or_load('', resume=True)
        global_step = checkpoint_config['global_step']
        start_epoch = checkpoint_config['epoch']
        config.defrost()
        config.merge_from_other_cfg(ConfigNode(checkpoint_config['config']))
        config.freeze()
    elif config.train.checkpoint != '':
        checkpoint = torch.load(config.train.checkpoint, map_location='cpu')
        if isinstance(model,
                      (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])

    if get_rank() == 0 and config.train.use_tensorboard:
        tensorboard_writer = create_tensorboard_writer(
            config, output_dir, purge_step=config.train.start_epoch + 1)
        tensorboard_writer2 = create_tensorboard_writer(
            config, output_dir / 'running', purge_step=global_step + 1)
    else:
        tensorboard_writer = DummyWriter()
        tensorboard_writer2 = DummyWriter()

    train_loss, val_loss = create_loss(config)

    if (config.train.val_period > 0 and start_epoch == 0
            and config.train.val_first):
        validate(0, config, model, val_loss, val_loader, logger,
                 tensorboard_writer)

    for epoch, seed in enumerate(epoch_seeds[start_epoch:], start_epoch):
        epoch += 1

        np.random.seed(seed)
        train(epoch, config, model, optimizer, scheduler, train_loss,
              train_loader, logger, tensorboard_writer, tensorboard_writer2)

        if config.train.val_period > 0 and (epoch %
                                            config.train.val_period == 0):
            validate(epoch, config, model, val_loss, val_loader, logger,
                     tensorboard_writer)

        tensorboard_writer.flush()
        tensorboard_writer2.flush()

        if (epoch % config.train.checkpoint_period == 0) or (
                epoch == config.scheduler.epochs):
            checkpoint_config = {
                'epoch': epoch,
                'global_step': global_step,
                'config': config.as_dict(),
            }
            checkpointer.save(f'checkpoint_{epoch:05d}', **checkpoint_config)

    tensorboard_writer.close()
    tensorboard_writer2.close()
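# The resume branch above relies on the same Checkpointer contract: save() is
# given epoch/global_step/config as plain keyword arguments, and
# resume_or_load('', resume=True) hands them back, so training can restart
# from the recorded epoch. A condensed sketch of that pattern (names such as
# my_model, my_opt, and my_sched are hypothetical):
import os

import torch
from fvcore.common.checkpoint import Checkpointer

my_model = torch.nn.Linear(4, 4)
my_opt = torch.optim.SGD(my_model.parameters(), lr=0.01)
my_sched = torch.optim.lr_scheduler.StepLR(my_opt, step_size=10)

os.makedirs("/tmp/run", exist_ok=True)
ckpt = Checkpointer(my_model, "/tmp/run", optimizer=my_opt, scheduler=my_sched)
# Arbitrary metadata rides along with the model/optimizer/scheduler state.
ckpt.save("checkpoint_00005", epoch=5, global_step=1234)

# On restart: resume=True loads the file named in /tmp/run/last_checkpoint;
# model, optimizer, and scheduler are restored in place, the rest is returned.
extra = ckpt.resume_or_load("", resume=True)
start_epoch = extra["epoch"]        # -> 5
global_step = extra["global_step"]  # -> 1234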
def main(cfg: DictConfig) -> None:
    # A logger for this file
    logger = logging.getLogger(__name__)

    if "experiments" in cfg.keys():
        cfg = OmegaConf.merge(cfg, cfg.experiments)

    if "debug" in cfg.keys():
        logger.info("Run script in debug mode")
        cfg = OmegaConf.merge(cfg, cfg.debug)

    # NOTE: hydra causes the python file to run in hydra.run.dir by default
    logger.info(f"Run script in {HydraConfig.get().run.dir}")

    writer = SummaryWriter(log_dir=cfg.train.tensorboard_dir)

    checkpoints_dir = Path(cfg.train.checkpoints_dir)
    if not checkpoints_dir.exists():
        checkpoints_dir.mkdir(parents=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    image_shape = (cfg.train.channels, cfg.train.image_height,
                   cfg.train.image_width)

    # NOTE: With hydra, the python file runs in hydra.run.dir by default, so
    # set the dataset path to a full path or an appropriate relative path.
    dataset_path = Path(cfg.dataset.root) / cfg.dataset.frames
    split_path = Path(cfg.dataset.root) / cfg.dataset.split_file
    assert dataset_path.exists(), "Video image folder not found"
    assert split_path.exists(), \
        "The file that describes the train/test split was not found."

    # Define training set
    train_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=True,
    )

    # Define train dataloader
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=True,
        num_workers=cfg.train.num_workers,
    )

    # Define test set
    test_dataset = Dataset(
        dataset_path=dataset_path,
        split_path=split_path,
        split_number=cfg.dataset.split_number,
        input_shape=image_shape,
        sequence_length=cfg.train.sequence_length,
        training=False,
    )

    # Define test dataloader
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.train.batch_size,
        shuffle=False,
        num_workers=cfg.train.num_workers,
    )

    # Classification criterion
    criterion = nn.CrossEntropyLoss().to(device)

    # Define network
    model = CNNLSTM(
        num_classes=train_dataset.num_classes,
        latent_dim=cfg.train.latent_dim,
        lstm_layers=cfg.train.lstm_layers,
        hidden_dim=cfg.train.hidden_dim,
        bidirectional=cfg.train.bidirectional,
        attention=cfg.train.attention,
    )
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    checkpointer = Checkpointer(
        model,
        optimizer=optimizer,
        # scheduler=scheduler,
        save_dir=cfg.train.checkpoints_dir,
        save_to_disk=True,
    )

    if cfg.train.resume:
        if not checkpointer.has_checkpoint():
            start_epoch = 0
        else:
            ckpt = checkpointer.resume_or_load("", resume=True)
            start_epoch = ckpt["epoch"]
            model.to(device)
            # Checkpointer loads tensors to CPU, so move the optimizer state
            # back to the training device.
            for state in optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.to(device)
    elif cfg.train.checkpoint_model != "":
        ckpt = torch.load(cfg.train.checkpoint_model, map_location="cpu")
        model.load_state_dict(ckpt["model"])
        model.to(device)
        start_epoch = 0
    else:
        start_epoch = 0

    for epoch in range(start_epoch, cfg.train.num_epochs):
        epoch += 1
        epoch_metrics = {"loss": [], "acc": []}
        timer = Timer()
        for batch_i, (X, y) in enumerate(train_dataloader):
            batch_i += 1

            if X.size(0) == 1:
                continue

            image_sequences = Variable(X.to(device), requires_grad=True)
            labels = Variable(y.to(device), requires_grad=False)

            optimizer.zero_grad()

            # Reset LSTM hidden state
            model.lstm.reset_hidden_state()

            # Get sequence predictions
            predictions = model(image_sequences)

            # Compute metrics
            loss = criterion(predictions, labels)
            acc = (predictions.detach().argmax(1) == labels).cpu().numpy().mean()

            loss.backward()
            optimizer.step()

            # Keep track of epoch metrics
            epoch_metrics["loss"].append(loss.item())
            epoch_metrics["acc"].append(acc)

            # Determine approximate time left
            batches_done = (epoch - 1) * len(train_dataloader) + (batch_i - 1)
            batches_left = cfg.train.num_epochs * len(train_dataloader) - batches_done
            time_left = datetime.timedelta(seconds=batches_left * timer.seconds())
            time_iter = round(timer.seconds(), 3)
            timer.reset()

            logger.info(
                f'Training - [Epoch: {epoch}/{cfg.train.num_epochs}] '
                f'[Batch: {batch_i}/{len(train_dataloader)}] '
                f'[Loss: {np.mean(epoch_metrics["loss"]):.3f}] '
                f'[Acc: {np.mean(epoch_metrics["acc"]):.3f}] '
                f'[ETA: {time_left}] [Iter time: {time_iter}s/it]')

            # Empty cache
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        writer.add_scalar("train/loss", np.mean(epoch_metrics["loss"]), epoch)
        writer.add_scalar("train/acc", np.mean(epoch_metrics["acc"]), epoch)

        def test_model(epoch):
            """Evaluate the model on the test set."""
            model.eval()
            test_metrics = {"loss": [], "acc": []}
            timer = Timer()
            for batch_i, (X, y) in enumerate(test_dataloader):
                batch_i += 1
                image_sequences = Variable(X.to(device), requires_grad=False)
                labels = Variable(y, requires_grad=False).to(device)

                with torch.no_grad():
                    # Reset LSTM hidden state
                    model.lstm.reset_hidden_state()
                    # Get sequence predictions
                    predictions = model(image_sequences)

                # Compute metrics
                loss = criterion(predictions, labels)
                acc = (predictions.detach().argmax(1) == labels).cpu().numpy().mean()

                # Keep track of loss and accuracy
                test_metrics["loss"].append(loss.item())
                test_metrics["acc"].append(acc)

                # Determine approximate time left
                batches_done = batch_i - 1
                batches_left = len(test_dataloader) - batches_done
                time_left = datetime.timedelta(seconds=batches_left * timer.seconds())
                time_iter = round(timer.seconds(), 3)
                timer.reset()

                # Log test performance
                logger.info(
                    f'Testing - [Epoch: {epoch}/{cfg.train.num_epochs}] '
                    f'[Batch: {batch_i}/{len(test_dataloader)}] '
                    f'[Loss: {np.mean(test_metrics["loss"]):.3f}] '
                    f'[Acc: {np.mean(test_metrics["acc"]):.3f}] '
                    f'[ETA: {time_left}] [Iter time: {time_iter}s/it]')

            writer.add_scalar("test/loss", np.mean(test_metrics["loss"]), epoch)
            writer.add_scalar("test/acc", np.mean(test_metrics["acc"]), epoch)
            model.train()

        # Evaluate the model on the test set
        test_model(epoch)

        # Save model checkpoint
        if epoch % cfg.train.checkpoint_interval == 0:
            checkpointer.save(f"checkpoint_{epoch:04}", epoch=epoch)

    writer.close()
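# One detail worth noting in the resume branch above: Checkpointer loads
# tensors with map_location="cpu", so after resume_or_load() the optimizer
# state (e.g. Adam's exp_avg buffers) still lives on the CPU even once the
# model has been moved back to the GPU. The script does that fixup with an
# inline nested loop; a reusable sketch of the same idea (optimizer_to is a
# hypothetical helper name):
import torch


def optimizer_to(optimizer: torch.optim.Optimizer, device: torch.device) -> None:
    """Move every tensor in the optimizer state onto `device`, in place."""
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)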
def main():
    global global_step

    config = load_config()

    set_seed(config)
    setup_cudnn(config)

    # np.iinfo(np_type).max: machine limit (upper bound) of this type.
    # Every epoch gets its own pre-generated seed.
    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config.scheduler.epochs)

    if config.train.distributed:
        dist.init_process_group(backend=config.train.dist.backend,
                                init_method=config.train.dist.init_method,
                                rank=config.train.dist.node_rank,
                                world_size=config.train.dist.world_size)
        torch.cuda.set_device(config.train.dist.local_rank)

    output_dir = pathlib.Path(config.train.output_dir)
    if get_rank() == 0:
        if not config.train.resume and output_dir.exists():
            raise RuntimeError(
                f'Output directory `{output_dir.as_posix()}` already exists')
        output_dir.mkdir(exist_ok=True, parents=True)
        if not config.train.resume:
            # When starting a fresh run (not resuming), save the current
            # config, the environment info, and the difference between the
            # current and default configs.
            save_config(config, output_dir / 'config.yaml')
            save_config(get_env_info(config), output_dir / 'env.yaml')
            diff = find_config_diff(config)
            if diff is not None:
                save_config(diff, output_dir / 'config_min.yaml')

    logger = create_logger(name=__name__,
                           distributed_rank=get_rank(),
                           output_dir=output_dir,
                           filename='log.txt')
    logger.info(config)
    logger.info(get_env_info(config))

    train_loader, val_loader = create_dataloader(config, is_train=True)

    model = create_model(config)
    # Multiply-ACcumulate (MAC) operations
    macs, n_params = count_op(config, model)
    logger.info(f'MACs   : {macs}')
    logger.info(f'#params: {n_params}')

    # Create the optimizer: SGD with Nesterov momentum, Adam, AMSGrad,
    # AdaBound, AdaBoundW, or LARS.
    optimizer = create_optimizer(config, model)
    # Automatic Mixed Precision (AMP) setup via apex.
    if config.device != 'cpu':
        model, optimizer = apex.amp.initialize(
            model, optimizer, opt_level=config.train.precision)
    # Create a data-parallel or distributed-data-parallel model.
    model = apply_data_parallel_wrapper(config, model)

    # Set up the scheduler and warm-up scheduler.
    # steps_per_epoch: how many batches there are in an epoch.
    scheduler = create_scheduler(config,
                                 optimizer,
                                 steps_per_epoch=len(train_loader))
    # Create the checkpointer; torch's default checkpoint saving is not used
    # here because it does not track the scheduler state.
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                save_to_disk=get_rank() == 0)

    start_epoch = config.train.start_epoch
    # last_epoch is used to resume training; normally we start from
    # config.train.start_epoch.
    scheduler.last_epoch = start_epoch
    # Resuming supports two modes:
    # 1. resume = True: load the last training checkpoint and restore the
    #    global step, epoch, and config.
    # 2. resume = False with a checkpoint specified: load only the model
    #    weights from the given file (to CPU).
    if config.train.resume:
        checkpoint_config = checkpointer.resume_or_load('', resume=True)
        global_step = checkpoint_config['global_step']
        start_epoch = checkpoint_config['epoch']
        config.defrost()
        config.merge_from_other_cfg(ConfigNode(checkpoint_config['config']))
        config.freeze()
    elif config.train.checkpoint != '':
        checkpoint = torch.load(config.train.checkpoint, map_location='cpu')
        if isinstance(model,
                      (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])

    # Two TensorBoard writers: the first covers this run of training (which
    # may itself be a resumed run); the second follows global_step and
    # records the whole training across restarts.
    if get_rank() == 0 and config.train.use_tensorboard:
        tensorboard_writer = create_tensorboard_writer(
            config, output_dir, purge_step=config.train.start_epoch + 1)
        tensorboard_writer2 = create_tensorboard_writer(
            config, output_dir / 'running', purge_step=global_step + 1)
    else:
        tensorboard_writer = DummyWriter()
        tensorboard_writer2 = DummyWriter()

    train_loss, val_loss = create_loss(config)

    if (config.train.val_period > 0 and start_epoch == 0
            and config.train.val_first):
        # Validate the model at epoch 0, before any training.
        validate(0, config, model, val_loss, val_loader, logger,
                 tensorboard_writer)

    for epoch, seed in enumerate(epoch_seeds[start_epoch:], start_epoch):
        epoch += 1

        np.random.seed(seed)
        train(epoch, config, model, optimizer, scheduler, train_loss,
              train_loader, logger, tensorboard_writer, tensorboard_writer2)

        if config.train.val_period > 0 and (epoch %
                                            config.train.val_period == 0):
            validate(epoch, config, model, val_loss, val_loader, logger,
                     tensorboard_writer)

        tensorboard_writer.flush()
        tensorboard_writer2.flush()

        if (epoch % config.train.checkpoint_period == 0) or (
                epoch == config.scheduler.epochs):
            checkpoint_config = {
                'epoch': epoch,
                'global_step': global_step,
                'config': config.as_dict(),
            }
            checkpointer.save(f'checkpoint_{epoch:05d}', **checkpoint_config)

    tensorboard_writer.close()
    tensorboard_writer2.close()
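# Why epoch_seeds is drawn once up front rather than per epoch: slicing the
# same pre-generated array at resume time means epoch k runs under the same
# NumPy seed whether or not training was interrupted. A minimal sketch of
# that invariant (the concrete numbers are illustrative):
import numpy as np

np.random.seed(0)
epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2, size=10)

# Fresh run: epochs 1..10. Resumed run: start_epoch=4, i.e. epochs 5..10.
fresh = [(e + 1, s) for e, s in enumerate(epoch_seeds[0:], 0)]
resumed = [(e + 1, s) for e, s in enumerate(epoch_seeds[4:], 4)]
assert fresh[4:] == resumed  # epochs 5..10 see identical seeds either way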