def test_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
    task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

    task.set_use_gpu(torch.cuda.is_available())

    # only train 1 phase at a time
    trainer = LimitedPhaseTrainer(num_phases=1)

    while not task.done_training():
        # set task's state as task_2's checkpoint
        task_2._set_checkpoint_dict(get_checkpoint_dict(task, {}, deep_copy=True))

        # task_2 should have the same state before training
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

        # train for one phase
        trainer.train(task)
        trainer.train(task_2)

        # task_2 should have the same state after training
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())
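# `_compare_states` is a helper on the test class and is not shown here. A
# minimal sketch of such a helper, assuming classy state dicts are nested
# dicts/lists with tensor leaves (the real test helper may differ):
def _compare_states(self, state_1, state_2):
    # both states must have the same structure
    self.assertEqual(type(state_1), type(state_2))
    if isinstance(state_1, dict):
        self.assertEqual(state_1.keys(), state_2.keys())
        for key in state_1:
            self._compare_states(state_1[key], state_2[key])
    elif isinstance(state_1, (list, tuple)):
        self.assertEqual(len(state_1), len(state_2))
        for item_1, item_2 in zip(state_1, state_2):
            self._compare_states(item_1, item_2)
    elif isinstance(state_1, torch.Tensor):
        # tensor leaves (e.g. model weights) must match exactly
        self.assertTrue(torch.equal(state_1, state_2))
    else:
        self.assertEqual(state_1, state_2)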
def test_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
    task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

    task.set_use_gpu(torch.cuda.is_available())

    # prepare the tasks for the right device
    task.prepare()

    # test in both train and test mode
    for _ in range(2):
        task.advance_phase()

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))
        task_2.prepare()

        # task_2 should have the same state
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

        # this tests that both states' iterators return the same samples
        sample = next(task.get_data_iterator())
        sample_2 = next(task_2.get_data_iterator())
        self._compare_samples(sample, sample_2)

        # test that the train step runs the same way on both states
        # and the loss remains the same
        task.train_step()
        task_2.train_step()
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())
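# `_compare_samples` is likewise a test helper that is not shown. A sketch
# under the assumption that a sample is a dict of tensors (e.g. "input" and
# "target"), which is how classification batches are typically shaped:
def _compare_samples(self, sample_1, sample_2):
    self.assertEqual(sample_1.keys(), sample_2.keys())
    for key in sample_1:
        self.assertTrue(torch.equal(sample_1[key], sample_2[key]))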
def test_logging(self, mock_get_rank: mock.MagicMock) -> None:
    """
    Test that the logging happens as expected and the loss and lr values
    are correct.
    """
    rank = 5
    mock_get_rank.return_value = rank

    # set up the task and state
    config = get_test_task_config()
    config["dataset"]["train"]["batchsize_per_replica"] = 2
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.prepare()

    losses = [1.2, 2.3, 3.4, 4.5]

    local_variables = {}
    task.phase_idx = 0

    for log_freq in [5, None]:
        # create a loss lr meter hook
        loss_lr_meter_hook = LossLrMeterLoggingHook(log_freq=log_freq)

        # check that _log_loss_meters() and _log_lr() are called after
        # on_step() every log_freq batches and after on_phase_end()
        with mock.patch.object(loss_lr_meter_hook, "_log_loss_meters") as mock_fn:
            with mock.patch.object(loss_lr_meter_hook, "_log_lr") as mock_lr_fn:
                num_batches = 20

                for i in range(num_batches):
                    task.losses = list(range(i))
                    loss_lr_meter_hook.on_step(task, local_variables)
                    # the hook only logs on non-zero multiples of log_freq
                    if log_freq is not None and i and i % log_freq == 0:
                        mock_fn.assert_called_with(task, local_variables)
                        mock_fn.reset_mock()
                        mock_lr_fn.assert_called_with(task, local_variables)
                        mock_lr_fn.reset_mock()
                        continue
                    mock_fn.assert_not_called()
                    mock_lr_fn.assert_not_called()

                loss_lr_meter_hook.on_phase_end(task, local_variables)
                mock_fn.assert_called_with(task, local_variables)
                if task.train:
                    mock_lr_fn.assert_called_with(task, local_variables)

        # test _log_loss_meters() and _log_lr() directly
        task.losses = losses
        with self.assertLogs():
            loss_lr_meter_hook._log_loss_meters(task, local_variables)
            loss_lr_meter_hook._log_lr(task, local_variables)

        task.phase_idx += 1
def test_logged_lr(self):
    # mock an LR scheduler that returns `where` directly as the LR
    def scheduler_mock(where):
        return where

    mock_lr_scheduler = mock.Mock(side_effect=scheduler_mock)
    mock_lr_scheduler.update_interval = UpdateInterval.STEP

    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 5
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.optimizer.lr_scheduler = mock_lr_scheduler
    trainer = LocalTrainer()

    # 2 LR updates per epoch; at the end of each train epoch, the LR is
    # logged an additional time
    lr_order = [0.0, 1 / 6, 1 / 6, 2 / 6, 3 / 6, 3 / 6, 4 / 6, 5 / 6, 5 / 6]
    lr_list = []

    def mock_log_lr(task: ClassyTask, local_variables) -> None:
        lr_list.append(task.optimizer.lr)

    with mock.patch.object(
        LossLrMeterLoggingHook, "_log_lr", side_effect=mock_log_lr
    ):
        hook = LossLrMeterLoggingHook(1)
        task.set_hooks([hook])
        trainer.train(task)

    self.assertEqual(lr_list, lr_order)
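# Where the expected sequence comes from: per the comment above there are
# 2 train steps per epoch, so 3 epochs give 6 steps total. The mocked
# scheduler returns `where` = num_updates / total_updates, i.e. k/6, and
# the hook logs the current LR once more at the end of each train epoch.
# A sketch that reconstructs lr_order under those assumptions:
step_lrs = [i / 6 for i in range(6)]  # LR logged at each of the 6 steps
expected = []
for epoch in range(3):
    expected.extend(step_lrs[2 * epoch : 2 * epoch + 2])  # per-step logs
    expected.append(step_lrs[2 * epoch + 1])  # extra log at phase end
assert expected == [0.0, 1 / 6, 1 / 6, 2 / 6, 3 / 6, 3 / 6, 4 / 6, 5 / 6, 5 / 6]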
def test_test_only_task(self):
    """
    Tests the task in test mode by running train_steps to make sure the
    train_steps run as expected on a test_only task.
    """
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True

    # delete the train dataset
    del test_config["dataset"]["train"]

    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task.prepare()
    test_state = test_only_task.get_classy_state()

    # We expect that the test-only state is in test mode, no matter what the
    # train state was
    self.assertFalse(test_state["train"])

    # num_updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # Verify the task will run
    trainer = LocalTrainer()
    trainer.train(test_only_task)
def train(datasets, model, loss, optimizer, meters, args):
    task = (
        ClassificationTask()
        .set_num_epochs(args.num_epochs)
        .set_loss(loss)
        .set_model(model)
        .set_optimizer(optimizer)
        .set_meters(meters)
    )
    for phase in ["train", "test"]:
        task.set_dataset(datasets[phase], phase)

    hooks = [LossLrMeterLoggingHook(log_freq=args.print_freq)]

    # show progress
    hooks.append(ProgressBarHook())

    if not args.skip_tensorboard:
        try:
            from tensorboardX import SummaryWriter

            tb_writer = SummaryWriter(log_dir=args.video_dir + "/tensorboard")
            hooks.append(TensorboardPlotHook(tb_writer))
        except ImportError:
            print("tensorboardX not installed, skipping tensorboard hooks")

    checkpoint_dir = f"{args.video_dir}/checkpoint/classy_checkpoint_{time.time()}"
    os.mkdir(checkpoint_dir)
    hooks.append(CheckpointHook(checkpoint_dir, input_args={}))

    task = task.set_hooks(hooks)

    trainer = LocalTrainer(use_gpu=args.cuda, num_dataloader_workers=args.num_workers)
    trainer.train(task)
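# A hypothetical invocation of train(), using the classy_vision builders
# seen elsewhere in this code. The attribute names on `args` mirror exactly
# what train() reads; the values are placeholders, and `config` is assumed
# to be a task config dict loaded beforehand.
from argparse import Namespace

from classy_vision.dataset import build_dataset
from classy_vision.losses import build_loss
from classy_vision.meters import AccuracyMeter
from classy_vision.models import build_model
from classy_vision.optim import build_optimizer

args = Namespace(
    num_epochs=10,
    print_freq=100,
    skip_tensorboard=True,
    video_dir="/tmp/video_run",
    cuda=False,
    num_workers=4,
)
datasets = {
    split: build_dataset(config["dataset"][split]) for split in ["train", "test"]
}
train(
    datasets,
    build_model(config["model"]),
    build_loss(config["loss"]),
    build_optimizer(config["optimizer"]),
    [AccuracyMeter(topk=[1])],
    args,
)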
def main(local_rank, c10d_backend, rdzv_init_url, max_world_size, classy_args):
    torch.manual_seed(0)
    set_video_backend(classy_args.video_backend)

    # load the config and set up the task
    config = load_json(classy_args.config_file)
    task = build_task(config)

    # load a checkpoint, if available
    checkpoint = load_checkpoint(classy_args.checkpoint_folder)
    task.set_checkpoint(checkpoint)

    pretrained_checkpoint = load_checkpoint(classy_args.pretrained_checkpoint_folder)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    hooks = [
        LossLrMeterLoggingHook(classy_args.log_freq),
        ModelComplexityHook(),
        TimeMetricsHook(),
    ]
    if classy_args.checkpoint_folder != "":
        args_dict = vars(classy_args)
        args_dict["config"] = config
        hooks.append(
            CheckpointHook(
                classy_args.checkpoint_folder,
                args_dict,
                checkpoint_period=classy_args.checkpoint_period,
            )
        )
    if classy_args.profiler:
        hooks.append(ProfilerHook())

    task.set_hooks(hooks)

    assert c10d_backend == Backend.NCCL or c10d_backend == Backend.GLOO
    if c10d_backend == torch.distributed.Backend.NCCL:
        # needed to enable NCCL error handling
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

    coordinator = CoordinatorP2P(
        c10d_backend=c10d_backend,
        init_method=rdzv_init_url,
        max_num_trainers=max_world_size,
        process_group_timeout=60000,
    )
    trainer = ElasticTrainer(
        use_gpu=classy_args.device == "gpu",
        num_dataloader_workers=classy_args.num_workers,
        local_rank=local_rank,
        elastic_coordinator=coordinator,
        input_args={},
    )
    trainer.train(task)
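# One hypothetical way to drive main() locally for smoke testing, assuming
# torch.multiprocessing and a file-based rendezvous URL. The rendezvous
# address, worker count, and parse_args() helper are placeholders, not the
# real elastic launch path for this script.
import torch.multiprocessing as mp

if __name__ == "__main__":
    classy_args = parse_args()  # hypothetical argument parser
    num_workers = 2
    # mp.spawn passes the process index as the first argument, which
    # matches main()'s local_rank parameter
    mp.spawn(
        main,
        args=(Backend.GLOO, "file:///tmp/classy_rdzv", num_workers, classy_args),
        nprocs=num_workers,
    )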
def test_test_only_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a training
    task checkpoint on a test_only task.
    """
    train_config = get_fast_test_task_config()
    train_config["num_epochs"] = 10
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True
    train_task = build_task(train_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    use_gpu = torch.cuda.is_available()

    # prepare the training task for the right device and train it
    train_task.prepare(use_gpu=use_gpu)
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(train_task)

    # use the trained task's state as the test-only task's checkpoint
    test_only_task.set_checkpoint(get_checkpoint_dict(train_task, {}, deep_copy=True))
    test_only_task.prepare(use_gpu=use_gpu)
    test_state = test_only_task.get_classy_state()

    # We expect the phase_idx to be reset for a test-only task
    self.assertEqual(test_state["phase_idx"], -1)

    # We expect that the test-only state is in test mode, no matter what the
    # train state was
    self.assertFalse(test_state["train"])

    # num_updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # train_phase_idx should be -1
    self.assertEqual(test_state["train_phase_idx"], -1)

    # Verify the task will run
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(test_only_task)
def test_train_only_task(self):
    """
    Tests that the task runs when only a train dataset is specified.
    """
    test_config = get_fast_test_task_config()

    # delete the test dataset from the config
    del test_config["dataset"]["test"]

    task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    task.prepare()

    # verify that the task can still be trained
    trainer = LocalTrainer()
    trainer.train(task)
def test_training(self):
    """Checks we can train a small MLP model."""
    config = get_test_mlp_task_config()
    task = (
        ClassificationTask()
        .set_num_epochs(10)
        .set_loss(build_loss(config["loss"]))
        .set_model(build_model(config["model"]))
        .set_optimizer(build_optimizer(config["optimizer"]))
        .set_meters([AccuracyMeter(topk=[1])])
        .set_hooks([LossLrMeterLoggingHook()])
    )
    for split in ["train", "test"]:
        dataset = build_dataset(config["dataset"][split])
        task.set_dataset(dataset, split)

    self.assertTrue(task is not None)

    trainer = LocalTrainer()
    trainer.train(task)

    accuracy = task.meters[0].value["top_1"]
    self.assertAlmostEqual(accuracy, 1.0)
def configure_hooks(args, config):
    hooks = [LossLrMeterLoggingHook(args.log_freq), ModelComplexityHook()]

    # make a folder to store checkpoints and tensorboard logging outputs
    suffix = datetime.now().isoformat()
    base_folder = f"{Path(__file__).parent}/output_{suffix}"
    if args.checkpoint_folder == "":
        args.checkpoint_folder = base_folder + "/checkpoints"
        os.makedirs(args.checkpoint_folder, exist_ok=True)

    logging.info(f"Logging outputs to {base_folder}")
    logging.info(f"Logging checkpoints to {args.checkpoint_folder}")

    if not args.skip_tensorboard:
        try:
            from torch.utils.tensorboard import SummaryWriter

            os.makedirs(Path(base_folder) / "tensorboard", exist_ok=True)
            tb_writer = SummaryWriter(log_dir=Path(base_folder) / "tensorboard")
            hooks.append(TensorboardPlotHook(tb_writer))
        except ImportError:
            logging.warning("tensorboard not installed, skipping tensorboard hooks")

    args_dict = vars(args)
    args_dict["config"] = config
    hooks.append(
        CheckpointHook(
            args.checkpoint_folder, args_dict, checkpoint_period=args.checkpoint_period
        )
    )

    if args.profiler:
        hooks.append(ProfilerHook())
    if args.show_progress:
        hooks.append(ProgressBarHook())
    if args.visdom_server != "":
        hooks.append(VisdomHook(args.visdom_server, args.visdom_port))

    return hooks
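# Hypothetical wiring of configure_hooks() into a training run, assuming
# `args` comes from an argparse parser exposing the flags read above and
# that the task config is loaded from a JSON file as in main():
config = load_json(args.config_file)
task = build_task(config).set_hooks(configure_hooks(args, config))
trainer = LocalTrainer()
trainer.train(task)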