def test_checkpointing(self): """ Tests checkpointing by running train_steps to make sure the train_steps run the same way after loading from a checkpoint. """ config = get_fast_test_task_config() task = build_task(config).set_hooks([LossLrMeterLoggingHook()]) task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()]) task.set_use_gpu(torch.cuda.is_available()) # prepare the tasks for the right device task.prepare() # test in both train and test mode for _ in range(2): task.advance_phase() # set task's state as task_2's checkpoint task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True)) task_2.prepare() # task 2 should have the same state self._compare_states(task.get_classy_state(), task_2.get_classy_state()) # this tests that both states' iterators return the same samples sample = next(task.get_data_iterator()) sample_2 = next(task_2.get_data_iterator()) self._compare_samples(sample, sample_2) # test that the train step runs the same way on both states # and the loss remains the same task.train_step() task_2.train_step() self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_prepare(self):
    pre_train_config = self._get_pre_train_config()
    pre_train_task = build_task(pre_train_config)
    pre_train_task.prepare()
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    fine_tuning_config = self._get_fine_tuning_config()
    fine_tuning_task = build_task(fine_tuning_config)

    # cannot prepare a fine tuning task without a pre training checkpoint
    with self.assertRaises(Exception):
        fine_tuning_task.prepare()

    fine_tuning_task.set_pretrained_checkpoint(checkpoint)
    fine_tuning_task.prepare()

    # test a fine tuning task with incompatible heads
    fine_tuning_config = self._get_fine_tuning_config(head_num_classes=10)
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task.set_pretrained_checkpoint(checkpoint)

    # cannot prepare a fine tuning task with a pre training checkpoint which
    # has incompatible heads
    with self.assertRaises(Exception):
        fine_tuning_task.prepare()

    fine_tuning_task.set_pretrained_checkpoint(checkpoint).set_reset_heads(True)
    fine_tuning_task.prepare()
def test_checkpointing(self):
    # make checkpoint directory
    checkpoint_folder = self.base_dir + "/checkpoint/"
    os.mkdir(checkpoint_folder)

    config = get_fast_test_task_config()
    cuda_available = torch.cuda.is_available()
    task = build_task(config)
    task.prepare(use_gpu=cuda_available)

    # create a checkpoint hook
    checkpoint_hook = CheckpointHook(checkpoint_folder, {}, phase_types=["train"])

    # call the on end phase function
    checkpoint_hook.on_phase_end(task)

    # we should be able to train a task using the checkpoint on all available
    # devices
    for use_gpu in {False, cuda_available}:
        # load the checkpoint
        checkpoint = load_checkpoint(checkpoint_folder)

        # create a new task
        task = build_task(config)

        # set the checkpoint
        task.set_checkpoint(checkpoint)

        task.prepare(use_gpu=use_gpu)

        # we should be able to run the trainer using the checkpoint
        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(task)
def test_train_parametric_loss(self):
    heads_num_classes = 100
    pre_train_config = self._get_pre_train_config(
        head_num_classes=heads_num_classes
    )
    pre_train_config["loss"] = {
        "name": "batchnorm_cross_entropy_loss",
        "num_classes": heads_num_classes,
    }
    pre_train_task = build_task(pre_train_config)
    trainer = LocalTrainer()
    trainer.train(pre_train_task)
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    fine_tuning_config = self._get_fine_tuning_config(
        head_num_classes=heads_num_classes
    )
    fine_tuning_config["loss"] = {
        "name": "batchnorm_cross_entropy_loss",
        "num_classes": heads_num_classes,
    }

    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task._set_pretrained_checkpoint_dict(copy.deepcopy(checkpoint))

    # run in test mode to compare the loss state. Since we have a BatchNorm module
    # in the loss, its moving mean/std should be unchanged when we run in
    # test-only mode
    fine_tuning_task.set_test_only(True)
    loss_state = copy.deepcopy(fine_tuning_task.loss.get_classy_state())
    trainer.train(fine_tuning_task)
    self._compare_state_dict(loss_state, fine_tuning_task.loss.get_classy_state())
def test_checkpointing(self): """ Tests checkpointing by running train_steps to make sure the train_steps run the same way after loading from a checkpoint. """ config = get_fast_test_task_config() task = build_task(config).set_hooks([LossLrMeterLoggingHook()]) task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()]) task.set_use_gpu(torch.cuda.is_available()) # only train 1 phase at a time trainer = LimitedPhaseTrainer(num_phases=1) while not task.done_training(): # set task's state as task_2's checkpoint task_2._set_checkpoint_dict( get_checkpoint_dict(task, {}, deep_copy=True)) # task 2 should have the same state before training self._compare_states(task.get_classy_state(), task_2.get_classy_state()) # train for one phase trainer.train(task) trainer.train(task_2) # task 2 should have the same state after training self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_update_classy_model(self):
    """
    Tests that update_classy_model successfully updates from a checkpoint
    """
    config = get_fast_test_task_config()
    task = build_task(config)
    trainer = LocalTrainer()
    trainer.train(task)

    for reset_heads in [False, True]:
        task_2 = build_task(config)
        # prepare task_2 for the right device
        task_2.prepare()
        update_classy_model(
            task_2.model, task.model.get_classy_state(deep_copy=True), reset_heads
        )
        self._compare_model_state(
            task.model.get_classy_state(),
            task_2.model.get_classy_state(),
            check_heads=not reset_heads,
        )
        if reset_heads:
            # the model head states should be different
            with self.assertRaises(Exception):
                self._compare_model_state(
                    task.model.get_classy_state(),
                    task_2.model.get_classy_state(),
                    check_heads=True,
                )
def test_final_train_checkpoint(self):
    """Test that a train phase checkpoint with a where of 1.0 can be loaded"""
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks(
        [CheckpointHook(self.base_dir, {}, phase_types=["train"])]
    )
    task_2 = build_task(config)

    use_gpu = torch.cuda.is_available()
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(task)

    # load the final train checkpoint
    checkpoint = load_checkpoint(self.base_dir)

    # make sure fetching the where raises an exception, which means that
    # where is >= 1.0
    with self.assertRaises(Exception):
        task.where

    # set task_2's state as task's final train checkpoint
    task_2.set_checkpoint(checkpoint)
    task_2.prepare(use_gpu=use_gpu)

    # we should be able to train the task
    trainer.train(task_2)
def test_fp16_grad_compression(self):
    # there is no API defined to check that a DDP hook has been enabled, so we
    # just test that we set the right variables
    config = copy.deepcopy(get_fast_test_task_config())
    task = build_task(config)
    self.assertFalse(task.fp16_grad_compress)

    config.setdefault("distributed", {})
    config["distributed"]["fp16_grad_compress"] = True

    task = build_task(config)
    self.assertTrue(task.fp16_grad_compress)
def test_build_task(self):
    config = get_test_task_config()
    task = build_task(config)
    self.assertTrue(isinstance(task, ClassificationTask))

    # check that AMP is disabled by default
    self.assertIsNone(task.amp_args)

    # test a valid AMP opt level
    config = copy.deepcopy(config)
    config["amp_args"] = {"opt_level": "O1"}
    task = build_task(config)
    self.assertTrue(isinstance(task, ClassificationTask))
def test_build_task(self):
    config = self._get_fine_tuning_config()
    task = build_task(config)
    self.assertIsInstance(task, FineTuningTask)

    config = self._get_fine_tuning_config(pretrained_checkpoint=True)

    with mock.patch("classy_vision.tasks.FineTuningTask.set_pretrained_checkpoint"):
        task = build_task(config)
    self.assertIsInstance(task, FineTuningTask)
def test_update_classy_state(self):
    """
    Tests that update_classy_state successfully updates from a checkpoint
    """
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)
    task_2.prepare()

    trainer = LocalTrainer()
    trainer.train(task)
    update_classy_state(task_2, task.get_classy_state(deep_copy=True))
    self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_checkpointing_different_device(self):
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)

    for use_gpu in [True, False]:
        task.prepare(use_gpu=use_gpu)

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))

        # we should be able to run the trainer using state from a different device
        trainer = LocalTrainer(use_gpu=not use_gpu)
        trainer.train(task_2)
def test_training(self):
    # Test Apex AMP training
    config = get_fast_test_task_config()
    config["amp_args"] = {"opt_level": "O2"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)

    # Test PyTorch AMP training
    config["amp_args"] = {"amp_type": "pytorch"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)
def test_from_checkpoint(self):
    config = get_test_task_config()
    for use_head in [True, False]:
        config["model"] = self.get_model_config(use_head)
        task = build_task(config)
        task.prepare()
        checkpoint_folder = f"{self.base_dir}/{use_head}/"
        input_args = {"config": config}

        # Simulate training by setting the model parameters to zero
        for param in task.model.parameters():
            param.data.zero_()

        checkpoint_hook = CheckpointHook(
            checkpoint_folder, input_args, phase_types=["train"]
        )

        # Create checkpoint dir, save checkpoint
        os.mkdir(checkpoint_folder)
        checkpoint_hook.on_start(task)
        task.train = True
        checkpoint_hook.on_phase_end(task)

        # Model should be checkpointed. Load and compare
        checkpoint = load_checkpoint(checkpoint_folder)

        model = ClassyModel.from_checkpoint(checkpoint)
        self.assertTrue(isinstance(model, MyTestModel))

        # All parameters must be zero
        for param in model.parameters():
            self.assertTrue(torch.all(param.data == 0))
def execute_hook(self, config, torchscript_folder, torchscript_hook) -> None:
    task = build_task(config)
    task.prepare()

    # create checkpoint dir, verify on_start hook runs
    os.mkdir(torchscript_folder)
    torchscript_hook.on_start(task)

    task.train = True
    # call the on end function
    torchscript_hook.on_end(task)

    # load the torchscript file
    torchscript_file_name = (
        f"{torchscript_hook.torchscript_folder}/{TORCHSCRIPT_FILE}"
    )
    torchscript = torch.jit.load(torchscript_file_name)

    # compare the model loaded from the checkpoint vs the torchscript model
    with torch.no_grad():
        batchsize = 1
        model = task.model
        input_data = torch.randn(
            (batchsize,) + model.input_shape, dtype=torch.float
        )
        if torch.cuda.is_available():
            input_data = input_data.cuda()
            model = model.cuda()
            torchscript = torchscript.cuda()
        checkpoint_out = model(input_data)
        torchscript_out = torchscript(input_data)
        self.assertTrue(torch.allclose(checkpoint_out, torchscript_out, atol=1e-5))
def test_training(self):
    config = get_fast_test_task_config()
    config["amp_args"] = {"opt_level": "O2"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)
def test_logging(self, mock_get_rank: mock.MagicMock) -> None:
    """
    Test that the logging happens as expected and the loss and lr values are
    correct.
    """
    rank = 5
    mock_get_rank.return_value = rank

    # set up the task and state
    config = get_test_task_config()
    config["dataset"]["train"]["batchsize_per_replica"] = 2
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.prepare()

    losses = [1.2, 2.3, 3.4, 4.5]

    local_variables = {}
    task.phase_idx = 0

    for log_freq in [5, None]:
        # create a loss lr meter hook
        loss_lr_meter_hook = LossLrMeterLoggingHook(log_freq=log_freq)

        # check that _log_loss_meters() is called after on_step() every
        # log_freq batches and after on_phase_end(), and that _log_lr() is
        # called after on_step() every log_freq batches and after
        # on_phase_end()
        with mock.patch.object(loss_lr_meter_hook, "_log_loss_meters") as mock_fn:
            with mock.patch.object(loss_lr_meter_hook, "_log_lr") as mock_lr_fn:
                num_batches = 20

                for i in range(num_batches):
                    task.losses = list(range(i))
                    loss_lr_meter_hook.on_step(task, local_variables)
                    if log_freq is not None and i and i % log_freq == 0:
                        mock_fn.assert_called_with(task, local_variables)
                        mock_fn.reset_mock()
                        mock_lr_fn.assert_called_with(task, local_variables)
                        mock_lr_fn.reset_mock()
                        continue
                    mock_fn.assert_not_called()
                    mock_lr_fn.assert_not_called()

                loss_lr_meter_hook.on_phase_end(task, local_variables)
                mock_fn.assert_called_with(task, local_variables)
                if task.train:
                    mock_lr_fn.assert_called_with(task, local_variables)

        # test _log_loss_meters() and _log_lr()
        task.losses = losses

        with self.assertLogs():
            loss_lr_meter_hook._log_loss_meters(task, local_variables)
            loss_lr_meter_hook._log_lr(task, local_variables)

        task.phase_idx += 1
def test_logged_lr(self):
    # Mock LR scheduler
    def scheduler_mock(where):
        return where

    mock_lr_scheduler = mock.Mock(side_effect=scheduler_mock)
    mock_lr_scheduler.update_interval = UpdateInterval.STEP
    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 10
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.optimizer.param_schedulers["lr"] = mock_lr_scheduler
    trainer = LocalTrainer()

    # 2 LR updates per train epoch over 3 epochs = 6 logged values
    lr_order = [0.0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6]
    lr_list = []

    class LRLoggingHook(ClassyHook):
        on_end = ClassyHook._noop
        on_phase_end = ClassyHook._noop
        on_phase_start = ClassyHook._noop
        on_start = ClassyHook._noop

        def on_step(self, task):
            if task.train:
                lr_list.append(task.optimizer.parameters.lr)

    hook = LRLoggingHook()
    task.set_hooks([hook])

    trainer.train(task)

    self.assertEqual(lr_list, lr_order)
def main(local_rank, c10d_backend, rdzv_init_url, max_world_size, classy_args):
    torch.manual_seed(0)
    set_video_backend(classy_args.video_backend)

    # Loads config, sets up task
    config = load_json(classy_args.config_file)
    task = build_task(config)

    # Load checkpoint, if available
    checkpoint = load_checkpoint(classy_args.checkpoint_folder)
    task.set_checkpoint(checkpoint)

    pretrained_checkpoint = load_checkpoint(classy_args.pretrained_checkpoint_folder)
    if pretrained_checkpoint is not None:
        assert isinstance(
            task, FineTuningTask
        ), "Can only use a pretrained checkpoint for fine tuning tasks"
        task.set_pretrained_checkpoint(pretrained_checkpoint)

    hooks = [
        LossLrMeterLoggingHook(classy_args.log_freq),
        ModelComplexityHook(),
        TimeMetricsHook(),
    ]

    if classy_args.checkpoint_folder != "":
        args_dict = vars(classy_args)
        args_dict["config"] = config
        hooks.append(
            CheckpointHook(
                classy_args.checkpoint_folder,
                args_dict,
                checkpoint_period=classy_args.checkpoint_period,
            )
        )
    if classy_args.profiler:
        hooks.append(ProfilerHook())

    task.set_hooks(hooks)

    assert c10d_backend == Backend.NCCL or c10d_backend == Backend.GLOO
    if c10d_backend == torch.distributed.Backend.NCCL:
        # needed to enable NCCL error handling
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

    coordinator = CoordinatorP2P(
        c10d_backend=c10d_backend,
        init_method=rdzv_init_url,
        max_num_trainers=max_world_size,
        process_group_timeout=60000,
    )
    trainer = ElasticTrainer(
        use_gpu=classy_args.device == "gpu",
        num_dataloader_workers=classy_args.num_workers,
        local_rank=local_rank,
        elastic_coordinator=coordinator,
        input_args={},
    )
    trainer.train(task)
def test_logged_lr(self):
    class SchedulerMock(ClassyParamScheduler):
        def __call__(self, where):
            return where

    mock_lr_scheduler = SchedulerMock(UpdateInterval.STEP)
    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 10
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.set_optimizer_schedulers({"lr": mock_lr_scheduler})
    trainer = LocalTrainer()

    # 2 LR updates per train epoch over 3 epochs = 6 logged values
    lr_order = [0.0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6]
    lr_list = []

    class LRLoggingHook(ClassyHook):
        on_end = ClassyHook._noop
        on_phase_end = ClassyHook._noop
        on_phase_start = ClassyHook._noop
        on_start = ClassyHook._noop

        def on_step(self, task):
            if task.train:
                lr_list.append(task.optimizer.options_view.lr)

    hook = LRLoggingHook()
    task.set_hooks([hook])

    trainer.train(task)

    self.assertEqual(lr_list, lr_order)
def test_streaming_dataset(self):
    """
    Test that streaming datasets return the correct number of batches, and that
    the length is also calculated correctly.
    """
    config = get_test_task_config()
    dataset_config = {
        "name": "synthetic_image_streaming",
        "split": "train",
        "crop_size": 224,
        "class_ratio": 0.5,
        "num_samples": 2000,
        "length": 4000,
        "seed": 0,
        "batchsize_per_replica": 32,
        "use_shuffle": True,
    }
    expected_batches = 62
    config["dataset"]["train"] = dataset_config
    task = build_task(config)
    task.prepare()
    task.advance_phase()

    # test that the number of batches expected is correct
    self.assertEqual(task.num_batches_per_phase, expected_batches)

    # test that the data iterator returns the expected number of batches
    data_iterator = task.data_iterator
    self._test_number_of_batches(data_iterator, expected_batches)

    # test that the dataloader can be rebuilt
    task.build_dataloaders_for_current_phase()
    task.create_data_iterators()
    data_iterator = task.data_iterator
    self._test_number_of_batches(data_iterator, expected_batches)
def test_test_only_task(self):
    """
    Tests the task in test mode by running train_steps to make sure the
    train_steps run as expected on a test_only task
    """
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True

    # delete train dataset
    del test_config["dataset"]["train"]

    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task.prepare()
    test_state = test_only_task.get_classy_state()

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # Verify task will run
    trainer = LocalTrainer()
    trainer.train(test_only_task)
def test_logged_lr(self):
    # Mock LR scheduler
    def scheduler_mock(where):
        return where

    mock_lr_scheduler = mock.Mock(side_effect=scheduler_mock)
    mock_lr_scheduler.update_interval = UpdateInterval.STEP
    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 5
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.optimizer.lr_scheduler = mock_lr_scheduler
    trainer = LocalTrainer()

    # 2 LR updates per train epoch; at the end of each train epoch, the LR is
    # logged an additional time
    lr_order = [0.0, 1 / 6, 1 / 6, 2 / 6, 3 / 6, 3 / 6, 4 / 6, 5 / 6, 5 / 6]
    lr_list = []

    def mock_log_lr(task: ClassyTask, local_variables) -> None:
        lr_list.append(task.optimizer.lr)

    with mock.patch.object(LossLrMeterLoggingHook, "_log_lr", side_effect=mock_log_lr):
        hook = LossLrMeterLoggingHook(1)
        task.set_hooks([hook])
        trainer.train(task)

    self.assertEqual(lr_list, lr_order)
def test_test_only_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the train_steps
    run the same way after loading from a training task checkpoint on a
    test_only task.
    """
    train_config = get_fast_test_task_config()
    train_config["num_epochs"] = 10
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True
    train_task = build_task(train_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])

    use_gpu = torch.cuda.is_available()

    # prepare the tasks for the right device
    train_task.prepare(use_gpu=use_gpu)

    # train the training task
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(train_task)

    # set the train task's state as the test_only task's checkpoint
    test_only_task.set_checkpoint(
        get_checkpoint_dict(train_task, {}, deep_copy=True)
    )
    test_only_task.prepare(use_gpu=use_gpu)
    test_state = test_only_task.get_classy_state()

    # We expect the phase idx to be different for a test only task
    self.assertEqual(test_state["phase_idx"], -1)

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # train_phase_idx should be -1
    self.assertEqual(test_state["train_phase_idx"], -1)

    # Verify task will run
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(test_only_task)
def test_train(self):
    pre_train_config = self._get_pre_train_config(head_num_classes=100)
    pre_train_task = build_task(pre_train_config)
    trainer = LocalTrainer()
    trainer.train(pre_train_task)
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    for reset_heads, heads_num_classes in [(False, 100), (True, 20)]:
        for freeze_trunk in [True, False]:
            fine_tuning_config = self._get_fine_tuning_config(
                head_num_classes=heads_num_classes
            )
            fine_tuning_task = build_task(fine_tuning_config)
            fine_tuning_task = (
                fine_tuning_task._set_pretrained_checkpoint_dict(
                    copy.deepcopy(checkpoint)
                )
                .set_reset_heads(reset_heads)
                .set_freeze_trunk(freeze_trunk)
            )

            # run in test mode to compare the model state
            fine_tuning_task.set_test_only(True)
            trainer.train(fine_tuning_task)
            self._compare_model_state(
                pre_train_task.model.get_classy_state(),
                fine_tuning_task.model.get_classy_state(),
                check_heads=not reset_heads,
            )

            # run in train mode to check accuracy
            fine_tuning_task.set_test_only(False)
            trainer.train(fine_tuning_task)

            if freeze_trunk:
                # if trunk is frozen the states should be the same
                self._compare_model_state(
                    pre_train_task.model.get_classy_state(),
                    fine_tuning_task.model.get_classy_state(),
                    check_heads=False,
                )
            else:
                # trunk isn't frozen, the states should be different
                with self.assertRaises(Exception):
                    self._compare_model_state(
                        pre_train_task.model.get_classy_state(),
                        fine_tuning_task.model.get_classy_state(),
                        check_heads=False,
                    )

            accuracy = fine_tuning_task.meters[0].value["top_1"]
            self.assertAlmostEqual(accuracy, 1.0)
def test_train(self):
    config = get_test_mlp_task_config()
    task = build_task(config)
    num_samples = 10
    precise_batch_norm_hook = PreciseBatchNormHook(num_samples)
    task.set_hooks([precise_batch_norm_hook])
    task.prepare()
    trainer = ClassyTrainer()
    trainer.train(task)
def test_build_task(self):
    config = get_test_task_config()
    task = build_task(config)
    self.assertTrue(isinstance(task, ClassificationTask))

    # check that AMP is disabled by default
    self.assertIsNone(task.amp_opt_level)

    # test a valid AMP opt level
    config = copy.deepcopy(config)
    config["amp_opt_level"] = "O1"
    task = build_task(config)
    self.assertTrue(isinstance(task, ClassificationTask))

    # test an invalid AMP opt level
    config = copy.deepcopy(config)
    config["amp_opt_level"] = "O5"
    with self.assertRaises(Exception):
        task = build_task(config)
def test_train_step(self):
    # test that the model can be run in a train step
    model = models.resnet34(pretrained=False)
    classy_model = ClassyModelWrapper(model)

    config = get_fast_test_task_config()
    task = build_task(config)
    task.set_model(classy_model)
    trainer = LocalTrainer()
    trainer.train(task)
def test_synchronize_losses_when_losses_empty(self):
    config = get_fast_test_task_config()
    task = build_task(config)
    task.prepare()
    task.set_use_gpu(torch.cuda.is_available())

    # Losses should be empty when creating task
    self.assertEqual(len(task.losses), 0)

    task.synchronize_losses()
def test_synchronize_losses_non_distributed(self):
    """
    Tests that synchronize losses has no side effects in a non-distributed setting.
    """
    test_config = get_fast_test_task_config()
    task = build_task(test_config)
    task.prepare()

    old_losses = copy.deepcopy(task.losses)
    task.synchronize_losses()
    self.assertEqual(old_losses, task.losses)
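# Note: the following is an illustrative sketch only, not part of the original
# test suite. Many of the tests above rely on comparison helpers such as
# self._compare_states and self._compare_model_state defined on their test base
# classes; the bodies of those helpers are not shown here. A minimal recursive
# comparison of two classy state dicts could look roughly like this, assuming the
# states are nested dicts containing tensors and plain Python values.
import torch


def _compare_states_sketch(test_case, state_1, state_2):
    """Recursively assert that two (possibly nested) state dicts are equal."""
    test_case.assertEqual(set(state_1.keys()), set(state_2.keys()))
    for key, value_1 in state_1.items():
        value_2 = state_2[key]
        if isinstance(value_1, dict):
            # recurse into nested sub-states (e.g. model / optimizer state)
            _compare_states_sketch(test_case, value_1, value_2)
        elif isinstance(value_1, torch.Tensor):
            # tensors are compared elementwise with a tolerance
            test_case.assertTrue(torch.allclose(value_1, value_2))
        else:
            # scalars, strings and other plain values must match exactly
            test_case.assertEqual(value_1, value_2)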