def test_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the train_steps
    run the same way after loading from a checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
    task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

    task.set_use_gpu(torch.cuda.is_available())

    # prepare the tasks for the right device
    task.prepare()

    # test in both train and test mode
    for _ in range(2):
        task.advance_phase()

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))
        task_2.prepare()

        # task 2 should have the same state
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

        # this tests that both states' iterators return the same samples
        sample = next(task.get_data_iterator())
        sample_2 = next(task_2.get_data_iterator())
        self._compare_samples(sample, sample_2)

        # test that the train step runs the same way on both states
        # and the loss remains the same
        task.train_step()
        task_2.train_step()
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

def save(self, stream):
    checkpoint_state = get_checkpoint_dict(self.task, self.input_args)
    checkpoint_state["advance_to_next_phase"] = self.advance_to_next_phase
    checkpoint_state["skip_current_phase"] = self.skip_current_phase
    checkpoint_state["run_start_hooks"] = self.run_start_hooks
    torch.save(checkpoint_state, stream)

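# A minimal sketch of the matching restore path for the save() method above,
# assuming a hypothetical load(stream) counterpart on the same object: the state
# written with torch.save is read back with torch.load, the extra bookkeeping
# flags are popped off, and the remaining checkpoint dict is handed back to the
# task the way the checkpointing tests do (set_checkpoint / _set_checkpoint_dict);
# the exact restore call may differ by Classy Vision version.
def load(self, stream):
    checkpoint_state = torch.load(stream, map_location="cpu")
    self.advance_to_next_phase = checkpoint_state.pop("advance_to_next_phase")
    self.skip_current_phase = checkpoint_state.pop("skip_current_phase")
    self.run_start_hooks = checkpoint_state.pop("run_start_hooks")
    self.task._set_checkpoint_dict(checkpoint_state)
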
def test_train_parametric_loss(self):
    heads_num_classes = 100
    pre_train_config = self._get_pre_train_config(head_num_classes=heads_num_classes)
    pre_train_config["loss"] = {
        "name": "batchnorm_cross_entropy_loss",
        "num_classes": heads_num_classes,
    }
    pre_train_task = build_task(pre_train_config)
    trainer = LocalTrainer()
    trainer.train(pre_train_task)
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    fine_tuning_config = self._get_fine_tuning_config(head_num_classes=heads_num_classes)
    fine_tuning_config["loss"] = {
        "name": "batchnorm_cross_entropy_loss",
        "num_classes": heads_num_classes,
    }
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task._set_pretrained_checkpoint_dict(copy.deepcopy(checkpoint))

    # run in test mode to compare the loss state. Since we have a BatchNorm module in
    # the loss, its moving mean/std should be unchanged when we run in test-only mode
    fine_tuning_task.set_test_only(True)
    loss_state = copy.deepcopy(fine_tuning_task.loss.get_classy_state())
    trainer.train(fine_tuning_task)
    self._compare_state_dict(loss_state, fine_tuning_task.loss.get_classy_state())

def test_prepare(self):
    pre_train_config = self._get_pre_train_config()
    pre_train_task = build_task(pre_train_config)
    pre_train_task.prepare()
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    fine_tuning_config = self._get_fine_tuning_config()
    fine_tuning_task = build_task(fine_tuning_config)

    # cannot prepare a fine tuning task without a pre training checkpoint
    with self.assertRaises(Exception):
        fine_tuning_task.prepare()

    fine_tuning_task.set_pretrained_checkpoint(checkpoint)
    fine_tuning_task.prepare()

    # test a fine tuning task with incompatible heads
    fine_tuning_config = self._get_fine_tuning_config(head_num_classes=10)
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task.set_pretrained_checkpoint(checkpoint)

    # cannot prepare a fine tuning task with a pre training checkpoint which
    # has incompatible heads
    with self.assertRaises(Exception):
        fine_tuning_task.prepare()

    fine_tuning_task.set_pretrained_checkpoint(checkpoint).set_reset_heads(True)
    fine_tuning_task.prepare()

def test_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the train_steps
    run the same way after loading from a checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
    task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

    task.set_use_gpu(torch.cuda.is_available())

    # only train 1 phase at a time
    trainer = LimitedPhaseTrainer(num_phases=1)

    while not task.done_training():
        # set task's state as task_2's checkpoint
        task_2._set_checkpoint_dict(get_checkpoint_dict(task, {}, deep_copy=True))

        # task 2 should have the same state before training
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

        # train for one phase
        trainer.train(task)
        trainer.train(task_2)

        # task 2 should have the same state after training
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

def test_checkpointing_different_device(self):
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)

    for use_gpu in [True, False]:
        task.prepare(use_gpu=use_gpu)

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))

        # we should be able to run the trainer using state from a different device
        trainer = LocalTrainer(use_gpu=not use_gpu)
        trainer.train(task_2)

def _save_checkpoint(self, task, filename):
    if getattr(task, "test_only", False):
        return
    assert PathManager.exists(
        self.checkpoint_folder
    ), "Checkpoint folder '{}' deleted unexpectedly".format(self.checkpoint_folder)

    # save checkpoint:
    logging.info("Saving checkpoint to '{}'...".format(self.checkpoint_folder))
    checkpoint_file = save_checkpoint(
        self.checkpoint_folder, get_checkpoint_dict(task, self.input_args)
    )

    # make copy of checkpoint that won't be overwritten:
    PathManager.copy(checkpoint_file, f"{self.checkpoint_folder}/{filename}")

def test_test_only_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the train_steps
    run the same way after loading from a training task checkpoint on a
    test_only task.
    """
    train_config = get_fast_test_task_config()
    train_config["num_epochs"] = 10
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True
    train_task = build_task(train_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])

    use_gpu = torch.cuda.is_available()

    # prepare the tasks for the right device
    train_task.prepare(use_gpu=use_gpu)

    # train the training task
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(train_task)

    # set the train task's state as the test_only task's checkpoint
    test_only_task.set_checkpoint(get_checkpoint_dict(train_task, {}, deep_copy=True))
    test_only_task.prepare(use_gpu=use_gpu)
    test_state = test_only_task.get_classy_state()

    # We expect the phase idx to be different for a test only task
    self.assertEqual(test_state["phase_idx"], -1)

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # train_phase_idx should be -1
    self.assertEqual(test_state["train_phase_idx"], -1)

    # Verify task will run
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(test_only_task)

def test_train(self):
    pre_train_config = self._get_pre_train_config(head_num_classes=100)
    pre_train_task = build_task(pre_train_config)
    trainer = LocalTrainer()
    trainer.train(pre_train_task)
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    for reset_heads, heads_num_classes in [(False, 100), (True, 20)]:
        for freeze_trunk in [True, False]:
            fine_tuning_config = self._get_fine_tuning_config(
                head_num_classes=heads_num_classes
            )
            fine_tuning_task = build_task(fine_tuning_config)
            fine_tuning_task = (
                fine_tuning_task._set_pretrained_checkpoint_dict(
                    copy.deepcopy(checkpoint)
                )
                .set_reset_heads(reset_heads)
                .set_freeze_trunk(freeze_trunk)
            )

            # run in test mode to compare the model state
            fine_tuning_task.set_test_only(True)
            trainer.train(fine_tuning_task)
            self._compare_model_state(
                pre_train_task.model.get_classy_state(),
                fine_tuning_task.model.get_classy_state(),
                check_heads=not reset_heads,
            )

            # run in train mode to check accuracy
            fine_tuning_task.set_test_only(False)
            trainer.train(fine_tuning_task)
            if freeze_trunk:
                # if trunk is frozen the states should be the same
                self._compare_model_state(
                    pre_train_task.model.get_classy_state(),
                    fine_tuning_task.model.get_classy_state(),
                    check_heads=False,
                )
            else:
                # trunk isn't frozen, the states should be different
                with self.assertRaises(Exception):
                    self._compare_model_state(
                        pre_train_task.model.get_classy_state(),
                        fine_tuning_task.model.get_classy_state(),
                        check_heads=False,
                    )

            accuracy = fine_tuning_task.meters[0].value["top_1"]
            self.assertAlmostEqual(accuracy, 1.0)

def _save_checkpoint(self, task, filename):
    if getattr(task, "test_only", False):
        return
    assert os.path.exists(
        self.checkpoint_folder
    ), "Checkpoint folder '{}' deleted unexpectedly".format(self.checkpoint_folder)

    # save checkpoint:
    logging.info("Saving checkpoint to '{}'...".format(self.checkpoint_folder))
    checkpoint_file = save_checkpoint(
        self.checkpoint_folder, get_checkpoint_dict(task, self.input_args)
    )

    # make copy of checkpoint that won't be overwritten:
    if checkpoint_file:
        tmp_dir = tempfile.mkdtemp()
        tmp_file = os.path.join(tmp_dir, filename)
        copy2(checkpoint_file, tmp_file)
        move(tmp_file, os.path.join(self.checkpoint_folder, filename))

def _save_checkpoint(self, task, filename):
    if getattr(task, "test_only", False):
        return
    assert PathManager.exists(
        self.checkpoint_folder
    ), "Checkpoint folder '{}' deleted unexpectedly".format(self.checkpoint_folder)

    for prefix in gfs_prefix_list:
        if self.checkpoint_folder.startswith(prefix):
            logging.warning(
                "GFS is deprecating... please save checkpoint to manifold!"
            )
            break

    # save checkpoint:
    logging.info("Saving checkpoint to '{}'...".format(self.checkpoint_folder))
    checkpoint_file = save_checkpoint(
        self.checkpoint_folder, get_checkpoint_dict(task, self.input_args)
    )

    # make copy of checkpoint that won't be overwritten:
    PathManager.copy(checkpoint_file, f"{self.checkpoint_folder}/{filename}")

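# The _save_checkpoint helpers above are internals of a checkpointing hook that
# calls get_checkpoint_dict on the task and persists the result with
# save_checkpoint. A minimal sketch of attaching such a hook to a task, assuming
# the CheckpointHook exported by classy_vision.hooks and a
# (checkpoint_folder, input_args, phase_types) constructor; the exact signature
# may differ by Classy Vision version, and the folder path is illustrative.
from classy_vision.hooks import CheckpointHook
from classy_vision.tasks import build_task

config = get_fast_test_task_config()  # same test config helper used in the snippets above
task = build_task(config)
task.set_hooks([CheckpointHook("/tmp/checkpoints", input_args={}, phase_types=["train"])])
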
def test_prepare(self):
    pre_train_config = self._get_pre_train_config()
    pre_train_task = build_task(pre_train_config)
    pre_train_task.prepare()
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    fine_tuning_config = self._get_fine_tuning_config()
    fine_tuning_task = build_task(fine_tuning_config)

    # test: cannot prepare a fine tuning task without a pre-trained checkpoint
    with self.assertRaises(Exception):
        fine_tuning_task.prepare()

    # test: prepare should succeed after the pre-trained checkpoint is set
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task._set_pretrained_checkpoint_dict(checkpoint)
    fine_tuning_task.prepare()

    # test: prepare should succeed if a pre-trained checkpoint is provided in the
    # config
    fine_tuning_config = self._get_fine_tuning_config(pretrained_checkpoint=True)
    fine_tuning_task = build_task(fine_tuning_config)
    with mock.patch(
        "classy_vision.tasks.fine_tuning_task.load_and_broadcast_checkpoint",
        return_value=checkpoint,
    ):
        fine_tuning_task.prepare()

    # test: a fine tuning task with incompatible heads and a manually set
    # pre-trained checkpoint should fail to prepare if the heads are not reset
    fine_tuning_config = self._get_fine_tuning_config(head_num_classes=10)
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task._set_pretrained_checkpoint_dict(checkpoint)
    with self.assertRaises(Exception):
        fine_tuning_task.prepare()

    # test: a fine tuning task with incompatible heads and a manually set
    # pre-trained checkpoint should prepare successfully if the heads are reset
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task._set_pretrained_checkpoint_dict(
        copy.deepcopy(checkpoint)
    ).set_reset_heads(True)
    fine_tuning_task.prepare()

    # test: a fine tuning task with incompatible heads and the pre-trained
    # checkpoint provided in the config should fail to prepare
    fine_tuning_config = self._get_fine_tuning_config(
        head_num_classes=10, pretrained_checkpoint=True
    )
    fine_tuning_task = build_task(fine_tuning_config)
    with mock.patch(
        "classy_vision.tasks.fine_tuning_task.load_and_broadcast_checkpoint",
        return_value=copy.deepcopy(checkpoint),
    ), self.assertRaises(Exception):
        fine_tuning_task.prepare()

    # test: a fine tuning task with incompatible heads and the pre-trained
    # checkpoint provided in the config should prepare successfully if the heads
    # are reset
    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task.set_reset_heads(True)
    with mock.patch(
        "classy_vision.tasks.fine_tuning_task.load_and_broadcast_checkpoint",
        return_value=copy.deepcopy(checkpoint),
    ):
        fine_tuning_task.prepare()

def save(self, stream):
    checkpoint_state = get_checkpoint_dict(self.task, self.input_args)
    checkpoint_state["advance_to_next_phase"] = self.advance_to_next_phase
    torch.save(checkpoint_state, stream)
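
# A minimal end-to-end sketch of the round trip the snippets above rely on:
# snapshot a task's state with get_checkpoint_dict, write it with torch.save,
# and restore it into a fresh task via set_checkpoint. Import paths and the
# on-disk file name are assumptions and may differ by Classy Vision version.
import torch

from classy_vision.generic.util import get_checkpoint_dict  # assumed import path
from classy_vision.tasks import build_task

config = get_fast_test_task_config()  # same test config helper used in the tests above
task = build_task(config)
task.prepare()

# snapshot the task state and write it to disk
torch.save(get_checkpoint_dict(task, {}, deep_copy=True), "/tmp/checkpoint.torch")

# restore the snapshot into a fresh task built from the same config
task_2 = build_task(config)
task_2.set_checkpoint(torch.load("/tmp/checkpoint.torch", map_location="cpu"))
task_2.prepare()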