def test_update_classy_model(self):
    """
    Tests that the update_classy_model successfully updates from a checkpoint
    """
    config = get_fast_test_task_config()
    task = build_task(config)
    trainer = LocalTrainer()
    trainer.train(task)

    for reset_heads in [False, True]:
        task_2 = build_task(config)
        # prepare task_2 for the right device
        task_2.prepare()
        update_classy_model(
            task_2.model, task.model.get_classy_state(deep_copy=True), reset_heads
        )
        self._compare_model_state(
            task.model.get_classy_state(),
            task_2.model.get_classy_state(),
            check_heads=not reset_heads,
        )
        if reset_heads:
            # the model head states should be different
            with self.assertRaises(Exception):
                self._compare_model_state(
                    task.model.get_classy_state(),
                    task_2.model.get_classy_state(),
                    check_heads=True,
                )
def test_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
    task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

    task.set_use_gpu(torch.cuda.is_available())

    # prepare the tasks for the right device
    task.prepare()

    # test in both train and test mode
    for _ in range(2):
        task.advance_phase()

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))
        task_2.prepare()

        # task 2 should have the same state
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

        # this tests that both states' iterators return the same samples
        sample = next(task.get_data_iterator())
        sample_2 = next(task_2.get_data_iterator())
        self._compare_samples(sample, sample_2)

        # test that the train step runs the same way on both states
        # and the loss remains the same
        task.train_step()
        task_2.train_step()
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_training(self):
    config = get_fast_test_task_config()
    config["amp_args"] = {"opt_level": "O2"}
    task = build_task(config)
    task.set_use_gpu(True)

    trainer = LocalTrainer()
    trainer.train(task)
def test_checkpointing(self):
    # make checkpoint directory
    checkpoint_folder = self.base_dir + "/checkpoint/"
    os.mkdir(checkpoint_folder)

    config = get_fast_test_task_config()
    cuda_available = torch.cuda.is_available()
    task = build_task(config)
    task.prepare(use_gpu=cuda_available)

    # create a checkpoint hook
    checkpoint_hook = CheckpointHook(checkpoint_folder, {}, phase_types=["train"])

    # call the on end phase function
    checkpoint_hook.on_phase_end(task)

    # we should be able to train a task using the checkpoint on all available
    # devices
    for use_gpu in {False, cuda_available}:
        # load the checkpoint
        checkpoint = load_checkpoint(checkpoint_folder)

        # create a new task
        task = build_task(config)

        # set the checkpoint
        task.set_checkpoint(checkpoint)

        task.prepare(use_gpu=use_gpu)

        # we should be able to run the trainer using the checkpoint
        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(task)
def test_test_only_task(self):
    """
    Tests the task in test mode by running train_steps to make sure the
    train_steps run as expected on a test_only task
    """
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True

    # delete train dataset
    del test_config["dataset"]["train"]
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task.prepare()
    test_state = test_only_task.get_classy_state()

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # Verify task will run
    trainer = LocalTrainer()
    trainer.train(test_only_task)
def test_final_train_checkpoint(self):
    """Test that a train phase checkpoint with a where of 1.0 can be loaded"""
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks(
        [CheckpointHook(self.base_dir, {}, phase_types=["train"])]
    )
    task_2 = build_task(config)

    use_gpu = torch.cuda.is_available()

    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(task)

    # load the final train checkpoint
    checkpoint = load_checkpoint(self.base_dir)

    # make sure fetching the where raises an exception, which means that
    # where is >= 1.0
    with self.assertRaises(Exception):
        task.where

    # set task_2's state as task's final train checkpoint
    task_2.set_checkpoint(checkpoint)
    task_2.prepare(use_gpu=use_gpu)

    # we should be able to train the task
    trainer.train(task_2)
def test_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks([LossLrMeterLoggingHook()])
    task_2 = build_task(config).set_hooks([LossLrMeterLoggingHook()])

    task.set_use_gpu(torch.cuda.is_available())

    # only train 1 phase at a time
    trainer = LimitedPhaseTrainer(num_phases=1)

    while not task.done_training():
        # set task's state as task_2's checkpoint
        task_2._set_checkpoint_dict(get_checkpoint_dict(task, {}, deep_copy=True))

        # task 2 should have the same state before training
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())

        # train for one phase
        trainer.train(task)
        trainer.train(task_2)

        # task 2 should have the same state after training
        self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_test_only_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a training
    task checkpoint on a test_only task.
    """
    train_config = get_fast_test_task_config()
    train_config["num_epochs"] = 10
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True
    train_task = build_task(train_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])

    use_gpu = torch.cuda.is_available()

    # prepare the tasks for the right device
    train_task.prepare(use_gpu=use_gpu)

    # test in both train and test mode
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(train_task)

    # set task's state as task_2's checkpoint
    test_only_task.set_checkpoint(get_checkpoint_dict(train_task, {}, deep_copy=True))
    test_only_task.prepare(use_gpu=use_gpu)
    test_state = test_only_task.get_classy_state()

    # We expect the phase idx to be different for a test only task
    self.assertEqual(test_state["phase_idx"], -1)

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # train_phase_idx should be -1
    self.assertEqual(test_state["train_phase_idx"], -1)

    # Verify task will run
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(test_only_task)
def test_train_step(self):
    # test that the model can be run in a train step
    model = models.resnet34(pretrained=False)
    classy_model = ClassyModelWrapper(model)

    config = get_fast_test_task_config()
    task = build_task(config)
    task.set_model(classy_model)
    trainer = LocalTrainer()
    trainer.train(task)
def test_synchronize_losses_when_losses_empty(self):
    config = get_fast_test_task_config()
    task = build_task(config)

    task.prepare()
    task.set_use_gpu(torch.cuda.is_available())

    # Losses should be empty when creating task
    self.assertEqual(len(task.losses), 0)

    task.synchronize_losses()
def _get_task_config(self):
    config = get_fast_test_task_config()
    config["optimizer"] = {
        "name": "zero",
        "base_optimizer": {"name": "sgd", "momentum": 0.9},
    }
    return config
def _get_fine_tuning_config(self, head_num_classes=100, pretrained_checkpoint=False):
    config = get_fast_test_task_config(head_num_classes=head_num_classes)
    config["name"] = "fine_tuning"
    config["num_epochs"] = 2

    if pretrained_checkpoint:
        config["pretrained_checkpoint"] = "/path/to/pretrained/checkpoint"

    return config
def test_synchronize_losses_non_distributed(self):
    """
    Tests that synchronize losses has no side effects in a
    non-distributed setting.
    """
    test_config = get_fast_test_task_config()
    task = build_task(test_config)
    task.prepare()

    old_losses = copy.deepcopy(task.losses)
    task.synchronize_losses()
    self.assertEqual(old_losses, task.losses)
def test_fp16_grad_compression(self):
    # there is no API defined to check that a DDP hook has been enabled, so we
    # just test that we set the right variables
    config = copy.deepcopy(get_fast_test_task_config())
    task = build_task(config)
    self.assertFalse(task.fp16_grad_compress)

    config.setdefault("distributed", {})
    config["distributed"]["fp16_grad_compress"] = True

    task = build_task(config)
    self.assertTrue(task.fp16_grad_compress)
def test_update_classy_state(self):
    """
    Tests that the update_classy_state successfully updates from a checkpoint
    """
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)
    task_2.prepare()

    trainer = LocalTrainer()
    trainer.train(task)
    update_classy_state(task_2, task.get_classy_state(deep_copy=True))
    self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_checkpointing_different_device(self):
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)

    for use_gpu in [True, False]:
        task.prepare(use_gpu=use_gpu)

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))

        # we should be able to run the trainer using state from a different device
        trainer = LocalTrainer(use_gpu=not use_gpu)
        trainer.train(task_2)
def test_training(self):
    # Test Apex AMP training
    config = get_fast_test_task_config()
    config["amp_args"] = {"opt_level": "O2"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)

    # Test PyTorch AMP training
    config["amp_args"] = {"amp_type": "pytorch"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)
def test_train_only_task(self):
    """
    Tests that the task runs when only a train dataset is specified.
    """
    test_config = get_fast_test_task_config()

    # delete the test dataset from the config
    del test_config["dataset"]["test"]

    task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    task.prepare()

    # verify that the task can still be trained
    trainer = LocalTrainer()
    trainer.train(task)
def test_train(self) -> None:
    for use_gpu in {False, torch.cuda.is_available()}:
        folder = f"{self.base_dir}/train_test/{use_gpu}"
        os.makedirs(folder)

        task = build_task(get_fast_test_task_config(head_num_classes=2))
        csv_hook = OutputCSVHook(folder)
        task.set_hooks([csv_hook])
        task.set_use_gpu(use_gpu)

        trainer = LocalTrainer()
        trainer.train(task)

        self.assertEqual(parse_csv(csv_hook.output_path), 10)
def train_with_clipped_gradients(amp_args=None):
    # train a simple model for one epoch with gradient clipping enabled and
    # return the resulting gradient norm of the model parameter
    task = build_task(get_fast_test_task_config())
    task.set_num_epochs(1)
    task.set_model(SimpleModel())
    task.set_loss(SimpleLoss())
    task.set_meters([])
    task.set_use_gpu(torch.cuda.is_available())
    task.set_clip_grad_norm(0.5)
    task.set_amp_args(amp_args)
    task.set_optimizer(SGD(lr=1))

    trainer = LocalTrainer()
    trainer.train(task)

    return task.model.param.grad.norm()
def test_clip_stateful_loss(self):
    config = get_fast_test_task_config()
    config["loss"] = {"name": "test_stateful_loss", "in_plane": 256}
    config["grad_norm_clip"] = grad_norm_clip = 1
    task = build_task(config)
    task.set_use_gpu(False)
    task.prepare()

    # set fake gradients with norm > grad_norm_clip
    for param in itertools.chain(
        task.base_model.parameters(), task.base_loss.parameters()
    ):
        param.grad = 1.1 + torch.rand(param.shape)
        self.assertGreater(param.grad.norm(), grad_norm_clip)

    task._clip_gradients(grad_norm_clip)

    for param in itertools.chain(
        task.base_model.parameters(), task.base_loss.parameters()
    ):
        self.assertLessEqual(param.grad.norm(), grad_norm_clip)
def test_final_train_checkpoint(self):
    """Test that a train phase checkpoint with a where of 1.0 can be loaded"""
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks(
        [CheckpointHook(self.base_dir, {}, phase_types=["train"])]
    )
    task_2 = build_task(config)

    task.set_use_gpu(torch.cuda.is_available())

    trainer = LocalTrainer()
    trainer.train(task)

    self.assertAlmostEqual(task.where, 1.0, delta=1e-3)

    # set task_2's state as task's final train checkpoint
    task_2.set_checkpoint(self.base_dir)
    task_2.prepare()

    # we should be able to train the task
    trainer.train(task_2)
def train_with_batch(self, simulated_bs, actual_bs, clip_grad_norm=None):
    # train a simple model for one epoch with the given per-replica and
    # simulated global batch sizes, and return the trained model parameter
    config = copy.deepcopy(get_fast_test_task_config())
    config["dataset"]["train"]["num_samples"] = 12
    config["dataset"]["train"]["batchsize_per_replica"] = actual_bs
    del config["dataset"]["test"]

    task = build_task(config)
    task.set_num_epochs(1)
    task.set_model(SimpleModel())
    task.set_loss(SimpleLoss())
    task.set_meters([])
    task.set_use_gpu(torch.cuda.is_available())
    if simulated_bs is not None:
        task.set_simulated_global_batchsize(simulated_bs)
    if clip_grad_norm is not None:
        task.set_clip_grad_norm(clip_grad_norm)
    task.set_optimizer(SGD(lr=1))

    trainer = LocalTrainer()
    trainer.train(task)

    return task.model.param
def test_training(self):
    config = get_fast_test_task_config()
    config["amp_opt_level"] = "O2"
    task = build_task(config)

    trainer = LocalTrainer(use_gpu=True)
    trainer.train(task)
def test_get_classy_state_on_loss(self):
    config = get_fast_test_task_config()
    config["loss"] = {"name": "test_stateful_loss", "in_plane": 256}
    task = build_task(config)
    task.prepare()
    self.assertIn("alpha", task.get_classy_state()["loss"])
def _get_fine_tuning_config(self, head_num_classes=1000):
    config = get_fast_test_task_config(head_num_classes=head_num_classes)
    config["name"] = "fine_tuning"
    config["num_epochs"] = 10
    return config
def _get_pre_train_config(self, head_num_classes=100):
    config = get_fast_test_task_config(head_num_classes=head_num_classes)
    config["num_epochs"] = 2
    return config