def test_test_only_task(self):
    """
    Tests the task in test mode by running train_steps to make sure
    the train_steps run as expected on a test_only task.
    """
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True

    # delete train dataset
    del test_config["dataset"]["train"]
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task.prepare()
    test_state = test_only_task.get_classy_state()

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # Verify task will run
    trainer = LocalTrainer()
    trainer.train(test_only_task)
def test_update_classy_model(self):
    """
    Tests that update_classy_model successfully updates a model from a
    checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config)
    trainer = LocalTrainer()
    trainer.train(task)

    for reset_heads in [False, True]:
        task_2 = build_task(config)
        # prepare task_2 for the right device
        task_2.prepare()
        update_classy_model(
            task_2.model, task.model.get_classy_state(deep_copy=True), reset_heads
        )
        self._compare_model_state(
            task.model.get_classy_state(),
            task_2.model.get_classy_state(),
            check_heads=not reset_heads,
        )
        if reset_heads:
            # the model head states should be different
            with self.assertRaises(Exception):
                self._compare_model_state(
                    task.model.get_classy_state(),
                    task_2.model.get_classy_state(),
                    check_heads=True,
                )
def test_logged_lr(self):
    # Mock LR scheduler
    def scheduler_mock(where):
        return where

    mock_lr_scheduler = mock.Mock(side_effect=scheduler_mock)
    mock_lr_scheduler.update_interval = UpdateInterval.STEP
    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 10
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.optimizer.param_schedulers["lr"] = mock_lr_scheduler
    trainer = LocalTrainer()

    # 2 LR updates per epoch over 3 epochs, so 6 logged LR values in total
    lr_order = [0.0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6]
    lr_list = []

    class LRLoggingHook(ClassyHook):
        on_end = ClassyHook._noop
        on_phase_end = ClassyHook._noop
        on_phase_start = ClassyHook._noop
        on_start = ClassyHook._noop

        def on_step(self, task):
            if task.train:
                lr_list.append(task.optimizer.parameters.lr)

    hook = LRLoggingHook()
    task.set_hooks([hook])
    trainer.train(task)

    self.assertEqual(lr_list, lr_order)
def test_one(self):
    train_dataset = MyDataset(
        batchsize_per_replica=32,
        shuffle=False,
        transform=GenericImageTransform(
            transform=transforms.Compose(
                [
                    transforms.RandomResizedCrop(224),
                    transforms.RandomHorizontalFlip(),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )
        ),
        num_samples=100,
        crop_size=224,
        class_ratio=0.5,
        seed=0,
    )

    test_dataset = MyDataset(
        batchsize_per_replica=32,
        shuffle=False,
        transform=GenericImageTransform(
            transform=transforms.Compose(
                [
                    transforms.Resize(256),
                    transforms.CenterCrop(224),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )
        ),
        num_samples=100,
        crop_size=224,
        class_ratio=0.5,
        seed=0,
    )

    model = MyModel()
    loss = MyLoss()
    optimizer = SGD(lr_scheduler=ConstantParamScheduler(0.01))

    task = (
        ClassificationTask()
        .set_model(model)
        .set_dataset(train_dataset, "train")
        .set_dataset(test_dataset, "test")
        .set_loss(loss)
        .set_optimizer(optimizer)
        .set_num_epochs(1)
    )

    trainer = LocalTrainer()
    trainer.train(task)
def test_train_parametric_loss(self):
    heads_num_classes = 100
    pre_train_config = self._get_pre_train_config(head_num_classes=heads_num_classes)
    pre_train_config["loss"] = {
        "name": "batchnorm_cross_entropy_loss",
        "num_classes": heads_num_classes,
    }
    pre_train_task = build_task(pre_train_config)
    trainer = LocalTrainer()
    trainer.train(pre_train_task)
    checkpoint = get_checkpoint_dict(pre_train_task, {})

    fine_tuning_config = self._get_fine_tuning_config(
        head_num_classes=heads_num_classes
    )
    fine_tuning_config["loss"] = {
        "name": "batchnorm_cross_entropy_loss",
        "num_classes": heads_num_classes,
    }

    fine_tuning_task = build_task(fine_tuning_config)
    fine_tuning_task._set_pretrained_checkpoint_dict(copy.deepcopy(checkpoint))
    # run in test mode to compare the loss state. Since we have a BatchNorm
    # module in the loss, its moving mean/std should be unchanged when we run
    # in test-only mode
    fine_tuning_task.set_test_only(True)
    loss_state = copy.deepcopy(fine_tuning_task.loss.get_classy_state())
    trainer.train(fine_tuning_task)
    self._compare_state_dict(loss_state, fine_tuning_task.loss.get_classy_state())
def test_logged_lr(self):
    class SchedulerMock(ClassyParamScheduler):
        def __call__(self, where):
            return where

    mock_lr_scheduler = SchedulerMock(UpdateInterval.STEP)
    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 10
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.set_optimizer_schedulers({"lr": mock_lr_scheduler})
    trainer = LocalTrainer()

    # 2 LR updates per epoch over 3 epochs, so 6 logged LR values in total
    lr_order = [0.0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6]
    lr_list = []

    class LRLoggingHook(ClassyHook):
        on_end = ClassyHook._noop
        on_phase_end = ClassyHook._noop
        on_phase_start = ClassyHook._noop
        on_start = ClassyHook._noop

        def on_step(self, task):
            if task.train:
                lr_list.append(task.optimizer.options_view.lr)

    hook = LRLoggingHook()
    task.set_hooks([hook])
    trainer.train(task)

    self.assertEqual(lr_list, lr_order)
def test_checkpointing(self):
    # make checkpoint directory
    checkpoint_folder = self.base_dir + "/checkpoint/"
    os.mkdir(checkpoint_folder)

    config = get_fast_test_task_config()
    cuda_available = torch.cuda.is_available()
    task = build_task(config)
    task.prepare(use_gpu=cuda_available)

    # create a checkpoint hook
    checkpoint_hook = CheckpointHook(checkpoint_folder, {}, phase_types=["train"])

    # call the hook's on_phase_end to save a checkpoint
    checkpoint_hook.on_phase_end(task)

    # we should be able to train a task using the checkpoint on all available
    # devices
    for use_gpu in {False, cuda_available}:
        # load the checkpoint
        checkpoint = load_checkpoint(checkpoint_folder)

        # create a new task
        task = build_task(config)

        # set the checkpoint
        task.set_checkpoint(checkpoint)
        task.prepare(use_gpu=use_gpu)

        # we should be able to run the trainer using the checkpoint
        trainer = LocalTrainer(use_gpu=use_gpu)
        trainer.train(task)
def test_logged_lr(self):
    # Mock LR scheduler
    def scheduler_mock(where):
        return where

    mock_lr_scheduler = mock.Mock(side_effect=scheduler_mock)
    mock_lr_scheduler.update_interval = UpdateInterval.STEP
    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 5
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)
    task.optimizer.lr_scheduler = mock_lr_scheduler
    trainer = LocalTrainer()

    # 2 LR updates per epoch; at the end of each train epoch, the LR is
    # logged an additional time
    lr_order = [0.0, 1 / 6, 1 / 6, 2 / 6, 3 / 6, 3 / 6, 4 / 6, 5 / 6, 5 / 6]
    lr_list = []

    def mock_log_lr(task: ClassyTask, local_variables) -> None:
        lr_list.append(task.optimizer.lr)

    with mock.patch.object(
        LossLrMeterLoggingHook, "_log_lr", side_effect=mock_log_lr
    ):
        hook = LossLrMeterLoggingHook(1)
        task.set_hooks([hook])
        trainer.train(task)
        self.assertEqual(lr_list, lr_order)
def train(datasets, model, loss, optimizer, meters, args):
    task = (
        ClassificationTask()
        .set_num_epochs(args.num_epochs)
        .set_loss(loss)
        .set_model(model)
        .set_optimizer(optimizer)
        .set_meters(meters)
    )
    for phase in ["train", "test"]:
        task.set_dataset(datasets[phase], phase)

    hooks = [LossLrMeterLoggingHook(log_freq=args.print_freq)]

    # show progress
    hooks.append(ProgressBarHook())

    if not args.skip_tensorboard:
        try:
            from tensorboardX import SummaryWriter

            tb_writer = SummaryWriter(log_dir=args.video_dir + "/tensorboard")
            hooks.append(TensorboardPlotHook(tb_writer))
        except ImportError:
            print("tensorboardX not installed, skipping tensorboard hooks")

    checkpoint_dir = f"{args.video_dir}/checkpoint/classy_checkpoint_{time.time()}"
    os.mkdir(checkpoint_dir)
    hooks.append(CheckpointHook(checkpoint_dir, input_args={}))

    task = task.set_hooks(hooks)

    trainer = LocalTrainer(use_gpu=args.cuda, num_dataloader_workers=args.num_workers)
    trainer.train(task)
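# A minimal invocation sketch for the train() helper above, assuming the
# build_* helpers and AccuracyMeter used elsewhere in these tests; the
# Namespace fields simply mirror the attributes train() reads from args.
# This is illustrative, not the repo's actual entry point.
def main_sketch(config, video_dir):
    import argparse

    args = argparse.Namespace(
        num_epochs=1,
        print_freq=10,
        skip_tensorboard=True,
        video_dir=video_dir,
        cuda=torch.cuda.is_available(),
        num_workers=0,
    )
    datasets = {
        split: build_dataset(config["dataset"][split]) for split in ["train", "test"]
    }
    train(
        datasets,
        build_model(config["model"]),
        build_loss(config["loss"]),
        build_optimizer(config["optimizer"]),
        [AccuracyMeter(topk=[1])],
        args,
    )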
def test_training(self):
    # Apex AMP training with opt_level O2; assumes a CUDA device is available
    config = get_fast_test_task_config()
    config["amp_args"] = {"opt_level": "O2"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)
def test_hook(self):
    task = self._build_task(num_epochs=3)

    lr_list = []

    class TestHook(ClassyHook):
        on_rendezvous = ClassyHook._noop
        on_start = ClassyHook._noop
        on_phase_start = ClassyHook._noop
        on_sample = ClassyHook._noop
        on_forward = ClassyHook._noop
        on_loss_and_meter = ClassyHook._noop
        on_backward = ClassyHook._noop
        on_phase_end = ClassyHook._noop
        on_end = ClassyHook._noop

        def on_update(self, task: ClassyTask, local_variables) -> None:
            lr_list.append(task.optimizer.lr)

    task.set_hooks([TestHook()])

    def scheduler_mock(where):
        return where

    mock = Mock(side_effect=scheduler_mock)
    mock.update_interval = UpdateInterval.STEP
    task.optimizer.lr_scheduler = mock

    trainer = LocalTrainer()
    trainer.train(task)

    # We have 10 samples, batch size is 5. Each epoch is done in two steps.
    self.assertEqual(lr_list, [0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6])
def test_final_train_checkpoint(self):
    """Test that a train phase checkpoint with a where of 1.0 can be loaded"""
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks(
        [CheckpointHook(self.base_dir, {}, phase_types=["train"])]
    )
    task_2 = build_task(config)

    use_gpu = torch.cuda.is_available()

    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(task)

    # load the final train checkpoint
    checkpoint = load_checkpoint(self.base_dir)

    # make sure fetching the where raises an exception, which means that
    # where is >= 1.0
    with self.assertRaises(Exception):
        task.where

    # set task_2's state as task's final train checkpoint
    task_2.set_checkpoint(checkpoint)
    task_2.prepare(use_gpu=use_gpu)

    # we should be able to train the task
    trainer.train(task_2)
def test_hook(self):
    task = self._build_task(num_epochs=3)

    lr_list = []
    weight_decay_list = []
    momentum_list = []

    test_instance = self

    class TestHook(ClassyHook):
        on_rendezvous = ClassyHook._noop
        on_start = ClassyHook._noop
        on_phase_start = ClassyHook._noop
        on_sample = ClassyHook._noop
        on_forward = ClassyHook._noop
        on_loss_and_meter = ClassyHook._noop
        on_backward = ClassyHook._noop
        on_phase_end = ClassyHook._noop
        on_end = ClassyHook._noop

        def on_update(self, task: ClassyTask, local_variables) -> None:
            # make sure we have non-zero param groups
            test_instance.assertGreater(
                len(task.optimizer.optimizer.param_groups), 0
            )
            # test that our overrides work on the underlying PyTorch optimizer
            for param_group in task.optimizer.optimizer.param_groups:
                test_instance.assertEqual(
                    param_group["lr"], task.optimizer.parameters.lr
                )
                test_instance.assertEqual(
                    param_group["weight_decay"],
                    task.optimizer.parameters.weight_decay,
                )
                test_instance.assertEqual(
                    param_group["momentum"], task.optimizer.parameters.momentum
                )
                lr_list.append(param_group["lr"])
                weight_decay_list.append(param_group["weight_decay"])
                momentum_list.append(param_group["momentum"])

    task.set_hooks([TestHook()])

    def scheduler_mock(where):
        return where

    mock = Mock(side_effect=scheduler_mock)
    mock.update_interval = UpdateInterval.STEP
    task.optimizer.param_schedulers["lr"] = mock

    trainer = LocalTrainer()
    trainer.train(task)

    # We have 10 samples, batch size is 5. Each epoch takes two steps. So,
    # there will be a total of 6 steps.
    # the lr scheduler uses a step update interval
    self.assertEqual(lr_list, [0 / 6, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6])
    # the weight decay scheduler uses an epoch update interval
    self.assertEqual(
        weight_decay_list, [0 / 6, 0 / 6, 4 / 6, 4 / 6, 8 / 6, 8 / 6]
    )
    self.assertEqual(momentum_list, [0.9, 0.9, 0.9, 0.9, 0.9, 0.9])
def test_no_param_schedulers(self):
    task = self._build_task(num_epochs=3, skip_param_schedulers=True)

    # there should be no param schedulers
    self.assertEqual(task.optimizer.param_schedulers, {})

    # we should still be able to train the task
    trainer = LocalTrainer()
    trainer.train(task)
def test_train_step(self):
    # test that the model can be run in a train step
    model = models.resnet34(pretrained=False)
    classy_model = ClassyModelWrapper(model)

    config = get_fast_test_task_config()
    task = build_task(config)
    task.set_model(classy_model)
    trainer = LocalTrainer()
    trainer.train(task)
def test_logged_lr(self):
    # Mock LR scheduler
    class SchedulerMock(ClassyParamScheduler):
        def __call__(self, where):
            return where

    mock_lr_scheduler = SchedulerMock(UpdateInterval.STEP)

    # Mock Logging
    class DummySummaryWriter(object):
        def __init__(self):
            self.scalar_logs = {}

        def add_scalar(self, key, value, global_step=None, walltime=None) -> None:
            self.scalar_logs[key] = self.scalar_logs.get(key, []) + [value]

        def add_histogram(self, key, value, global_step=None, walltime=None) -> None:
            return

        def add_text(self, *args, **kwargs):
            pass

        def flush(self):
            return

    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 10
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)

    writer = DummySummaryWriter()
    hook = TensorboardPlotHook(writer)
    hook.log_period = 1
    task.set_hooks([hook])
    task.set_optimizer_schedulers({"lr": mock_lr_scheduler})

    trainer = LocalTrainer()
    trainer.train(task)

    # We have 20 samples, batch size is 10. Each epoch is done in two steps.
    self.assertEqual(
        writer.scalar_logs["Learning Rate/train"],
        [0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6],
    )
def test_update_classy_state(self):
    """
    Tests that update_classy_state successfully updates a task's state from a
    checkpoint.
    """
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)
    task_2.prepare()

    trainer = LocalTrainer()
    trainer.train(task)
    update_classy_state(task_2, task.get_classy_state(deep_copy=True))
    self._compare_states(task.get_classy_state(), task_2.get_classy_state())
def test_checkpointing_different_device(self):
    config = get_fast_test_task_config()
    task = build_task(config)
    task_2 = build_task(config)

    for use_gpu in [True, False]:
        task.prepare(use_gpu=use_gpu)

        # set task's state as task_2's checkpoint
        task_2.set_checkpoint(get_checkpoint_dict(task, {}, deep_copy=True))

        # we should be able to run the trainer using state from a different device
        trainer = LocalTrainer(use_gpu=not use_gpu)
        trainer.train(task_2)
def test_train(self) -> None:
    for use_gpu in {False, torch.cuda.is_available()}:
        folder = f"{self.base_dir}/train_test/{use_gpu}"
        os.makedirs(folder)

        task = build_task(get_fast_test_task_config(head_num_classes=2))
        csv_hook = OutputCSVHook(folder)
        task.set_hooks([csv_hook])
        task.set_use_gpu(use_gpu)

        trainer = LocalTrainer()
        trainer.train(task)

        self.assertEqual(parse_csv(csv_hook.output_path), 10)
def test_training(self):
    # Test an Apex AMP training
    config = get_fast_test_task_config()
    config["amp_args"] = {"opt_level": "O2"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)

    # Test a PyTorch AMP training
    config["amp_args"] = {"amp_type": "pytorch"}
    task = build_task(config)
    task.set_use_gpu(True)
    trainer = LocalTrainer()
    trainer.train(task)
def test_train_only_task(self):
    """
    Tests that the task runs when only a train dataset is specified.
    """
    test_config = get_fast_test_task_config()

    # delete the test dataset from the config
    del test_config["dataset"]["test"]

    task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])
    task.prepare()

    # verify that the task can still be trained
    trainer = LocalTrainer()
    trainer.train(task)
def train_with_clipped_gradients(amp_args=None):
    task = build_task(get_fast_test_task_config())
    task.set_num_epochs(1)
    task.set_model(SimpleModel())
    task.set_loss(SimpleLoss())
    task.set_meters([])
    task.set_use_gpu(torch.cuda.is_available())
    task.set_clip_grad_norm(0.5)
    task.set_amp_args(amp_args)
    task.set_optimizer(SGD(lr=1))

    trainer = LocalTrainer()
    trainer.train(task)

    return task.model.param.grad.norm()
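# A hedged companion sketch for train_with_clipped_gradients() above: because
# the task sets clip_grad_norm to 0.5, the returned gradient norm should not
# exceed that threshold (up to floating-point error). The assertion style and
# tolerance here are assumptions for illustration, not the repo's actual test.
def test_clipped_gradient_norm_sketch(self):
    grad_norm = train_with_clipped_gradients(amp_args=None)
    self.assertLessEqual(grad_norm.item(), 0.5 + 1e-5)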
def test_param_scheduler_epoch(self):
    task = self._build_task(num_epochs=3)

    where_list = []

    def scheduler_mock(where):
        where_list.append(where)
        return 0.1

    mock = Mock(side_effect=scheduler_mock)
    mock.update_interval = UpdateInterval.EPOCH
    task.optimizer.param_schedulers["lr"] = mock

    trainer = LocalTrainer()
    trainer.train(task)

    self.assertEqual(where_list, [0, 1 / 3, 2 / 3])
def test_param_scheduler_epoch(self):
    task = self._build_task(num_epochs=3)

    where_list = []

    class SchedulerMock(ClassyParamScheduler):
        def __call__(self, where):
            where_list.append(where)
            return 0.1

    mock = SchedulerMock(UpdateInterval.EPOCH)
    task.set_optimizer_schedulers({"lr": mock})

    trainer = LocalTrainer()
    trainer.train(task)

    self.assertEqual(where_list, [0, 1 / 3, 2 / 3])
def test_test_only_checkpointing(self):
    """
    Tests checkpointing by running train_steps to make sure the
    train_steps run the same way after loading from a training
    task checkpoint on a test_only task.
    """
    train_config = get_fast_test_task_config()
    train_config["num_epochs"] = 10
    test_config = get_fast_test_task_config()
    test_config["test_only"] = True
    train_task = build_task(train_config).set_hooks([LossLrMeterLoggingHook()])
    test_only_task = build_task(test_config).set_hooks([LossLrMeterLoggingHook()])

    use_gpu = torch.cuda.is_available()

    # prepare the tasks for the right device
    train_task.prepare(use_gpu=use_gpu)

    # test in both train and test mode
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(train_task)

    # use the train task's state as the test-only task's checkpoint
    test_only_task.set_checkpoint(get_checkpoint_dict(train_task, {}, deep_copy=True))
    test_only_task.prepare(use_gpu=use_gpu)
    test_state = test_only_task.get_classy_state()

    # We expect the phase idx to be different for a test only task
    self.assertEqual(test_state["phase_idx"], -1)

    # We expect that test only state is test, no matter what train state is
    self.assertFalse(test_state["train"])

    # Num updates should be 0
    self.assertEqual(test_state["num_updates"], 0)

    # train_phase_idx should be -1
    self.assertEqual(test_state["train_phase_idx"], -1)

    # Verify task will run
    trainer = LocalTrainer(use_gpu=use_gpu)
    trainer.train(test_only_task)
def test_param_scheduler_step(self):
    task = self._build_task(num_epochs=3)

    where_list = []

    def scheduler_mock(where):
        where_list.append(where)
        return 0.1

    mock = Mock(side_effect=scheduler_mock)
    mock.update_interval = UpdateInterval.STEP
    task.optimizer.param_schedulers["lr"] = mock

    trainer = LocalTrainer()
    trainer.train(task)

    # We have 10 samples, batch size is 5. Each epoch is done in two steps.
    self.assertEqual(where_list, [0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6])
def test_param_scheduler_step(self):
    task = self._build_task(num_epochs=3)

    where_list = []

    class SchedulerMock(ClassyParamScheduler):
        def __call__(self, where):
            where_list.append(where)
            return 0.1

    mock = SchedulerMock(UpdateInterval.STEP)
    task.set_optimizer_schedulers({"lr": mock})

    trainer = LocalTrainer()
    trainer.train(task)

    # We have 10 samples, batch size is 5. Each epoch is done in two steps.
    # The first call is the initialization
    self.assertEqual(where_list, [0, 0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6])
def test_training(self):
    """Checks we can train a small MLP model."""
    config = get_test_mlp_task_config()
    task = (
        ClassificationTask()
        .set_num_epochs(10)
        .set_loss(build_loss(config["loss"]))
        .set_model(build_model(config["model"]))
        .set_optimizer(build_optimizer(config["optimizer"]))
        .set_meters([AccuracyMeter(topk=[1])])
        .set_hooks([LossLrMeterLoggingHook()])
    )
    for split in ["train", "test"]:
        dataset = build_dataset(config["dataset"][split])
        task.set_dataset(dataset, split)

    self.assertTrue(task is not None)

    trainer = LocalTrainer()
    trainer.train(task)

    accuracy = task.meters[0].value["top_1"]
    self.assertAlmostEqual(accuracy, 1.0)
def test_logged_lr(self):
    # Mock LR scheduler
    def scheduler_mock(where):
        return where

    mock_lr_scheduler = mock.Mock(side_effect=scheduler_mock)
    mock_lr_scheduler.update_interval = UpdateInterval.STEP

    # Mock Logging
    class DummySummaryWriter(object):
        def __init__(self):
            self.scalar_logs = {}

        def add_scalar(self, key, value, global_step=None, walltime=None) -> None:
            self.scalar_logs[key] = self.scalar_logs.get(key, []) + [value]

        def flush(self):
            return

    config = get_test_mlp_task_config()
    config["num_epochs"] = 3
    config["dataset"]["train"]["batchsize_per_replica"] = 5
    config["dataset"]["test"]["batchsize_per_replica"] = 5
    task = build_task(config)

    writer = DummySummaryWriter()
    hook = TensorboardPlotHook(writer)
    task.set_hooks([hook])
    task.optimizer.param_schedulers["lr"] = mock_lr_scheduler

    trainer = LocalTrainer()
    trainer.train(task)

    # We have 10 samples, batch size is 5. Each epoch is done in two steps.
    self.assertEqual(
        writer.scalar_logs["train_learning_rate_updates"],
        [0, 1 / 6, 2 / 6, 3 / 6, 4 / 6, 5 / 6],
    )
def test_final_train_checkpoint(self):
    """Test that a train phase checkpoint with a where of 1.0 can be loaded"""
    config = get_fast_test_task_config()
    task = build_task(config).set_hooks(
        [CheckpointHook(self.base_dir, {}, phase_types=["train"])]
    )
    task_2 = build_task(config)

    task.set_use_gpu(torch.cuda.is_available())

    trainer = LocalTrainer()
    trainer.train(task)

    self.assertAlmostEqual(task.where, 1.0, delta=1e-3)

    # set task_2's state as task's final train checkpoint
    task_2.set_checkpoint(self.base_dir)
    task_2.prepare()

    # we should be able to train the task
    trainer.train(task_2)