def test_config_object(self):
    # test that we can switch from zero2 to zero3 in the same process, for example
    # test is_zero, etc.
    output_dir = self.get_auto_remove_tmp_dir()
    kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)

    ds_config_zero3_dict = self.get_config_dict("zero3")
    ds_config_zero2_dict = self.get_config_dict("zero2")

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
        self.assertTrue(is_deepspeed_zero3_enabled())

        # test we can repeat that, and this time train as well
        trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
        trainer.train()
        self.assertTrue(is_deepspeed_zero3_enabled())

        # test zero3 is disabled
        trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs)
        self.assertFalse(is_deepspeed_zero3_enabled())

        # check config obj
        config = deepspeed_config()
        self.assertTrue(bool(config), "Deepspeed config should be accessible")

        del trainer
        # now the weakref should gc the global and we shouldn't get anything here
        config = deepspeed_config()
        self.assertFalse(is_deepspeed_zero3_enabled())
        self.assertFalse(bool(config), "Deepspeed config should not be accessible")
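
# How the weakref-global pattern exercised above can work - an illustrative, self-contained
# sketch (hypothetical names, not the actual transformers internals): the live config is held
# only through a weak reference, so deleting its owner (the Trainer) empties the accessor.
def _weakref_config_sketch():
    import weakref

    class _Cfg:
        config = {"zero_optimization": {"stage": 3}}

    owner = _Cfg()
    ref = weakref.ref(owner)  # what the integration would store at module level

    def config_accessor():
        return ref().config if ref() is not None else None

    assert config_accessor() is not None  # accessible while the owner lives
    del owner
    assert config_accessor() is None  # gc'd together with its owner, as in the test above
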
def test_gradient_accumulation(self, stage):
    # this test measures that we get identical weights and similar loss with:
    # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1
    # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2
    # since the 2nd should produce the same effective batch as the 1st, with the same results
    #
    # I can get an identical loss for a small train_len=32, plus the power of the initial
    # dynamic loss scale value set to:
    #   "fp16.initial_scale_power": 1
    # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file,
    # but for some reason going to train_len=64 the weights start to mismatch with this setup.
    # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical

    train_len = 64
    a = b = 0.0

    with mockenv_context(**self.dist_env_1_gpu):
        no_grad_accum_trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=train_len,
            fp16=True,
            deepspeed=self.get_config_dict(stage),
            per_device_train_batch_size=8,
            gradient_accumulation_steps=1,
        )
        no_grad_accum_result = no_grad_accum_trainer.train()
        no_grad_accum_loss = no_grad_accum_result.training_loss
        no_grad_accum_a = no_grad_accum_trainer.model.a.item()
        no_grad_accum_b = no_grad_accum_trainer.model.b.item()
        # make sure the optimizer kicked in - if `a` hasn't changed from its original value
        # then make train_len bigger
        self.assertNotEqual(no_grad_accum_a, a)

    with mockenv_context(**self.dist_env_1_gpu):
        yes_grad_accum_trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=train_len,
            fp16=True,
            deepspeed=self.get_config_dict(stage),
            per_device_train_batch_size=4,
            gradient_accumulation_steps=2,
        )
        yes_grad_accum_result = yes_grad_accum_trainer.train()
        yes_grad_accum_loss = yes_grad_accum_result.training_loss
        yes_grad_accum_a = yes_grad_accum_trainer.model.a.item()
        yes_grad_accum_b = yes_grad_accum_trainer.model.b.item()
        self.assertNotEqual(yes_grad_accum_a, a)

    # training with half the batch size but 2 accumulation steps should give the same
    # weights, but we sometimes still see a slight difference of 1e-6
    self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
    self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)

    # see the note above on how to get an identical loss on a small bs
    self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)
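
# The equivalence the test above relies on, in one line: everything else being equal, the
# effective batch size is per_device_train_batch_size * gradient_accumulation_steps * world_size,
# so bs=8/accum=1 and bs=4/accum=2 see the same samples per optimizer step.
# A minimal sanity check (hypothetical helper, not part of the test suite):
def _effective_train_batch_size(per_device_bs, grad_accum_steps, world_size=1):
    return per_device_bs * grad_accum_steps * world_size


assert _effective_train_batch_size(8, 1) == _effective_train_batch_size(4, 2) == 8
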
def test_can_resume_training_normal(self, stage):
    # adapted from TrainerIntegrationTest.test_can_resume_training
    # test normal resume for each stage separately; error-handling is tested in a different test
    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = self.get_config_dict(stage)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    if stage == ZERO3:
        ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

    kwargs = dict(
        output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, fp16=True, deepspeed=ds_config_dict
    )

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(output_dir, "checkpoint-5")

        # Reinitialize trainer
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)

        # Now check with a later checkpoint that it also works when we span over one epoch
        checkpoint = os.path.join(output_dir, "checkpoint-15")

        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
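
# Note on the ZERO3-only flag set above: under ZeRO-3 the fp16 weights live sharded across
# ranks, so without "stage3_gather_fp16_weights_on_model_save": true the saved model file
# would be incomplete; the flag makes DeepSpeed gather the full fp16 weights onto the saving
# rank first, which is what allows the resumed run to reproduce identical `a` and `b` here.
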
def test_fake_notebook_no_launcher(self):
    # this setup emulates a notebook where a launcher needs to be emulated by hand
    with CaptureStd() as cs:  # noqa
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
            trainer.train()
def test_load_state_dict_from_zero_checkpoint(self, stage):
    # test that we can load fp32 weights directly from the zero checkpoint into the current model

    output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False, before=False)

    ds_config_dict = self.get_config_dict(stage)

    kwargs = dict(
        output_dir=output_dir,
        train_len=4,
        per_device_train_batch_size=4,
        num_train_epochs=1,
        save_strategy="steps",
        save_steps=1,
        learning_rate=0.1,
        fp16=True,
        deepspeed=ds_config_dict,
    )

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint_dir = get_last_checkpoint(output_dir)
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)

        (a1, b1) = model.a.item(), model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
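
# The same fp32 reconstruction works outside the Trainer too - a hedged sketch using
# deepspeed's documented zero_to_fp32 utilities (`model` and `checkpoint_dir` are
# placeholders here, and this helper is illustrative, not part of the test suite):
def _extract_fp32_weights_sketch(model, checkpoint_dir):
    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint

    # consolidates the sharded ZeRO partitions found under checkpoint_dir and loads
    # the resulting full fp32 state_dict into `model`, returning the model
    return load_state_dict_from_zero_checkpoint(model, checkpoint_dir)
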
def test_can_resume_training_errors(self, stage):
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_dict = self.get_config_dict(stage)
        output_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=output_dir, fp16=True, deepspeed=ds_config_dict)

        # 1. fail to find any checkpoint - due to a fresh output_dir
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=True)
        self.assertTrue(
            "No valid checkpoint found in output directory" in str(context.exception),
            f"got exception: {context.exception}",
        )

        # 2. fail to find a bogus checkpoint
        with self.assertRaises(Exception) as context:
            checkpoint = os.path.join(output_dir, "checkpoint-5")
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertTrue(
            "Can't find a valid checkpoint at" in str(context.exception),
            f"got exception: {context.exception}",
        )
def test_early_get_last_lr(self, stage):
    # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
    # not run for the first few dozen steps while the loss scale is too large, and thus during
    # that warm up stage `get_last_lr` will fail if called.
    #
    # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
    # `self.lr_scheduler.get_last_lr()`, and originally it'd fail on the very first step.
    with mockenv_context(**self.dist_env_1_gpu):
        a = b = 0.0
        trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=8,
            fp16=True,
            deepspeed=self.get_config_dict(stage),
            per_device_train_batch_size=8,
            logging_steps=1,
        )
        trainer.train()
        post_train_a = trainer.model.a.item()

        # XXX: for some reason the following check fails with zero3 - not broken, but a
        # qualitatively different outcome - as if the optimizer did run
        # oddly, both a and b go from 0.0 to 1.0 - there is a bug somewhere
        # print(trainer.model.a.item())
        # print(trainer.model.b.item())
        # need to investigate at some point
        if stage == ZERO3:
            return

        # it's enough that train didn't fail for this test, but we must check that the
        # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
        self.assertEqual(post_train_a, a)
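
# Why `get_last_lr` can fail early - a schematic of dynamic loss scaling (hypothetical
# pseudo-logic, not deepspeed's actual code): while the loss scale is too large every step
# overflows, optimizer.step() and scheduler.step() are skipped, and the scheduler has no
# last lr to report yet.
def _loss_scale_step_sketch(optimizer, scheduler, grads_overflowed, loss_scale):
    if grads_overflowed:
        # skip the step and halve the scale instead - the scheduler never advances,
        # so scheduler.get_last_lr() has nothing to return yet
        return loss_scale / 2
    optimizer.step()
    scheduler.step()  # only now does get_last_lr() become valid
    return loss_scale
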
def test_fake_notebook_no_launcher(self):
    # this setup emulates a notebook where a launcher needs to be emulated by hand
    with CaptureStd() as cs:
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
            trainer.train()
    assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
def test_fake_notebook_no_launcher(self):
    sys.path.append(self.tests_dir_str)
    from test_trainer import get_regression_trainer

    del sys.path[-1]  # restore
    ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
    with CaptureStd() as cs:
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_file)
        trainer.train()
    assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
def test_hf_ds_config_mismatch(self):
    ds_config = self.get_config_dict(ZERO2)

    # Purposefully configure these values to mismatch TrainingArguments values.
    # This currently doesn't cover all keys (but it could)
    per_device_train_batch_size = 2
    ds_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size + 2

    ds_config["train_batch_size"] = 1000

    gradient_accumulation_steps = 2
    ds_config["gradient_accumulation_steps"] = gradient_accumulation_steps + 2

    max_grad_norm = 1.0
    ds_config["gradient_clipping"] = max_grad_norm + 0.1

    adam_beta1, adam_beta2 = 0.9, 0.99
    ds_config["optimizer"]["params"]["betas"] = [adam_beta1 - 0.1, adam_beta2 - 0.1]

    fp16 = True
    ds_config["fp16"]["enabled"] = not fp16

    keys = [
        "per_device_train_batch_size",
        "train_batch_size",
        "gradient_accumulation_steps",
        "max_grad_norm",
        "betas",
        "fp16",
    ]

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(
            local_rank=0,
            fp16=fp16,
            deepspeed=ds_config,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            max_grad_norm=max_grad_norm,
            adam_beta1=adam_beta1,
            adam_beta2=adam_beta2,
        )
        with self.assertRaises(Exception) as context:
            trainer.train()

    for key in keys:
        self.assertTrue(
            key in str(context.exception),
            f"{key} is not in the exception message:\n{context.exception}",
        )
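
# The inverse of the mismatch above: the HF Trainer integration lets the ds_config defer
# to TrainingArguments via the special "auto" value, e.g. (a trimmed, illustrative sketch
# of a zero2-style config, not the full file):
_ds_config_auto_sketch = {
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "fp16": {"enabled": "auto"},
    "optimizer": {"type": "AdamW", "params": {"lr": "auto", "betas": "auto"}},
}
# with "auto" everywhere there is nothing to mismatch, so the consistency check the test
# above exercises cannot trip.
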
def test_hf_scheduler_ds_optimizer(self):
    a = 0
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_zero2_dict = self.get_config_dict(ZERO2)
        del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
        ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
        ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
        trainer.train()
    new_a = trainer.model.a.item()
    self.assertNotEqual(new_a, a)
def test_hf_optimizer_with_offload(self, stage):
    # non-DS optimizers can be used with ZeRO-offload, as long as they have both CPU and
    # GPU implementations (LAMB being the exception)
    ds_config_dict = self.get_config_dict(stage)
    del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
    # force cpu offload
    ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
        with CaptureLogger(deepspeed_logger) as cl:
            trainer.train()
        self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
def test_hf_scheduler_ds_optimizer(self):
    # this combo is not possible at the moment
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_zero2_dict = self.get_config_dict(ZERO2)
        del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
        ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
        ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception))
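
# Context for the failure asserted above (as stated by the integration's own error message):
# when DeepSpeed instantiates its optimizer it also drives the stepping internally, so an
# HF-created scheduler has no torch optimizer to attach to - at the time this test was
# written, only HF scheduler + HF optimizer, DS scheduler + DS optimizer, and
# DS scheduler + HF optimizer were wired up.
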
def test_fake_notebook_no_launcher(self, stage):
    # this setup emulates a notebook where a launcher needs to be emulated by hand

    # note that unittest resets sys.stdout on each test, so `CaptureStd` would capture the
    # DeepSpeed log if this test happened to run first in this pytest worker. But it would fail
    # if it's not run as the first test, as `sys.stdout` would no longer be the same. So we either
    # have to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from
    # the deepspeed_logger.
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage))
        with CaptureLogger(deepspeed_logger) as cl:
            trainer.train()
        self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
def test_stage3_nvme_offload(self):
    with mockenv_context(**self.dist_env_1_gpu):
        # this actually doesn't have to be on NVMe, any storage will do since this test only
        # runs a simple check that we can use some directory as if it were NVMe
        nvme_path = self.get_auto_remove_tmp_dir()
        nvme_config = dict(device="nvme", nvme_path=nvme_path)
        ds_config_zero3_dict = self.get_config_dict(ZERO3)
        ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
        ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict)
        with CaptureLogger(deepspeed_logger) as cl:
            trainer.train()
        self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
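
# For a real NVMe setup the offload sections usually carry tuning knobs beyond the two keys
# used above - a hedged sketch based on DeepSpeed's documented options (path and values are
# illustrative, not recommendations):
_nvme_offload_sketch = {
    "zero_optimization": {
        "offload_optimizer": {"device": "nvme", "nvme_path": "/local_nvme", "pin_memory": True},
        "offload_param": {"device": "nvme", "nvme_path": "/local_nvme", "pin_memory": True, "buffer_count": 5},
    },
    # async-io settings that gate NVMe read/write throughput
    "aio": {
        "block_size": 262144,
        "queue_depth": 32,
        "thread_count": 1,
        "single_submit": False,
        "overlap_events": True,
    },
}
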
def test_hf_optimizer_with_offload(self, stage):
    # must not allow a non-DS optimizer when using ZeRO-offload
    ds_config_dict = self.get_config_dict(stage)
    del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
    # force cpu offload
    ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertIn(
            "ZeRO Offload can only work with DeepSpeed optimizers",
            str(context.exception),
            f"got exception: {context.exception}",
        )
def test_ds_scheduler_hf_optimizer(self):
    a = 0
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_dict = deepcopy(self.ds_config_dict)
        del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
        ds_config_dict["zero_optimization"]["cpu_offload"] = False
        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_dict)
        trainer.train()
    new_a = trainer.model.a.item()
    self.assertNotEqual(new_a, a)
def test_hf_optimizer_with_offload(self):
    # must not allow a non-DS optimizer when using ZeRO-offload
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_zero2_dict = self.get_config_dict(ZERO2)
        del ds_config_zero2_dict["optimizer"]  # force default HF Trainer optimizer
        ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True
        # sanity check - should the default config change
        assert (
            "cpu_offload" in ds_config_zero2_dict["zero_optimization"]
            and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True
        ), "ensure the config is set up correctly"
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception))
def test_fp32(self, stage):
    ds_config_dict = self.get_config_dict(stage)
    ds_config_dict["fp16"]["enabled"] = False  # force non-fp16 mode

    # XXX: do we go via from_pretrained in zero 3 here? need to test zero.Init(dtype=torch.float)

    # XXX: rewrite this test once fp32 is supported by DeepSpeed
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertIn(
            "ZeRO is only supported if fp16 is enabled",
            str(context.exception),
            f"got exception: {context.exception}",
        )
def test_save_checkpoints(self):
    # adapted from TrainerIntegrationTest.test_save_checkpoints

    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = deepcopy(self.ds_config_dict)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    freq = 5

    # save checkpoints
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(
            output_dir=output_dir,
            save_steps=freq,
            deepspeed=ds_config_dict,
        )
        trainer.train()

    total = int(self.n_epochs * 64 / self.batch_size)
    self.check_saved_checkpoints_deepspeed(output_dir, freq, total)
def test_save_checkpoints(self, stage):
    # adapted from TrainerIntegrationTest.test_save_checkpoints

    freq = 5
    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = self.get_config_dict(stage)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    if stage == ZERO3:
        ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

    # save checkpoints
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(
            output_dir=output_dir,
            save_steps=freq,
            deepspeed=ds_config_dict,
        )
        trainer.train()

    total = int(self.n_epochs * 64 / self.batch_size)
    self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage)
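
# What a saved DeepSpeed checkpoint roughly looks like on disk, for reference when reading
# check_saved_checkpoints_deepspeed (a sketch from a single-gpu zero2-style run; exact
# filenames vary by stage and world size):
#
#   output_dir/
#     checkpoint-5/
#       config.json, trainer_state.json, training_args.bin, ...
#       zero_to_fp32.py                      # consolidation script DeepSpeed drops in
#       global_step5/
#         mp_rank_00_model_states.pt
#         zero_pp_rank_0_mp_rank_00_optim_states.pt
#     checkpoint-10/
#     ...
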
def test_early_get_last_lr(self):
    # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
    # not run for the first few dozen steps while the loss scale is too large, and thus during
    # that warm up stage `get_last_lr` will fail if called.
    #
    # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
    # `self.lr_scheduler.get_last_lr()`, and originally it'd fail on the very first step.
    with mockenv_context(**self.dist_env_1_gpu):
        a = b = 0.0
        trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=8,
            deepspeed=self.ds_config_file,
            per_device_train_batch_size=8,
            logging_steps=1,
        )
        trainer.train()
        no_grad_accum_a = trainer.model.a.item()

        # it's enough that train didn't fail for this test, but we must check that the
        # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
        self.assertEqual(no_grad_accum_a, a)
def test_can_resume_training(self):
    # adapted from TrainerIntegrationTest.test_can_resume_training

    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = deepcopy(self.ds_config_dict)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(output_dir, "checkpoint-5")

        # Reinitialize trainer
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)

        # Now check with a later checkpoint that it also works when we span over one epoch
        checkpoint = os.path.join(output_dir, "checkpoint-15")

        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)

        # Now check failures

        # 1. fail to find a bogus checkpoint
        trainer = get_regression_trainer(**kwargs)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertTrue("failed to resume from checkpoint" in str(context.exception))

        # 2. fail to find any checkpoint - due to a fresh output_dir
        output_dir2 = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=output_dir2, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=True)
        self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))