def test_hf_scheduler_hf_optimizer(self):
    # Strip both the "optimizer" and the "scheduler" sections from the ZeRO-2 config so that
    # the default HF Trainer optimizer and scheduler are used, then check that training
    # actually moved the `a` weight away from its initial value.
    initial_a = 0
    with mockenv_context(**self.dist_env_1_gpu):
        config = self.get_config_dict(ZERO2)
        # force default HF Trainer optimizer and scheduler
        for section in ("optimizer", "scheduler"):
            del config[section]
        offload_optimizer = config["zero_optimization"]["offload_optimizer"]
        offload_optimizer["device"] = "none"
        # force optimizer on the first step
        config["fp16"]["initial_scale_power"] = 1
        regression_trainer = get_regression_trainer(
            a=initial_a,
            local_rank=0,
            fp16=True,
            deepspeed=config,
        )
        regression_trainer.train()
    trained_a = regression_trainer.model.a.item()
    self.assertNotEqual(trained_a, initial_a)
def test_can_resume_training_normal(self, stage):
    # adapted from TrainerIntegrationTest.test_can_resume_training
    # Only the successful resume path is exercised here, separately for each stage;
    # error-handling is tested in a different test.
    tmp_dir = self.get_auto_remove_tmp_dir()
    config = self.get_config_dict(stage)
    # force optimizer on the first step
    config["fp16"]["initial_scale_power"] = 1
    if stage == ZERO3:
        config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

    kwargs = {
        "output_dir": tmp_dir,
        "train_len": 128,
        "save_steps": 5,
        "learning_rate": 0.1,
        "fp16": True,
        "deepspeed": config,
    }

    with mockenv_context(**self.dist_env_1_gpu):
        # reference run: train without resuming and record the final weights and state
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        expected_a = trainer.model.a.item()
        expected_b = trainer.model.b.item()
        expected_state = dataclasses.asdict(trainer.state)

        # first an early checkpoint, then a later one to check that resuming also works
        # when we span over one epoch
        for checkpoint_name in ("checkpoint-5", "checkpoint-15"):
            checkpoint = os.path.join(tmp_dir, checkpoint_name)

            # reinitialize the trainer and resume from the saved checkpoint
            trainer = get_regression_trainer(**kwargs)
            trainer.train(resume_from_checkpoint=checkpoint)
            resumed_a = trainer.model.a.item()
            resumed_b = trainer.model.b.item()
            resumed_state = dataclasses.asdict(trainer.state)

            self.assertEqual(expected_a, resumed_a)
            self.assertEqual(expected_b, resumed_b)
            self.check_trainer_state_are_the_same(expected_state, resumed_state)
def test_hf_scheduler_ds_optimizer(self):
    # The HF scheduler paired with the DeepSpeed optimizer is a combination that is not
    # possible at the moment, so `train()` is expected to raise with a dedicated message.
    with mockenv_context(**self.dist_env_1_gpu):
        config = self.get_config_dict(ZERO2)
        # force default HF Trainer scheduler (the DeepSpeed optimizer section is kept)
        del config["scheduler"]
        offload_optimizer = config["zero_optimization"]["offload_optimizer"]
        offload_optimizer["device"] = "none"
        # force optimizer on the first step
        config["fp16"]["initial_scale_power"] = 1
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=config)
        with self.assertRaises(Exception) as raised:
            trainer.train()
    failure = raised.exception
    self.assertTrue(
        "HF scheduler + DeepSpeed optimizer combination is not possible" in str(failure),
        f"got exception: {failure}",
    )
def test_hf_optimizer_with_offload(self, stage):
    # must not allow non-DS optimizer when using ZERO-offload
    #
    # `stage` is the ZeRO stage identifier handed to `self.get_config_dict`; training with
    # the default HF Trainer optimizer while offload is on is expected to raise.
    ds_config_dict = self.get_config_dict(stage)
    del ds_config_dict["optimizer"]  # force default HF Trainer optimizer

    # force cpu offload
    # The stage is compared against the shared ZERO2/ZERO3 constants rather than the
    # "stage2"/"stage3" literals: the other tests in this file identify stages through those
    # constants, so the literal comparison never matched and the offload was never forced.
    if stage == ZERO2:
        ds_config_dict["zero_optimization"]["cpu_offload"] = True
    elif stage == ZERO3:
        ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
    # checked after the `assertRaises` block so that the assertion actually runs
    self.assertIn(
        "ZeRO Offload can only work with DeepSpeed optimizers",
        str(context.exception),
        f"got exception: {context.exception}",
    )
def test_save_checkpoints(self):
    # adapted from TrainerIntegrationTest.test_save_checkpoints
    # Train while saving every `save_every` steps, then let the shared checker validate
    # the checkpoints found in the output directory.
    tmp_dir = self.get_auto_remove_tmp_dir()
    config = deepcopy(self.ds_config_dict)
    # force optimizer on the first step
    config["fp16"]["initial_scale_power"] = 1
    save_every = 5

    # save checkpoints
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=save_every, deepspeed=config)
        trainer.train()

    expected_total = int(self.n_epochs * 64 / self.batch_size)
    self.check_saved_checkpoints_deepspeed(tmp_dir, save_every, expected_total)
def test_save_checkpoints(self, stage):
    # adapted from TrainerIntegrationTest.test_save_checkpoints
    # Train with the config of the given stage while saving every `save_every` steps, then
    # let the shared checker validate the checkpoints found in the output directory.
    save_every = 5
    tmp_dir = self.get_auto_remove_tmp_dir()
    config = self.get_config_dict(stage)
    # force optimizer on the first step
    config["fp16"]["initial_scale_power"] = 1
    if stage == ZERO3:
        config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

    # save checkpoints
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=save_every, deepspeed=config)
        trainer.train()

    expected_total = int(self.n_epochs * 64 / self.batch_size)
    self.check_saved_checkpoints_deepspeed(tmp_dir, save_every, expected_total, stage)
def test_can_resume_training_errors(self, stage):
    # Resuming must raise, with a recognizable message, when there is no checkpoint to
    # resume from or when the requested checkpoint path does not exist.
    with mockenv_context(**self.dist_env_1_gpu):
        config = self.get_config_dict(stage)
        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=tmp_dir, fp16=True, deepspeed=config)

        # 1. fail to find any checkpoint - due a fresh output_dir
        with self.assertRaises(Exception) as raised:
            trainer.train(resume_from_checkpoint=True)
        failure = raised.exception
        self.assertTrue(
            "No valid checkpoint found in output directory" in str(failure),
            f"got exception: {failure}",
        )

        # 2. fail to find a bogus checkpoint
        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
        with self.assertRaises(Exception) as raised:
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        failure = raised.exception
        self.assertTrue(
            "Can't find a valid checkpoint at" in str(failure),
            f"got exception: {failure}",
        )
def test_early_get_last_lr(self):
    # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
    # not run for the first few dozen steps while loss scale is too large, so a call to
    # `get_last_lr` during that warm up stage would fail.
    #
    # `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
    # `self.lr_scheduler.get_last_lr()`; originally that failed on the very first step.
    with mockenv_context(**self.dist_env_1_gpu):
        initial_a = initial_b = 0.0
        trainer_kwargs = {
            "a": initial_a,
            "b": initial_b,
            "local_rank": 0,
            "train_len": 8,
            "deepspeed": self.ds_config_file,
            "per_device_train_batch_size": 8,
            "logging_steps": 1,
        }
        trainer = get_regression_trainer(**trainer_kwargs)
        trainer.train()
        trained_a = trainer.model.a.item()

        # surviving `train()` is the actual test; the weight check only confirms that the
        # optimizer/scheduler didn't run (if it did, this test isn't testing the right thing)
        self.assertEqual(trained_a, initial_a)
def test_config_object(self):
    # Test that we can switch between zero3 and zero2 configs in the same process and that
    # the helpers `is_deepspeed_zero3_enabled()` / `deepspeed_config()` follow the most
    # recently created trainer.
    #
    # NOTE: the statement order and the reuse of the single `trainer` name matter here - the
    # final assertions rely on the last trainer being released by `del trainer`.
    output_dir = self.get_auto_remove_tmp_dir()
    kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)

    ds_config_zero3_dict = self.get_config_dict("zero3")
    ds_config_zero2_dict = self.get_config_dict("zero2")

    with mockenv_context(**self.dist_env_1_gpu):
        # merely creating a trainer with a zero3 config must be enough to enable zero3
        trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
        self.assertTrue(is_deepspeed_zero3_enabled())

        # test we can repeat that and with train this time
        trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
        trainer.train()
        self.assertTrue(is_deepspeed_zero3_enabled())

        # test zero3 is disabled once a trainer with a zero2 config has been created
        trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs)
        self.assertFalse(is_deepspeed_zero3_enabled())

        # check config obj: while a trainer is alive the config must be truthy
        config = deepspeed_config()
        self.assertTrue(bool(config), "Deepspeed config should be accessible")

        del trainer
        # now weakref should gc the global and we shouldn't get anything here
        # (presumably the global config is only weakly referenced - verify against the
        # implementation of `deepspeed_config`)
        config = deepspeed_config()
        self.assertFalse(is_deepspeed_zero3_enabled())
        self.assertFalse(bool(config), "Deepspeed config should not be accessible")
def test_early_get_last_lr(self, stage, dtype):
    # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
    # not run for the first few dozen steps while loss scale is too large, so a call to
    # `get_last_lr` during that warm up stage would fail.
    #
    # `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
    # `self.lr_scheduler.get_last_lr()`; originally that failed on the very first step.
    with mockenv_context(**self.dist_env_1_gpu):
        initial_a = initial_b = 0.0
        trainer_kwargs = {
            "a": initial_a,
            "b": initial_b,
            "local_rank": 0,
            "train_len": 8,
            "deepspeed": self.get_config_dict(stage),
            "per_device_train_batch_size": 8,
            "logging_steps": 1,
            # `dtype` names the trainer flag to switch on for this parametrization
            dtype: True,
        }
        trainer = get_regression_trainer(**trainer_kwargs)

        trainer.train()
        trained_a = trainer.model.a.item()

        # XXX: for some reason the check below fails with zero3/fp16 and with any stage/bf16 -
        # nothing crashes, but the outcome is qualitatively different, as if the optimizer
        # did run: oddly both a and b go from 0.0 to 1.0 - there is a bug somewhere
        # print(trainer.model.a.item())
        # print(trainer.model.b.item())
        # need to investigate at some point
        weight_check_unreliable = dtype == BF16 or (stage == ZERO3 and dtype == FP16)
        if weight_check_unreliable:
            return

        # surviving `train()` is the actual test; the weight check only confirms that the
        # optimizer/scheduler didn't run (if it did, this test isn't testing the right thing)
        self.assertEqual(trained_a, initial_a)
def test_load_best_model(self, stage, dtype):
    # Test that forced deepspeed reinit doesn't break the model. The forced re-init after
    # loading the best model in Trainer is there to work around a bug in Deepspeed
    # (the link to the bug report is missing from this comment).
    #
    # The test is derived from a repro script submitted in an Issue (link missing as well).
    #
    # One additional feature of this test is that we use a non-AdamW optimizer to test that
    # deepspeed doesn't fall back to AdamW, which would prevent the optimizer states from
    # loading correctly.
    #
    # NOTE(review): `dtype` is accepted but never referenced in the body - confirm whether
    # the matching precision flag was meant to be passed on to `TrainingArguments`.

    from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer  # noqa

    # debugging leftover kept from the original: "./xxx", after=False, before=False)
    output_dir = self.get_auto_remove_tmp_dir()

    ds_config_dict = self.get_config_dict(stage)
    del ds_config_dict["optimizer"]  # will use HF Trainer optimizer
    del ds_config_dict["scheduler"]  # will use HF Trainer scheduler
    # must use this setting to get the reload path exercised
    ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

    with mockenv_context(**self.dist_env_1_gpu):
        # a single train step with eval + save on every step and `load_best_model_at_end`,
        # so that the best-model reload happens at the end of `train()`
        args_dict = {
            "per_gpu_train_batch_size": 1,
            "per_gpu_eval_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "learning_rate": 1e-4,
            "num_train_epochs": 1,
            "do_train": True,
            "do_eval": True,
            "optim": "adafactor",  # the non-AdamW optimizer mentioned above
            "evaluation_strategy": "steps",
            "eval_steps": 1,
            "save_strategy": "steps",
            "save_steps": 1,
            "load_best_model_at_end": True,
            "max_steps": 1,
            "deepspeed": ds_config_dict,
        }

        training_args = TrainingArguments(output_dir, **args_dict)
        tokenizer = T5Tokenizer.from_pretrained(T5_TINY)
        model = T5ForConditionalGeneration.from_pretrained(T5_TINY)

        def _add_eos_to_examples(example):
            # Build the model input/target texts from one example; the target is the
            # first answer text, or "" when the example has no answers.
            example["input_text"] = f"question: {example['question']} context: {example['context']}"
            example["target_text"] = example["answers"]["text"][0] if len(example["answers"]["text"]) > 0 else ""
            return example

        def _convert_to_features(example_batch):
            # Tokenize a batch of input/target texts; the target token ids become the labels.
            input_encodings = tokenizer.batch_encode_plus(
                example_batch["input_text"], pad_to_max_length=True, max_length=512, truncation=True
            )
            target_encodings = tokenizer.batch_encode_plus(
                example_batch["target_text"], pad_to_max_length=True, max_length=16, truncation=True
            )

            encodings = {
                "input_ids": input_encodings["input_ids"],
                "attention_mask": input_encodings["attention_mask"],
                "labels": target_encodings["input_ids"],
            }

            return encodings

        def get_dataset():
            # Load the SQUAD sample fixture as both the train and the validation split and
            # return the processed train split plus a deep copy of it for evaluation.
            data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
            data_files = dict(train=data_file, validation=data_file)
            raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
            train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
            valid_dataset = deepcopy(train_dataset)
            return train_dataset, valid_dataset

        train_dataset, eval_dataset = get_dataset()

        trainer = Trainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )
        # there are no assertions: the test passes when neither call raises
        trainer.train()  # crash 1 was here
        trainer.evaluate()  # crash 2 was here
def test_can_resume_training(self):
    # adapted from TrainerIntegrationTest.test_can_resume_training
    # Covers both the successful resume path (two different checkpoints) and the two
    # failure modes (bogus checkpoint path, no checkpoint at all).
    tmp_dir = self.get_auto_remove_tmp_dir()
    config = deepcopy(self.ds_config_dict)
    # force optimizer on the first step
    config["fp16"]["initial_scale_power"] = 1
    kwargs = {
        "output_dir": tmp_dir,
        "train_len": 128,
        "save_steps": 5,
        "learning_rate": 0.1,
        "deepspeed": config,
    }

    with mockenv_context(**self.dist_env_1_gpu):
        # reference run: train without resuming and record the final weights and state
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        expected_a = trainer.model.a.item()
        expected_b = trainer.model.b.item()
        expected_state = dataclasses.asdict(trainer.state)

        # first an early checkpoint, then a later one to check that resuming also works
        # when we span over one epoch
        for checkpoint_name in ("checkpoint-5", "checkpoint-15"):
            checkpoint = os.path.join(tmp_dir, checkpoint_name)

            # reinitialize the trainer and resume from the saved checkpoint
            trainer = get_regression_trainer(**kwargs)
            trainer.train(resume_from_checkpoint=checkpoint)
            resumed_a = trainer.model.a.item()
            resumed_b = trainer.model.b.item()
            resumed_state = dataclasses.asdict(trainer.state)

            self.assertEqual(expected_a, resumed_a)
            self.assertEqual(expected_b, resumed_b)
            self.check_trainer_state_are_the_same(expected_state, resumed_state)

        # Now check failures

        # 1. fail to find a bogus checkpoint
        # (`checkpoint` still holds the "checkpoint-15" path from the last loop iteration)
        trainer = get_regression_trainer(**kwargs)
        with self.assertRaises(Exception) as raised:
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertTrue("failed to resume from checkpoint" in str(raised.exception))

        # 2. fail to find any checkpoint - due a fresh output_dir
        fresh_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=fresh_dir, deepspeed=config)
        with self.assertRaises(Exception) as raised:
            trainer.train(resume_from_checkpoint=True)
        self.assertTrue("No valid checkpoint found in output directory" in str(raised.exception))