Code example #1
    def test_hf_scheduler_hf_optimizer(self):
        a = 0
        with mockenv_context(**self.dist_env_1_gpu):
            ds_config_zero2_dict = self.get_config_dict(ZERO2)
            del ds_config_zero2_dict["optimizer"]  # force default HF Trainer optimizer
            del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
            ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
            ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
            trainer.train()
        new_a = trainer.model.a.item()
        self.assertNotEqual(new_a, a)
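Several of these tests set initial_scale_power to 1 so that the optimizer actually steps on the very first iteration: with DeepSpeed's dynamic loss scaling the scale starts at 2 ** initial_scale_power, and a very large starting scale makes the first gradients overflow so those optimizer steps are skipped while the scaler backs off. Below is a minimal sketch of the fp16 section such a config is assumed to contain; the field values are illustrative, not taken from the tests.

# Sketch of the "fp16" block of a DeepSpeed config as assumed by the tests above.
# DeepSpeed's dynamic loss scale starts at 2 ** initial_scale_power, so the usual large
# default makes the earliest steps overflow and get skipped, while a value of 1 (scale 2)
# lets the optimizer run on step 1. The exact default values below are illustrative.
fp16_section = {
    "enabled": True,
    "loss_scale": 0,  # 0 selects dynamic loss scaling
    "initial_scale_power": 16,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1,
}
fp16_section["initial_scale_power"] = 1  # what the tests do to force an optimizer step right away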
Code example #2
    def test_can_resume_training_normal(self, stage):
        # adapted from TrainerIntegrationTest.test_can_resume_training
        # test normal resume for each stage separately, error-handling is tested in a different test
        output_dir = self.get_auto_remove_tmp_dir()
        ds_config_dict = self.get_config_dict(stage)
        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        if stage == ZERO3:
            ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

        kwargs = dict(
            output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, fp16=True, deepspeed=ds_config_dict
        )

        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(**kwargs)
            trainer.train()
            (a, b) = trainer.model.a.item(), trainer.model.b.item()
            state = dataclasses.asdict(trainer.state)

            checkpoint = os.path.join(output_dir, "checkpoint-5")

            # Reinitialize trainer
            trainer = get_regression_trainer(**kwargs)

            trainer.train(resume_from_checkpoint=checkpoint)
            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
            state1 = dataclasses.asdict(trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)

            # Now check with a later checkpoint that it also works when we span over one epoch
            checkpoint = os.path.join(output_dir, "checkpoint-15")

            # Reinitialize trainer and load model
            trainer = get_regression_trainer(**kwargs)

            trainer.train(resume_from_checkpoint=checkpoint)
            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
            state1 = dataclasses.asdict(trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)
Code example #3
    def test_hf_scheduler_ds_optimizer(self):
        # this combo is not possible at the moment
        with mockenv_context(**self.dist_env_1_gpu):
            ds_config_zero2_dict = self.get_config_dict(ZERO2)
            del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
            ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
            ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
            trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
            with self.assertRaises(Exception) as context:
                trainer.train()
        self.assertTrue(
            "HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception),
            f"got exception: {context.exception}",
        )
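Together with code example #1, this pins down which optimizer/scheduler pairings are exercised. The summary below is a reading of these snippets plus the transformers DeepSpeed integration documentation; the two rows marked (docs) are not covered by the code shown here and should be treated as assumptions.

# Which side provides the optimizer/scheduler is chosen by keeping or deleting the
# "optimizer" / "scheduler" keys of the DeepSpeed config dict.
OPTIMIZER_SCHEDULER_COMBOS = {
    ("HF optimizer", "HF scheduler"): "supported",                # both keys deleted (code example #1)
    ("DeepSpeed optimizer", "DeepSpeed scheduler"): "supported",  # both keys kept (docs)
    ("HF optimizer", "DeepSpeed scheduler"): "supported",         # only "optimizer" deleted (docs)
    ("DeepSpeed optimizer", "HF scheduler"): "rejected",          # only "scheduler" deleted (code example #3)
}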
Code example #4
    def test_hf_optimizer_with_offload(self, stage):
        # must not allow a non-DS optimizer when using ZeRO-offload
        ds_config_dict = self.get_config_dict(stage)
        del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
        # force cpu offload
        if stage == ZERO2:
            ds_config_dict["zero_optimization"]["cpu_offload"] = True
        elif stage == ZERO3:
            ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
            with self.assertRaises(Exception) as context:
                trainer.train()
            self.assertIn(
                "ZeRO Offload can only work with DeepSpeed optimizers",
                str(context.exception),
                f"got exception: {context.exception}",
            )
Code example #5
    def test_save_checkpoints(self):
        # adapted from  TrainerIntegrationTest.test_save_checkpoints

        output_dir = self.get_auto_remove_tmp_dir()
        ds_config_dict = deepcopy(self.ds_config_dict)
        ds_config_dict["fp16"][
            "initial_scale_power"] = 1  # force optimizer on the first step
        freq = 5

        # save checkpoints
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(
                output_dir=output_dir,
                save_steps=freq,
                deepspeed=ds_config_dict,
            )
            trainer.train()

        total = int(self.n_epochs * 64 / self.batch_size)
        self.check_saved_checkpoints_deepspeed(output_dir, freq, total)
Code example #6
    def test_save_checkpoints(self, stage):
        # adapted from  TrainerIntegrationTest.test_save_checkpoints

        freq = 5
        output_dir = self.get_auto_remove_tmp_dir()
        ds_config_dict = self.get_config_dict(stage)
        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        if stage == ZERO3:
            ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

        # save checkpoints
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(
                output_dir=output_dir,
                save_steps=freq,
                deepspeed=ds_config_dict,
            )
            trainer.train()

        total = int(self.n_epochs * 64 / self.batch_size)
        self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage)
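check_saved_checkpoints_deepspeed itself is not shown in these examples. The sketch below is a hypothetical illustration of the kind of check it could perform, assuming the Trainer writes one output_dir/checkpoint-<step> directory every save_steps steps; the helper name and the example numbers are assumptions, not the real implementation.

import os


def expected_checkpoint_dirs(output_dir, freq, total):
    # Hypothetical helper: with save_steps=freq and `total` training steps overall,
    # checkpoint directories are expected at steps freq, 2 * freq, ... up to total.
    return [os.path.join(output_dir, f"checkpoint-{step}") for step in range(freq, total + 1, freq)]


# For instance, if int(n_epochs * 64 / batch_size) worked out to 24 steps with freq = 5,
# checkpoint-5, checkpoint-10, checkpoint-15 and checkpoint-20 would be expected.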
Code example #7
    def test_can_resume_training_errors(self, stage):

        with mockenv_context(**self.dist_env_1_gpu):
            ds_config_dict = self.get_config_dict(stage)
            output_dir = self.get_auto_remove_tmp_dir()
            trainer = get_regression_trainer(output_dir=output_dir, fp16=True, deepspeed=ds_config_dict)

            # 1. fail to find any checkpoint - due to a fresh output_dir
            with self.assertRaises(Exception) as context:
                trainer.train(resume_from_checkpoint=True)
            self.assertTrue(
                "No valid checkpoint found in output directory" in str(context.exception),
                f"got exception: {context.exception}",
            )

            # 2. fail to find a bogus checkpoint
            with self.assertRaises(Exception) as context:
                checkpoint = os.path.join(output_dir, "checkpoint-5")
                trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
            self.assertTrue(
                "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}"
            )
Code example #8
    def test_early_get_last_lr(self):
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
        # not run for the first few dozen steps while the loss scale is too large, and thus
        # `get_last_lr` will fail if called during that warm-up stage.
        #
        # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
        # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
        with mockenv_context(**self.dist_env_1_gpu):
            a = b = 0.0
            trainer = get_regression_trainer(
                a=a,
                b=b,
                local_rank=0,
                train_len=8,
                deepspeed=self.ds_config_file,
                per_device_train_batch_size=8,
                logging_steps=1,
            )
            trainer.train()
            no_grad_accum_a = trainer.model.a.item()

            # it's enough that train didn't fail for this test, but we must check that
            # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
            self.assertEqual(no_grad_accum_a, a)
Code example #9
    def test_config_object(self):
        # test that we can switch from zero2 to zero3 in the same process for example
        # test is_zero, etc.
        output_dir = self.get_auto_remove_tmp_dir()
        kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)

        ds_config_zero3_dict = self.get_config_dict("zero3")
        ds_config_zero2_dict = self.get_config_dict("zero2")

        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
            self.assertTrue(is_deepspeed_zero3_enabled())

            # test we can repeat that and with train this time
            trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
            trainer.train()
            self.assertTrue(is_deepspeed_zero3_enabled())

            # test zero3 is disabled
            trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs)
            self.assertFalse(is_deepspeed_zero3_enabled())

            # check config obj
            config = deepspeed_config()
            self.assertTrue(bool(config), "Deepspeed config should be accessible")

            del trainer
            # now weakref should gc the global and we shouldn't get anything here
            config = deepspeed_config()
            self.assertFalse(is_deepspeed_zero3_enabled())
            self.assertFalse(bool(config), "Deepspeed config should not be accessible")
Code example #10
    def test_early_get_last_lr(self, stage, dtype):
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
        # not run for the first few dozen steps while the loss scale is too large, and thus
        # `get_last_lr` will fail if called during that warm-up stage.
        #
        # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
        # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
        with mockenv_context(**self.dist_env_1_gpu):
            a = b = 0.0
            kwargs = dict(
                a=a,
                b=b,
                local_rank=0,
                train_len=8,
                deepspeed=self.get_config_dict(stage),
                per_device_train_batch_size=8,
                logging_steps=1,
            )
            kwargs[dtype] = True
            trainer = get_regression_trainer(**kwargs)

            trainer.train()
            post_train_a = trainer.model.a.item()

            # XXX: for some reason the following check fails with zero3/fp16 and any/bf16 - not
            # broken, but a different qualitative outcome - as if the optimizer did run
            # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere
            # print(trainer.model.a.item())
            # print(trainer.model.b.item())
            # need to investigate at some point
            if (stage == ZERO3 and dtype == FP16) or (dtype == BF16):
                return

            # it's enough that train didn't fail for this test, but we must check that
            # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
            self.assertEqual(post_train_a, a)
Code example #11
    def test_load_best_model(self, stage, dtype):
        # Test that forced deepspeed reinit doesn't break the model. The forced re-init after
        # loading the best model in Trainer is there to work around this bug in Deepspeed
        # https://github.com/microsoft/DeepSpeed/issues/1612
        #
        # The test is derived from a repro script submitted in this Issue:
        # https://github.com/huggingface/transformers/issues/17114
        #
        # One additional feature of this test is that we use a non-AdamW optimizer to test that
        # deepspeed doesn't fall back to AdamW, which would prevent the optimizer states from loading
        # correctly

        from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer  # noqa

        output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False, before=False)

        ds_config_dict = self.get_config_dict(stage)
        del ds_config_dict["optimizer"]  # will use HF Trainer optimizer
        del ds_config_dict["scheduler"]  # will use HF Trainer scheduler
        # must use this setting to get the reload path exercised
        ds_config_dict["zero_optimization"][
            "stage3_gather_16bit_weights_on_model_save"] = True

        with mockenv_context(**self.dist_env_1_gpu):

            args_dict = {
                "per_gpu_train_batch_size": 1,
                "per_gpu_eval_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 1e-4,
                "num_train_epochs": 1,
                "do_train": True,
                "do_eval": True,
                "optim": "adafactor",
                "evaluation_strategy": "steps",
                "eval_steps": 1,
                "save_strategy": "steps",
                "save_steps": 1,
                "load_best_model_at_end": True,
                "max_steps": 1,
                "deepspeed": ds_config_dict,
            }

            training_args = TrainingArguments(output_dir, **args_dict)
            tokenizer = T5Tokenizer.from_pretrained(T5_TINY)
            model = T5ForConditionalGeneration.from_pretrained(T5_TINY)

            def _add_eos_to_examples(example):
                example["input_text"] = f"question: {example['question']}  context: {example['context']}"
                example["target_text"] = example["answers"]["text"][0] if len(example["answers"]["text"]) > 0 else ""
                return example

            def _convert_to_features(example_batch):
                input_encodings = tokenizer.batch_encode_plus(
                    example_batch["input_text"], pad_to_max_length=True, max_length=512, truncation=True
                )
                target_encodings = tokenizer.batch_encode_plus(
                    example_batch["target_text"], pad_to_max_length=True, max_length=16, truncation=True
                )

                encodings = {
                    "input_ids": input_encodings["input_ids"],
                    "attention_mask": input_encodings["attention_mask"],
                    "labels": target_encodings["input_ids"],
                }

                return encodings

            def get_dataset():
                data_file = str(self.tests_dir / "fixtures/tests_samples/SQUAD/sample.json")
                data_files = dict(train=data_file, validation=data_file)
                raw_datasets = datasets.load_dataset("json", data_files=data_files, field="data")
                train_dataset = raw_datasets["train"].map(_add_eos_to_examples).map(_convert_to_features, batched=True)
                valid_dataset = deepcopy(train_dataset)
                return train_dataset, valid_dataset

            train_dataset, eval_dataset = get_dataset()

            trainer = Trainer(
                model=model,
                tokenizer=tokenizer,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=eval_dataset,
            )
            trainer.train()  # crash 1 was here
            trainer.evaluate()  # crash 2 was here
Code example #12
    def test_can_resume_training(self):
        # adapted from TrainerIntegrationTest.test_can_resume_training

        output_dir = self.get_auto_remove_tmp_dir()
        ds_config_dict = deepcopy(self.ds_config_dict)
        ds_config_dict["fp16"][
            "initial_scale_power"] = 1  # force optimizer on the first step
        kwargs = dict(output_dir=output_dir,
                      train_len=128,
                      save_steps=5,
                      learning_rate=0.1,
                      deepspeed=ds_config_dict)

        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(**kwargs)
            trainer.train()
            (a, b) = trainer.model.a.item(), trainer.model.b.item()
            state = dataclasses.asdict(trainer.state)

            checkpoint = os.path.join(output_dir, "checkpoint-5")

            # Reinitialize trainer
            trainer = get_regression_trainer(**kwargs)

            trainer.train(resume_from_checkpoint=checkpoint)
            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
            state1 = dataclasses.asdict(trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)

            # Now check with a later checkpoint that it also works when we span over one epoch
            checkpoint = os.path.join(output_dir, "checkpoint-15")

            # Reinitialize trainer and load model
            trainer = get_regression_trainer(**kwargs)

            trainer.train(resume_from_checkpoint=checkpoint)
            (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
            state1 = dataclasses.asdict(trainer.state)
            self.assertEqual(a, a1)
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)

            # Now check failures

            # 1. fail to find a bogus checkpoint
            trainer = get_regression_trainer(**kwargs)
            with self.assertRaises(Exception) as context:
                trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
            self.assertTrue("failed to resume from checkpoint" in str(context.exception))

            # 2. fail to find any checkpoint - due to a fresh output_dir
            output_dir2 = self.get_auto_remove_tmp_dir()
            trainer = get_regression_trainer(output_dir=output_dir2, deepspeed=ds_config_dict)
            with self.assertRaises(Exception) as context:
                trainer.train(resume_from_checkpoint=True)
            self.assertTrue("No valid checkpoint found in output directory" in
                            str(context.exception))