def test_config_object(self):
    # test that we can switch from zero2 to zero3 in the same process, for example
    # test is_zero, etc.
    output_dir = self.get_auto_remove_tmp_dir()
    kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)

    ds_config_zero3_dict = self.get_config_dict("zero3")
    ds_config_zero2_dict = self.get_config_dict("zero2")

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
        self.assertTrue(is_deepspeed_zero3_enabled())

        # test we can repeat that, and this time train as well
        trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
        trainer.train()
        self.assertTrue(is_deepspeed_zero3_enabled())

        # test zero3 is disabled
        trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs)
        self.assertFalse(is_deepspeed_zero3_enabled())

        # check config obj
        config = deepspeed_config()
        self.assertTrue(bool(config), "Deepspeed config should be accessible")

        del trainer
        # now the weakref should gc the global and we shouldn't get anything here
        config = deepspeed_config()
        self.assertFalse(is_deepspeed_zero3_enabled())
        self.assertFalse(bool(config), "Deepspeed config should not be accessible")
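
# How the weakref-global pattern exercised above can work - an illustrative, self-contained
# sketch (hypothetical names, not the actual transformers internals): the live config is held
# only through a weak reference, so deleting its owner (the Trainer) empties the accessor.
def _weakref_config_sketch():
    import weakref

    class _Cfg:
        config = {"zero_optimization": {"stage": 3}}

    owner = _Cfg()
    ref = weakref.ref(owner)  # what the integration would store at module level

    def config_accessor():
        return ref().config if ref() is not None else None

    assert config_accessor() is not None  # accessible while the owner lives
    del owner
    assert config_accessor() is None  # gc'd together with its owner, as in the test above
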
def test_gradient_accumulation(self, stage):
    # this test measures that we get identical weights and similar loss with:
    # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1
    # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2
    # since the 2nd should produce the same effective batch as the 1st, with the same results
    #
    # I can get an identical loss for a small train_len=32, plus the power of the initial
    # dynamic loss scale value set to:
    #   "fp16.initial_scale_power": 1
    # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file,
    # but for some reason going to train_len=64 the weights start to mismatch with this setup.
    # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical

    train_len = 64
    a = b = 0.0

    with mockenv_context(**self.dist_env_1_gpu):
        no_grad_accum_trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=train_len,
            fp16=True,
            deepspeed=self.get_config_dict(stage),
            per_device_train_batch_size=8,
            gradient_accumulation_steps=1,
        )
        no_grad_accum_result = no_grad_accum_trainer.train()
        no_grad_accum_loss = no_grad_accum_result.training_loss
        no_grad_accum_a = no_grad_accum_trainer.model.a.item()
        no_grad_accum_b = no_grad_accum_trainer.model.b.item()
        # make sure the optimizer kicked in - if `a` hasn't changed from its original value
        # then make train_len bigger
        self.assertNotEqual(no_grad_accum_a, a)

    with mockenv_context(**self.dist_env_1_gpu):
        yes_grad_accum_trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=train_len,
            fp16=True,
            deepspeed=self.get_config_dict(stage),
            per_device_train_batch_size=4,
            gradient_accumulation_steps=2,
        )
        yes_grad_accum_result = yes_grad_accum_trainer.train()
        yes_grad_accum_loss = yes_grad_accum_result.training_loss
        yes_grad_accum_a = yes_grad_accum_trainer.model.a.item()
        yes_grad_accum_b = yes_grad_accum_trainer.model.b.item()
        self.assertNotEqual(yes_grad_accum_a, a)

    # training with half the batch size but 2 accumulation steps should give the same
    # weights, but we sometimes still see a slight difference of 1e-6
    self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
    self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)

    # see the note above on how to get an identical loss on a small bs
    self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)
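
# The equivalence the test above relies on, in one line: everything else being equal, the
# effective batch size is per_device_train_batch_size * gradient_accumulation_steps * world_size,
# so bs=8/accum=1 and bs=4/accum=2 see the same samples per optimizer step.
# A minimal sanity check (hypothetical helper, not part of the test suite):
def _effective_train_batch_size(per_device_bs, grad_accum_steps, world_size=1):
    return per_device_bs * grad_accum_steps * world_size


assert _effective_train_batch_size(8, 1) == _effective_train_batch_size(4, 2) == 8
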
def test_can_resume_training_normal(self, stage):
    # adapted from TrainerIntegrationTest.test_can_resume_training
    # test normal resume for each stage separately; error-handling is tested in a different test
    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = self.get_config_dict(stage)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    if stage == ZERO3:
        ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

    kwargs = dict(
        output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, fp16=True, deepspeed=ds_config_dict
    )

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(output_dir, "checkpoint-5")

        # Reinitialize trainer
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)

        # Now check with a later checkpoint that it also works when we span over one epoch
        checkpoint = os.path.join(output_dir, "checkpoint-15")

        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
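
# Note on the ZERO3-only flag set above: under ZeRO-3 the fp16 weights live sharded across
# ranks, so without "stage3_gather_fp16_weights_on_model_save": true the saved model file
# would be incomplete; the flag makes DeepSpeed gather the full fp16 weights onto the saving
# rank first, which is what allows the resumed run to reproduce identical `a` and `b` here.
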
def test_fake_notebook_no_launcher(self):
    # this setup emulates a notebook where a launcher needs to be emulated by hand
    with CaptureStd() as cs:  # noqa
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
            trainer.train()
def test_load_state_dict_from_zero_checkpoint(self, stage):
    # test that we can load fp32 weights directly from the zero checkpoint into the current model

    output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False, before=False)

    ds_config_dict = self.get_config_dict(stage)

    kwargs = dict(
        output_dir=output_dir,
        train_len=4,
        per_device_train_batch_size=4,
        num_train_epochs=1,
        save_strategy="steps",
        save_steps=1,
        learning_rate=0.1,
        fp16=True,
        deepspeed=ds_config_dict,
    )

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint_dir = get_last_checkpoint(output_dir)
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)

        (a1, b1) = model.a.item(), model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)
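
# The same fp32 reconstruction works outside the Trainer too - a hedged sketch using
# deepspeed's documented zero_to_fp32 utilities (`model` and `checkpoint_dir` are
# placeholders here, and this helper is illustrative, not part of the test suite):
def _extract_fp32_weights_sketch(model, checkpoint_dir):
    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint

    # consolidates the sharded ZeRO partitions found under checkpoint_dir and loads
    # the resulting full fp32 state_dict into `model`, returning the model
    return load_state_dict_from_zero_checkpoint(model, checkpoint_dir)
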
def test_can_resume_training_errors(self, stage):
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_dict = self.get_config_dict(stage)
        output_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=output_dir, fp16=True, deepspeed=ds_config_dict)

        # 1. fail to find any checkpoint - due to a fresh output_dir
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=True)
        self.assertTrue(
            "No valid checkpoint found in output directory" in str(context.exception),
            f"got exception: {context.exception}",
        )

        # 2. fail to find a bogus checkpoint
        with self.assertRaises(Exception) as context:
            checkpoint = os.path.join(output_dir, "checkpoint-5")
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertTrue(
            "Can't find a valid checkpoint at" in str(context.exception),
            f"got exception: {context.exception}",
        )
def test_early_get_last_lr(self, stage):
    # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
    # not run for the first few dozen steps while the loss scale is too large, and thus during
    # that warm up stage `get_last_lr` will fail if called.
    #
    # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
    # `self.lr_scheduler.get_last_lr()`, and originally it'd fail on the very first step.
    with mockenv_context(**self.dist_env_1_gpu):
        a = b = 0.0
        trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=8,
            fp16=True,
            deepspeed=self.get_config_dict(stage),
            per_device_train_batch_size=8,
            logging_steps=1,
        )
        trainer.train()
        post_train_a = trainer.model.a.item()

        # XXX: for some reason the following check fails with zero3 - not broken, but a
        # qualitatively different outcome - as if the optimizer did run
        # oddly, both a and b go from 0.0 to 1.0 - there is a bug somewhere
        # print(trainer.model.a.item())
        # print(trainer.model.b.item())
        # need to investigate at some point
        if stage == ZERO3:
            return

        # it's enough that train didn't fail for this test, but we must check that the
        # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
        self.assertEqual(post_train_a, a)
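
# Why `get_last_lr` can fail early - a schematic of dynamic loss scaling (hypothetical
# pseudo-logic, not deepspeed's actual code): while the loss scale is too large every step
# overflows, optimizer.step() and scheduler.step() are skipped, and the scheduler has no
# last lr to report yet.
def _loss_scale_step_sketch(optimizer, scheduler, grads_overflowed, loss_scale):
    if grads_overflowed:
        # skip the step and halve the scale instead - the scheduler never advances,
        # so scheduler.get_last_lr() has nothing to return yet
        return loss_scale / 2
    optimizer.step()
    scheduler.step()  # only now does get_last_lr() become valid
    return loss_scale
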
def test_fake_notebook_no_launcher(self):
    # this setup emulates a notebook where a launcher needs to be emulated by hand
    with CaptureStd() as cs:
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
            trainer.train()
    assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
def test_fake_notebook_no_launcher(self):
    sys.path.append(self.tests_dir_str)
    from test_trainer import get_regression_trainer

    del sys.path[-1]  # restore
    ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
    with CaptureStd() as cs:
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_file)
        trainer.train()
    assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
def test_hf_ds_config_mismatch(self):
    ds_config = self.get_config_dict(ZERO2)

    # Purposefully configure these values to mismatch TrainingArguments values.
    # This currently doesn't cover all keys (but it could)
    per_device_train_batch_size = 2
    ds_config["train_micro_batch_size_per_gpu"] = per_device_train_batch_size + 2

    ds_config["train_batch_size"] = 1000

    gradient_accumulation_steps = 2
    ds_config["gradient_accumulation_steps"] = gradient_accumulation_steps + 2

    max_grad_norm = 1.0
    ds_config["gradient_clipping"] = max_grad_norm + 0.1

    adam_beta1, adam_beta2 = 0.9, 0.99
    ds_config["optimizer"]["params"]["betas"] = [adam_beta1 - 0.1, adam_beta2 - 0.1]

    fp16 = True
    ds_config["fp16"]["enabled"] = not fp16

    keys = [
        "per_device_train_batch_size",
        "train_batch_size",
        "gradient_accumulation_steps",
        "max_grad_norm",
        "betas",
        "fp16",
    ]

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(
            local_rank=0,
            fp16=fp16,
            deepspeed=ds_config,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            max_grad_norm=max_grad_norm,
            adam_beta1=adam_beta1,
            adam_beta2=adam_beta2,
        )
        with self.assertRaises(Exception) as context:
            trainer.train()

    for key in keys:
        self.assertTrue(
            key in str(context.exception),
            f"{key} is not in the exception message:\n{context.exception}",
        )
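
# The inverse of the mismatch above: the HF Trainer integration lets the ds_config defer
# to TrainingArguments via the special "auto" value, e.g. (a trimmed, illustrative sketch
# of a zero2-style config, not the full file):
_ds_config_auto_sketch = {
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "fp16": {"enabled": "auto"},
    "optimizer": {"type": "AdamW", "params": {"lr": "auto", "betas": "auto"}},
}
# with "auto" everywhere there is nothing to mismatch, so the consistency check the test
# above exercises cannot trip.
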
def test_hf_scheduler_ds_optimizer(self):
    a = 0
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_zero2_dict = self.get_config_dict(ZERO2)
        del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
        ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
        ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
        trainer.train()
    new_a = trainer.model.a.item()
    self.assertNotEqual(new_a, a)
def test_hf_optimizer_with_offload(self, stage):
    # non-DS optimizers can be used with ZeRO-offload, as long as they have both CPU and
    # GPU implementations (LAMB being the exception)
    ds_config_dict = self.get_config_dict(stage)
    del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
    # force cpu offload
    ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
        with CaptureLogger(deepspeed_logger) as cl:
            trainer.train()
        self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
def test_hf_scheduler_ds_optimizer(self):
    # this combo is not possible at the moment
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_zero2_dict = self.get_config_dict(ZERO2)
        del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
        ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
        ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception))
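
# Context for the failure asserted above (as stated by the integration's own error message):
# when DeepSpeed instantiates its optimizer it also drives the stepping internally, so an
# HF-created scheduler has no torch optimizer to attach to - at the time this test was
# written, only HF scheduler + HF optimizer, DS scheduler + DS optimizer, and
# DS scheduler + HF optimizer were wired up.
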
def test_fake_notebook_no_launcher(self, stage):
    # this setup emulates a notebook where a launcher needs to be emulated by hand

    # note that unittest resets sys.stdout on each test, so `CaptureStd` would capture the
    # DeepSpeed log if this test happened to run first in this pytest worker. But it would fail
    # if it's not run as the first test, as `sys.stdout` would no longer be the same. So we either
    # have to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from
    # the deepspeed_logger.
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage))
        with CaptureLogger(deepspeed_logger) as cl:
            trainer.train()
        self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
def test_stage3_nvme_offload(self):
    with mockenv_context(**self.dist_env_1_gpu):
        # this actually doesn't have to be on NVMe, any storage will do since this test only
        # runs a simple check that we can use some directory as if it were NVMe
        nvme_path = self.get_auto_remove_tmp_dir()
        nvme_config = dict(device="nvme", nvme_path=nvme_path)
        ds_config_zero3_dict = self.get_config_dict(ZERO3)
        ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
        ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict)
        with CaptureLogger(deepspeed_logger) as cl:
            trainer.train()
        self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
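
# For a real NVMe setup the offload sections usually carry tuning knobs beyond the two keys
# used above - a hedged sketch based on DeepSpeed's documented options (path and values are
# illustrative, not recommendations):
_nvme_offload_sketch = {
    "zero_optimization": {
        "offload_optimizer": {"device": "nvme", "nvme_path": "/local_nvme", "pin_memory": True},
        "offload_param": {"device": "nvme", "nvme_path": "/local_nvme", "pin_memory": True, "buffer_count": 5},
    },
    # async-io settings that gate NVMe read/write throughput
    "aio": {
        "block_size": 262144,
        "queue_depth": 32,
        "thread_count": 1,
        "single_submit": False,
        "overlap_events": True,
    },
}
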
def test_hf_optimizer_with_offload(self, stage):
    # must not allow a non-DS optimizer when using ZeRO-offload
    ds_config_dict = self.get_config_dict(stage)
    del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
    # force cpu offload
    ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertIn(
            "ZeRO Offload can only work with DeepSpeed optimizers",
            str(context.exception),
            f"got exception: {context.exception}",
        )
def test_ds_scheduler_hf_optimizer(self):
    a = 0
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_dict = deepcopy(self.ds_config_dict)
        del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
        ds_config_dict["zero_optimization"]["cpu_offload"] = False
        ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
        trainer = get_regression_trainer(a=a, local_rank=0, deepspeed=ds_config_dict)
        trainer.train()
    new_a = trainer.model.a.item()
    self.assertNotEqual(new_a, a)
def test_hf_optimizer_with_offload(self):
    # must not allow a non-DS optimizer when using ZeRO-offload
    with mockenv_context(**self.dist_env_1_gpu):
        ds_config_zero2_dict = self.get_config_dict(ZERO2)
        del ds_config_zero2_dict["optimizer"]  # force default HF Trainer optimizer
        ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True
        # sanity check - should the default config change
        assert (
            "cpu_offload" in ds_config_zero2_dict["zero_optimization"]
            and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True
        ), "ensure the config is set up correctly"
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception))
def test_fp32(self, stage):
    ds_config_dict = self.get_config_dict(stage)
    ds_config_dict["fp16"]["enabled"] = False  # force non-fp16 mode

    # XXX: do we go via from_pretrained in zero 3 here? need to test zero.Init(dtype=torch.float)

    # XXX: rewrite this test once fp32 is supported by DeepSpeed
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train()
        self.assertIn(
            "ZeRO is only supported if fp16 is enabled",
            str(context.exception),
            f"got exception: {context.exception}",
        )
def test_save_checkpoints(self):
    # adapted from TrainerIntegrationTest.test_save_checkpoints

    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = deepcopy(self.ds_config_dict)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    freq = 5

    # save checkpoints
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(
            output_dir=output_dir,
            save_steps=freq,
            deepspeed=ds_config_dict,
        )
        trainer.train()

    total = int(self.n_epochs * 64 / self.batch_size)
    self.check_saved_checkpoints_deepspeed(output_dir, freq, total)
def test_save_checkpoints(self, stage):
    # adapted from TrainerIntegrationTest.test_save_checkpoints

    freq = 5
    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = self.get_config_dict(stage)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    if stage == ZERO3:
        ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True

    # save checkpoints
    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(
            output_dir=output_dir,
            save_steps=freq,
            deepspeed=ds_config_dict,
        )
        trainer.train()

    total = int(self.n_epochs * 64 / self.batch_size)
    self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage)
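
# What a saved DeepSpeed checkpoint roughly looks like on disk, for reference when reading
# check_saved_checkpoints_deepspeed (a sketch from a single-gpu zero2-style run; exact
# filenames vary by stage and world size):
#
#   output_dir/
#     checkpoint-5/
#       config.json, trainer_state.json, training_args.bin, ...
#       zero_to_fp32.py                      # consolidation script DeepSpeed drops in
#       global_step5/
#         mp_rank_00_model_states.pt
#         zero_pp_rank_0_mp_rank_00_optim_states.pt
#     checkpoint-10/
#     ...
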
def test_early_get_last_lr(self):
    # with deepspeed's fp16 and dynamic loss scale enabled, the optimizer/scheduler steps may
    # not run for the first few dozen steps while the loss scale is too large, and thus during
    # that warm up stage `get_last_lr` will fail if called.
    #
    # setting `logging_steps=1` forces an early `trainer._maybe_log_save_evaluate()` which calls
    # `self.lr_scheduler.get_last_lr()`, and originally it'd fail on the very first step.
    with mockenv_context(**self.dist_env_1_gpu):
        a = b = 0.0
        trainer = get_regression_trainer(
            a=a,
            b=b,
            local_rank=0,
            train_len=8,
            deepspeed=self.ds_config_file,
            per_device_train_batch_size=8,
            logging_steps=1,
        )
        trainer.train()
        no_grad_accum_a = trainer.model.a.item()

        # it's enough that train didn't fail for this test, but we must check that the
        # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
        self.assertEqual(no_grad_accum_a, a)
def test_can_resume_training(self):
    # adapted from TrainerIntegrationTest.test_can_resume_training

    output_dir = self.get_auto_remove_tmp_dir()
    ds_config_dict = deepcopy(self.ds_config_dict)
    ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
    kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**kwargs)
        trainer.train()
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)

        checkpoint = os.path.join(output_dir, "checkpoint-5")

        # Reinitialize trainer
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)

        # Now check with a later checkpoint that it also works when we span over one epoch
        checkpoint = os.path.join(output_dir, "checkpoint-15")

        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)

        trainer.train(resume_from_checkpoint=checkpoint)
        (a1, b1) = trainer.model.a.item(), trainer.model.b.item()
        state1 = dataclasses.asdict(trainer.state)
        self.assertEqual(a, a1)
        self.assertEqual(b, b1)
        self.check_trainer_state_are_the_same(state, state1)

        # Now check failures

        # 1. fail to find a bogus checkpoint
        trainer = get_regression_trainer(**kwargs)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertTrue("failed to resume from checkpoint" in str(context.exception))

        # 2. fail to find any checkpoint - due to a fresh output_dir
        output_dir2 = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=output_dir2, deepspeed=ds_config_dict)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=True)
        self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))