        def _run():
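            """Save a checkpoint, reload it, and verify the model output is unchanged."""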
            inputs = self.get_inputs()
            args_defaults = {
                'num_layers': 2,
                'hidden_size': 128,
                'num_attention_heads': 8,
                'max_position_embeddings': 128,
            }

            model = get_gpt2_model(args_defaults)
            model = self.get_deepspeed_model(model, tmpdir)

            model.eval()
            # Compute the reference output before writing the checkpoint.
            with torch.no_grad():
                baseline = model(inputs[0].cuda(), inputs[1].cuda(),
                                 inputs[2].cuda())

            tag = 'mp_1'
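            # Store the Megatron checkpoint version in client state so the
            # loader can interpret the saved parameter layout.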
            state_dict = {'checkpoint_version': get_megatron_version()}
            model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)
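            # Make sure every rank has finished writing before reloading.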
            dist.barrier()
            model.load_checkpoint(tmpdir,
                                  tag=tag,
                                  load_optimizer_states=False,
                                  load_lr_scheduler_states=False)

            # The reloaded model must reproduce the pre-checkpoint output.
            with torch.no_grad():
                test = model(inputs[0].cuda(), inputs[1].cuda(),
                             inputs[2].cuda())
            assert torch.allclose(
                baseline, test, atol=1e-07
            ), f"Baseline output {baseline} does not match save-then-load output {test}"

        def _run_resize(inputs, tag, output, quit_event):
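            """Reload the saved checkpoint at a different model-parallel degree and report the output."""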
            reset_random()
            args_defaults = {
                'num_layers': 2,
                'hidden_size': 128,
                'num_attention_heads': 8,
                'max_position_embeddings': 128,
            }

            model = get_gpt2_model(args_defaults, mp_size=resize)
            model = self.get_deepspeed_model(model, tmpdir)

            model.eval()

            with torch.no_grad():
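                # The checkpoint was written at `mp_size`; loading it into a
                # model built with `mp_size=resize` reshapes the partitions.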
                model.load_checkpoint(tmpdir,
                                      tag=tag,
                                      load_optimizer_states=False,
                                      load_lr_scheduler_states=False)
                test = model(inputs[0].cuda(), inputs[1].cuda(),
                             inputs[2].cuda())
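                # Only rank 0 reports its output back to the parent process.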
                if dist.get_rank() == 0:
                    output.put(test.cpu())
            # Hold the worker open until the parent signals shutdown.
            quit_event.wait()

        def _run_baseline(inputs, tag, output, quit_event):
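            """Compute the reference output at the original model-parallel degree and save a checkpoint."""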
            reset_random()
            args_defaults = {
                'num_layers': 2,
                'hidden_size': 128,
                'num_attention_heads': 8,
                'max_position_embeddings': 128,
            }

            model = get_gpt2_model(args_defaults, mp_size=mp_size)
            model = self.get_deepspeed_model(model, tmpdir)

            model.eval()

            with torch.no_grad():
                baseline = model(inputs[0].cuda(), inputs[1].cuda(),
                                 inputs[2].cuda())
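                # Publish the reference output from rank 0 for later comparison.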
                if dist.get_rank() == 0:
                    output.put(baseline.cpu())

                # Tag the checkpoint with the Megatron version so it can be
                # reshaped when reloaded at a different model-parallel degree.
                state_dict = {'checkpoint_version': get_megatron_version()}
                model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)
                # Block until the parent has collected the output and checkpoint.
                quit_event.wait()