def _run():
    """Save-then-load round-trip on a single model-parallel partition.

    Runs a forward pass, checkpoints the model, reloads the checkpoint,
    runs the forward pass again, and asserts the outputs match.
    Relies on closure variables `self` and `tmpdir` from the enclosing test.
    """
    inputs = self.get_inputs()
    args_defaults = {
        'num_layers': 2,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }

    model = get_gpt2_model(args_defaults)
    model = self.get_deepspeed_model(model, tmpdir)
    model.eval()

    tag = 'mp_1'
    # Record the Megatron checkpoint version so load can interpret layouts.
    state_dict = {'checkpoint_version': get_megatron_version()}

    # no_grad for both forwards: we only compare outputs, no autograd needed
    # (matches the sibling _run_resize/_run_baseline helpers).
    with torch.no_grad():
        baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())

        model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)
        # Ensure every rank has finished writing before any rank reads.
        dist.barrier()
        model.load_checkpoint(tmpdir,
                              tag=tag,
                              load_optimizer_states=False,
                              load_lr_scheduler_states=False)

        # Fix: move inputs to GPU exactly as in the baseline forward above;
        # previously the second forward received the raw (possibly CPU) tensors.
        test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())

    assert torch.allclose(
        baseline, test, atol=1e-07
    ), f"Baseline output {baseline} is not equal to save-then-load output {test}"
def _run_resize(inputs, tag, output, quit_event):
    """Load a previously saved checkpoint into a resized model-parallel model.

    Builds a GPT-2 model with `mp_size=resize` (closure variable), restores
    the checkpoint identified by `tag` from `tmpdir`, runs one forward pass,
    and publishes the result from rank 0 via the `output` queue. Blocks on
    `quit_event` so the process stays alive until the caller is done.
    """
    reset_random()

    gpt2_args = dict(
        num_layers=2,
        hidden_size=128,
        num_attention_heads=8,
        max_position_embeddings=128,
    )
    model = self.get_deepspeed_model(get_gpt2_model(gpt2_args, mp_size=resize), tmpdir)
    model.eval()

    with torch.no_grad():
        model.load_checkpoint(tmpdir,
                              tag=tag,
                              load_optimizer_states=False,
                              load_lr_scheduler_states=False)
        tokens, positions, mask = (t.cuda() for t in inputs[:3])
        result = model(tokens, positions, mask)
        # Only rank 0 reports; move to CPU so the tensor can cross processes.
        if dist.get_rank() == 0:
            output.put(result.cpu())

    quit_event.wait()
def _run_baseline(inputs, tag, output, quit_event):
    """Produce the reference output and save the checkpoint for resize tests.

    Builds a GPT-2 model with `mp_size=mp_size` (closure variable), runs one
    forward pass, publishes the result from rank 0 via the `output` queue,
    then saves a checkpoint tagged `tag` into `tmpdir`. Blocks on
    `quit_event` so the process stays alive until the caller is done.
    """
    reset_random()

    gpt2_args = dict(
        num_layers=2,
        hidden_size=128,
        num_attention_heads=8,
        max_position_embeddings=128,
    )
    model = self.get_deepspeed_model(get_gpt2_model(gpt2_args, mp_size=mp_size), tmpdir)
    model.eval()

    with torch.no_grad():
        tokens, positions, mask = (t.cuda() for t in inputs[:3])
        reference = model(tokens, positions, mask)
        # Only rank 0 reports; move to CPU so the tensor can cross processes.
        if dist.get_rank() == 0:
            output.put(reference.cpu())

        # Tag the checkpoint with the Megatron version so resized loads
        # can interpret the parameter layout.
        client_state = {'checkpoint_version': get_megatron_version()}
        model.save_checkpoint(tmpdir, tag=tag, client_state=client_state)

    quit_event.wait()