def _run():
    """Verify that eval output is unchanged by a checkpoint save/load round-trip.

    Builds an 8-layer GPT2 pipeline model, saves a checkpoint under the tag
    ``'pp_basic'``, evaluates one batch, reloads the checkpoint (skipping
    optimizer and LR-scheduler state), evaluates again, and asserts that the
    floating-point outputs of both evaluations agree within ``atol=1e-07``.

    Relies on names from the enclosing scope: ``self``, ``mp_size``,
    ``pp_size``, ``world_size`` and ``tmpdir``.
    """
    model_args = dict(
        num_layers=8,
        hidden_size=128,
        num_attention_heads=8,
        max_position_embeddings=128,
    )

    topology = self.get_topology(mp_size, pp_size, world_size)
    pipe_module = GPT2ModelPipe(
        num_layers=8,
        num_stages=pp_size,
        mp_size=mp_size,
        args_others=model_args,
        topo=topology,
    )
    engine = self.get_deepspeed_model(pipe_module, tmpdir)

    # Save first so the later reload restores exactly the weights that
    # produced the baseline output.
    tag = 'pp_basic'
    client_state = {'checkpoint_version': get_megatron_version()}
    engine.save_checkpoint(tmpdir, tag=tag, client_state=client_state)

    # Only the first and last pipeline stages are handed a data iterator.
    data_iter = None
    if engine.is_first_stage() or engine.is_last_stage():
        sample = self.get_inputs()
        data_iter = iter(RepeatingLoader([(sample[0], 0)]))

    baseline = engine.eval_batch(
        data_iter=data_iter,
        compute_loss=False,
        reduce_output=None,
    )

    dist.barrier()
    engine.load_checkpoint(
        tmpdir,
        tag=tag,
        load_optimizer_states=False,
        load_lr_scheduler_states=False,
    )
    dist.barrier()

    reloaded = engine.eval_batch(
        data_iter=data_iter,
        compute_loss=False,
        reduce_output=None,
    )

    # Nothing to compare when eval_batch returned no output here.
    if reloaded is None:
        return

    assert len(baseline) == len(reloaded)

    # Compare outputs of each microbatch, tensor by tensor.
    for baseline_mb, reloaded_mb in zip(baseline, reloaded):
        for expected, actual in zip(baseline_mb, reloaded_mb):
            # Non-floating-point tensors (masks) are not compared.
            if not expected.is_floating_point():
                continue
            assert torch.allclose(
                expected, actual, atol=1e-07
            ), f"Baseline output {baseline} is not equal to save-then-load output {reloaded}"
def _run_resize(inputs, tag, output, quit_event):
    """Load a saved checkpoint into a model built with the resized topology and evaluate it.

    Args:
        inputs: Sequence of tensors; each is moved to CUDA and the first two
            form the evaluation sample.
        tag: Checkpoint tag passed to ``load_checkpoint``.
        output: Queue-like object; the first output tensor (moved to CPU) is
            ``put`` on it when ``eval_batch`` returns a result.
        quit_event: Event-like object this function waits on before returning.

    Relies on names from the enclosing scope: ``self``, ``mp_resize``,
    ``pp_resize`` and ``tmpdir``.
    """
    reset_random()

    model_args = dict(
        num_layers=8,
        hidden_size=128,
        num_attention_heads=8,
        max_position_embeddings=128,
    )

    topology = self.get_topology(mp_resize, pp_resize, mp_resize * pp_resize)
    pipe_module = GPT2ModelPipe(
        num_layers=8,
        num_stages=pp_resize,
        mp_size=mp_resize,
        args_others=model_args,
        topo=topology,
    )
    engine = self.get_deepspeed_model(pipe_module, tmpdir)

    with torch.no_grad():
        engine.load_checkpoint(
            tmpdir,
            tag=tag,
            load_optimizer_states=False,
            load_lr_scheduler_states=False,
        )

        cuda_inputs = [tensor.cuda() for tensor in inputs]

        # Only the first and last pipeline stages are handed a data iterator.
        data_iter = None
        if engine.is_first_stage() or engine.is_last_stage():
            sample = ((cuda_inputs[0], cuda_inputs[1]), 0)
            data_iter = iter(RepeatingLoader([sample]))

        result = engine.eval_batch(
            data_iter=data_iter,
            compute_loss=False,
            reduce_output=None,
        )

        if result is not None:
            # result should be [[hidden, True]]: one microbatch holding the
            # hidden tensor and a single-element second entry.
            assert len(result) == 1
            microbatch = result[0]
            assert len(microbatch) == 2
            hidden = microbatch[0]
            assert torch.is_tensor(hidden)
            assert microbatch[1].numel() == 1
            output.put(hidden.cpu())

    # Block until the caller signals that this function may return.
    quit_event.wait()
def _run_baseline(inputs, tag, output, quit_event):
    """Evaluate a freshly built model on ``inputs`` and then save a checkpoint of it.

    Args:
        inputs: Sequence of tensors; each is moved to CUDA and the first two
            form the evaluation sample.
        tag: Checkpoint tag passed to ``save_checkpoint``.
        output: Queue-like object; the first output tensor (moved to CPU) is
            ``put`` on it when ``eval_batch`` returns a result.
        quit_event: Event-like object this function waits on before returning.

    Relies on names from the enclosing scope: ``self``, ``mp_size``,
    ``pp_size`` and ``tmpdir``.
    """
    reset_random()

    model_args = dict(
        num_layers=8,
        hidden_size=128,
        num_attention_heads=8,
        max_position_embeddings=128,
    )

    topology = self.get_topology(mp_size, pp_size, mp_size * pp_size)
    pipe_module = GPT2ModelPipe(
        num_layers=8,
        num_stages=pp_size,
        mp_size=mp_size,
        args_others=model_args,
        topo=topology,
    )
    engine = self.get_deepspeed_model(pipe_module, tmpdir)

    # NOTE(review): nesting was reconstructed from whitespace-collapsed source;
    # confirm the checkpoint save and the wait belong under no_grad.
    with torch.no_grad():
        cuda_inputs = [tensor.cuda() for tensor in inputs]

        # Only the first and last pipeline stages are handed a data iterator.
        data_iter = None
        if engine.is_first_stage() or engine.is_last_stage():
            sample = ((cuda_inputs[0], cuda_inputs[1]), 0)
            data_iter = iter(RepeatingLoader([sample]))

        result = engine.eval_batch(
            data_iter=data_iter,
            compute_loss=False,
            reduce_output=None,
        )

        if result is not None:
            # result should be [[hidden, True]]: one microbatch holding the
            # hidden tensor and a single-element second entry.
            assert len(result) == 1
            microbatch = result[0]
            assert len(microbatch) == 2
            hidden = microbatch[0]
            assert torch.is_tensor(hidden)
            assert microbatch[1].numel() == 1
            output.put(hidden.cpu())

        # The save is unconditional: it runs whether or not eval_batch
        # returned a result.
        client_state = {'checkpoint_version': get_megatron_version()}
        engine.save_checkpoint(tmpdir, tag=tag, client_state=client_state)

        # Block until the caller signals that this function may return.
        quit_event.wait()