def test_repeating_loader():
    loader = [1, 2, 3]
    loader = RepeatingLoader(loader)

    for idx in range(50):
        assert next(loader) == 1
        assert next(loader) == 2
        assert next(loader) == 3

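# For reference, a minimal sketch of the behavior exercised above (illustrative
# only; the real RepeatingLoader ships with DeepSpeed and this class name is
# hypothetical). The wrapper cycles over a finite iterable indefinitely by
# re-creating the underlying iterator whenever it is exhausted, which is why
# next() keeps yielding 1, 2, 3 on every pass of the loop.
class _RepeatingLoaderSketch:
    def __init__(self, loader):
        self.loader = loader
        self.data_iter = iter(self.loader)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            return next(self.data_iter)
        except StopIteration:
            # Restart from the beginning once the wrapped iterable runs out.
            self.data_iter = iter(self.loader)
            return next(self.data_iter)
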
def _run():
    args_defaults = {
        'num_layers': 8,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }

    topo = self.get_topology(mp_size, pp_size, world_size)
    gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                    num_stages=pp_size,
                                    mp_size=mp_size,
                                    args_others=args_defaults,
                                    topo=topo)
    model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

    tag = 'pp_basic'
    state_dict = {}
    state_dict['checkpoint_version'] = get_megatron_version()
    model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)

    if model.is_first_stage() or model.is_last_stage():
        inputs = self.get_inputs()
        loader = RepeatingLoader([(inputs[0], 0)])
        data_iter = iter(loader)
    else:
        data_iter = None

    baseline = model.eval_batch(data_iter=data_iter,
                                compute_loss=False,
                                reduce_output=None)

    dist.barrier()
    model.load_checkpoint(tmpdir,
                          tag=tag,
                          load_optimizer_states=False,
                          load_lr_scheduler_states=False)
    dist.barrier()

    test = model.eval_batch(data_iter=data_iter,
                            compute_loss=False,
                            reduce_output=None)

    if test is not None:
        assert len(baseline) == len(test)
        # Compare outputs of each microbatch
        for mb in range(len(baseline)):
            for b, t in zip(baseline[mb], test[mb]):
                if b.is_floating_point():  # don't compare masks
                    assert torch.allclose(
                        b, t, atol=1e-07
                    ), f"Baseline output {baseline} is not equal to save-then-load output {test}"

def _run_resize(inputs, tag, output, quit_event):
    reset_random()
    args_defaults = {
        'num_layers': 8,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }

    topo = self.get_topology(mp_resize, pp_resize, mp_resize * pp_resize)
    gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                    num_stages=pp_resize,
                                    mp_size=mp_resize,
                                    args_others=args_defaults,
                                    topo=topo)
    model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

    with torch.no_grad():
        model.load_checkpoint(tmpdir,
                              tag=tag,
                              load_optimizer_states=False,
                              load_lr_scheduler_states=False)
        inputs = [x.cuda() for x in inputs]
        if model.is_first_stage() or model.is_last_stage():
            loader = RepeatingLoader([((inputs[0], inputs[1]), 0)])
            data_iter = iter(loader)
        else:
            data_iter = None

        test = model.eval_batch(data_iter=data_iter,
                                compute_loss=False,
                                reduce_output=None)

        if test is not None:
            # test should be [[hidden, True]]
            assert len(test) == 1
            assert len(test[0]) == 2
            assert torch.is_tensor(test[0][0])
            assert test[0][1].numel() == 1
            output.put(test[0][0].cpu())

    # Keep this rank alive until the parent process signals shutdown.
    quit_event.wait()

def _run_baseline(inputs, tag, output, quit_event):
    reset_random()
    args_defaults = {
        'num_layers': 8,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }

    topo = self.get_topology(mp_size, pp_size, mp_size * pp_size)
    gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                    num_stages=pp_size,
                                    mp_size=mp_size,
                                    args_others=args_defaults,
                                    topo=topo)
    model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

    with torch.no_grad():
        inputs = [x.cuda() for x in inputs]
        if model.is_first_stage() or model.is_last_stage():
            loader = RepeatingLoader([((inputs[0], inputs[1]), 0)])
            data_iter = iter(loader)
        else:
            data_iter = None

        baseline = model.eval_batch(data_iter=data_iter,
                                    compute_loss=False,
                                    reduce_output=None)

        if baseline is not None:
            # baseline should be [[hidden, True]]
            assert len(baseline) == 1
            assert len(baseline[0]) == 2
            assert torch.is_tensor(baseline[0][0])
            assert baseline[0][1].numel() == 1
            output.put(baseline[0][0].cpu())

        # Saving a checkpoint is collective across pipeline stages, so every rank
        # participates regardless of whether it produced an output above.
        state_dict = {}
        state_dict['checkpoint_version'] = get_megatron_version()
        model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)

    # Keep this rank alive until the parent process signals shutdown.
    quit_event.wait()

def train_base(args):
    torch.manual_seed(args.seed)

    # VGG also works :-)
    #net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)

    trainset = cifar_trainset(args.local_rank)

    engine, _, dataloader, __ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    dataloader = RepeatingLoader(dataloader)
    data_iter = iter(dataloader)

    rank = dist.get_rank()
    gas = engine.gradient_accumulation_steps()

    criterion = torch.nn.CrossEntropyLoss()

    # Each optimizer step consumes `gas` micro-batches.
    total_steps = args.steps * gas
    step = 0
    for micro_step in range(total_steps):
        batch = next(data_iter)
        inputs = batch[0].to(engine.device)
        labels = batch[1].to(engine.device)

        outputs = engine(inputs)
        loss = criterion(outputs, labels)
        engine.backward(loss)
        engine.step()

        if micro_step % gas == 0:
            step += 1
            if rank == 0 and (step % 10 == 0):
                print(f'step: {step:3d} / {args.steps:3d} loss: {loss}')

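# Illustrative sketch of the argument plumbing train_base() assumes. The helper
# name get_args and the default values are hypothetical, not taken from the
# source; only args.seed, args.local_rank, and args.steps are actually read
# above. deepspeed.add_config_arguments() attaches the standard DeepSpeed
# config flags so the script can be launched with the deepspeed CLI.
def get_args():
    import argparse
    parser = argparse.ArgumentParser(description='CIFAR training sketch')
    parser.add_argument('--local_rank', type=int, default=-1,
                        help='local rank passed in by the launcher')
    parser.add_argument('--steps', type=int, default=100,
                        help='number of optimizer steps to run')
    parser.add_argument('--seed', type=int, default=1138,
                        help='random seed for reproducibility')
    parser = deepspeed.add_config_arguments(parser)
    return parser.parse_args()
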
def _helper():
    base_model = copy.deepcopy(sequential_model)
    base_input = batch_input.clone().detach()
    base_output = base_model(base_input)
    base_params = sum(p.numel() for p in base_model.parameters())

    pipe_model = copy.deepcopy(sequential_model)
    pipe_model = PipelineModule(layers=pipe_model, num_stages=4)

    # Ensure all parameters are accounted for.
    my_params = sum(p.numel() for p in pipe_model.parameters())
    total_pipe_params = torch.LongTensor([my_params]).to('cuda')
    dist.all_reduce(total_pipe_params)
    total_pipe_params = total_pipe_params.item()
    assert total_pipe_params == base_params

    pipe_model, _, _, _ = deepspeed.initialize(
        args=simple_args,
        model=pipe_model,
        model_parameters=[p for p in pipe_model.parameters()])

    if pipe_model.is_first_stage() or pipe_model.is_last_stage():
        pipe_input = base_input.clone().detach().to('cuda')
        # label 0 is meaningless
        dataset = [(pipe_input, 0)]
        loader = RepeatingLoader(dataset)
        data_iter = iter(loader)
    else:
        data_iter = None

    pipe_output = pipe_model.eval_batch(data_iter=data_iter)

    base_output = base_output.to('cpu')
    pipe_output = pipe_output.to('cpu')
    assert torch.allclose(base_output, pipe_output)