def _test_zero_allow_untested_optimizer(args):
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=True)
    optimizer = SimpleOptimizer(model.parameters())
    with pytest.raises(AssertionError):
        model, optim, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  optimizer=optimizer,
                                                  model_parameters=model.parameters())
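# A sketch of the DeepSpeed config that the test above assumes (the real dict
# behind `args` is not shown in this excerpt; the keys are standard DeepSpeed
# config options). With ZeRO enabled and `zero_allow_untested_optimizer` left
# at its default of False, passing an unrecognized client optimizer such as
# SimpleOptimizer makes deepspeed.initialize raise the expected AssertionError.
untested_optimizer_config = {
    'train_batch_size': 4,
    'fp16': {'enabled': True},
    'zero_optimization': {'stage': 1},
    'zero_allow_untested_optimizer': False,
}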
def _test_zero_supported_client_optimizer(args, zero_stage, optimizer_constructor):
    model = SimpleModel(hidden_dim)
    client_optimizer = optimizer_constructor(params=model.parameters())
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          optimizer=client_optimizer)
def test_client_optimizer(tmpdir, optimizer_type):
    def _optimizer_callable(params) -> Optimizer:
        return AdamW(params=params)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}
    if optimizer_type is None:
        client_optimizer = None
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_optimizer(args, model, client_optimizer):
        _, ds_optimizer, _, _ = deepspeed.initialize(args=args,
                                                     model=model,
                                                     model_parameters=list(model.parameters()),
                                                     optimizer=client_optimizer)
        if client_optimizer is None:
            assert isinstance(ds_optimizer, FusedAdam)
        elif isinstance(client_optimizer, Optimizer):
            assert ds_optimizer == client_optimizer
        else:
            assert isinstance(ds_optimizer, AdamW)

    _test_client_optimizer(args=args, model=model, client_optimizer=client_optimizer)
def _test_zero_empty_partition(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim)
    # Ensure model has 2 parameters, to cause empty partition with DP=3
    assert len(list(model.parameters())) == 2
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())

    # Now make sure things work..
    data_loader = random_dataloader(model=model,
                                    total_samples=1,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
def helper(args):
    model = SimpleModel(10)
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    # get base optimizer under zero
    ds_optimizer = model.optimizer.optimizer
    opt_class, adam_w_mode = resulting_optimizer
    assert isinstance(ds_optimizer, opt_class)
    if adam_w_mode in [True, False]:
        assert ds_optimizer.adam_w_mode == adam_w_mode
def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim):
    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
def _go(args):
    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=10,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for _, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
def _test_fused_all_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim, empty_grad=True)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**4

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale

    overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
    for i, value in enumerate(overflow_gradients):
        run_model_step(model, [value])
        expected_loss_scale = max(expected_loss_scale / 2, 1)
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == (i + 1)
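# The initial scale of 2**4 checked above comes from the fp16 section of the
# DeepSpeed config. The exact config behind `args` is not shown in this
# excerpt; a sketch using the standard fp16 keys:
fused_overflow_config = {
    'train_batch_size': 1,
    'fp16': {
        'enabled': True,
        'loss_scale': 0,           # 0 selects dynamic loss scaling
        'initial_scale_power': 4,  # initial scale = 2**4
    },
}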
def _go(hidden_dim):
    with deepspeed.zero.Init(enabled=zero_stage == 3, config_dict_or_path=ds_config):
        model = SimpleModel(hidden_dim, nlayers=78)

    print('total number of parameters:', sum([p.numel() for p in model.parameters()]))
    see_memory_usage('pre-init', force=True)

    model, _, _, _ = deepspeed.initialize(model=model, config=ds_config)
    see_memory_usage('post-init', force=True)

    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.half)
    print(f"optimizer={model.optimizer}")
    for batch in data_loader:
        model(batch[0], batch[1])
    see_memory_usage('post-fwds', force=True)
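# `ds_config` and `zero_stage` come from the enclosing test and are not shown
# in this excerpt. A sketch of a config that would exercise the ZeRO stage 3
# path above (values are illustrative, not necessarily those of the original):
ds_config = {
    'train_micro_batch_size_per_gpu': 1,
    'gradient_accumulation_steps': 1,
    'fp16': {'enabled': True},
    'zero_optimization': {'stage': 3},
    'optimizer': {'type': 'Adam', 'params': {'lr': 1e-3}},
}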
def _test_zero_static_scale(args):
    hidden_dim = 10
    model = SimpleModel(hidden_dim)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())

    # Ensure the static scaler is configured.
    assert optim.dynamic_loss_scale == False
    assert optim.loss_scaler.loss_scale == 138.

    # Now make sure things work..
    data_loader = random_dataloader(model=model,
                                    total_samples=10,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
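# The asserts above only hold with a matching fp16 section in the DeepSpeed
# config, which is not shown here. A sketch: any non-zero `loss_scale` selects
# static loss scaling with that value.
static_scale_config = {
    'train_batch_size': 1,
    'fp16': {
        'enabled': True,
        'loss_scale': 138,
    },
}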
def _test_unfused_no_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim, empty_grad=True)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**8
    expected_scale_window = 2

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale
    assert optim.scale_window == expected_scale_window

    for i, value in enumerate(np.random.uniform(-0.1, 0.1, 10)):
        run_model_step(model, [value])
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == (i + 1)
        if optim.cur_iter % expected_scale_window == 0:
            expected_loss_scale *= 2
def _test_unfused_some_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim, empty_grad=True)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**8
    expected_scale_window = 2
    expected_iteration = 0

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale
    assert optim.scale_window == expected_scale_window

    # Run model with overflows to decrease scale
    overflow_gradients = [float('inf'), float('nan')]
    expected_iteration += len(overflow_gradients)
    run_model_step(model, overflow_gradients)

    expected_loss_scale /= (2**len(overflow_gradients))
    assert optim.cur_scale == expected_loss_scale
    assert optim.cur_iter == expected_iteration

    # Run model scale_window + 1 times to increase scale once
    normal_gradients = np.random.uniform(-0.1, 0.1, expected_scale_window + 1)
    expected_iteration += len(normal_gradients)
    run_model_step(model, normal_gradients)

    expected_loss_scale *= 2
    assert optim.cur_scale == expected_loss_scale
    assert optim.cur_iter == expected_iteration

    # Run model with overflows to decrease scale
    overflow_gradients = [float('inf')]
    expected_iteration += len(overflow_gradients)
    run_model_step(model, overflow_gradients)

    expected_loss_scale /= (2**len(overflow_gradients))
    assert optim.cur_scale == expected_loss_scale
    assert optim.cur_iter == expected_iteration
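# The overflow tests above call run_model_step, which is not defined in this
# excerpt. A minimal sketch, assuming it fills every parameter's gradient with
# each supplied value and then takes an engine step so the loss scaler sees
# the (possibly overflowing) gradients:
def run_model_step(model, gradient_list):
    for value in gradient_list:
        for p in model.parameters():
            p.grad = torch.empty_like(p, dtype=p.dtype)
            p.grad.fill_(value)
        model.step()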
# x = nn.ReLU()(x)
# out = self.last_fc(x)
# return nn.Sigmoid()(out)

# Set device to be used
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device used:', device)

model_pytorch = SimpleModel(input_size=input_size,
                            hidden_sizes=hidden_sizes,
                            output_size=output_size)
model_pytorch = model_pytorch.to(device)

# Set loss and optimizer
# Set binary cross entropy loss since 2 classes only
criterion = nn.BCELoss()
optimizer = optim.Adam(model_pytorch.parameters(), lr=1e-3)

num_epochs = 20

# Train model
time_start = time.time()

for epoch in range(num_epochs):
    model_pytorch.train()
    train_loss_total = 0

    for data, target in train_loader:
        data, target = data.to(device), target.float().to(device)
        optimizer.zero_grad()
        output = model_pytorch(data)
def test_client_lr_scheduler(tmpdir, scheduler_type, optimizer_type):
    def _my_lambda(epoch):
        return epoch // 10

    def _optimizer_callable(params) -> Optimizer:
        return torch.optim.AdamW(params=params)

    def _lr_scheduler_callable(optimizer) -> _LRScheduler:
        return LambdaLR(optimizer, _my_lambda)

    hidden_dim = 10
    model = SimpleModel(hidden_dim)

    config_dict = {'train_batch_size': 1}
    client_optimizer = None
    client_scheduler = None

    if optimizer_type is None:
        config_dict['optimizer'] = {'type': ADAM_OPTIMIZER}
    elif optimizer_type is Optimizer:
        client_optimizer = torch.optim.Adam(model.parameters())
    else:
        client_optimizer = _optimizer_callable

    if scheduler_type is None:
        config_dict['scheduler'] = {'type': WARMUP_LR, 'params': {}}
    elif scheduler_type == _LRScheduler:
        if isinstance(client_optimizer, Optimizer):
            client_scheduler = LambdaLR(client_optimizer, _my_lambda)
        else:
            # Verify invalid combination is correctly handled
            client_scheduler = LambdaLR(torch.optim.Adam(model.parameters()), _my_lambda)
    else:
        client_scheduler = _lr_scheduler_callable

    args = args_from_dict(tmpdir, config_dict)

    @distributed_test(world_size=[1])
    def _test_client_lr_scheduler(args, model, optimizer, lr_scheduler):
        if isinstance(lr_scheduler, _LRScheduler) and not isinstance(optimizer, Optimizer):
            with pytest.raises(AssertionError):
                _, _, _, _ = deepspeed.initialize(args=args,
                                                  model=model,
                                                  model_parameters=list(model.parameters()),
                                                  optimizer=optimizer,
                                                  lr_scheduler=lr_scheduler)
        else:
            _, _, _, ds_lr_scheduler = deepspeed.initialize(args=args,
                                                            model=model,
                                                            model_parameters=list(model.parameters()),
                                                            optimizer=optimizer,
                                                            lr_scheduler=lr_scheduler)
            if lr_scheduler is None:
                assert isinstance(ds_lr_scheduler, WarmupLR)
            elif isinstance(lr_scheduler, _LRScheduler):
                assert ds_lr_scheduler == lr_scheduler
            else:
                assert isinstance(ds_lr_scheduler, LambdaLR)

    _test_client_lr_scheduler(args=args,
                              model=model,
                              optimizer=client_optimizer,
                              lr_scheduler=client_scheduler)