def setup_model_and_optimizer(neox_args, inference=False, get_key_value=True):
    """Setup model and optimizer."""
    model = get_model(neox_args=neox_args,
                      inference=inference,
                      get_key_value=get_key_value)
    optimizer, param_groups = get_optimizer(model=model, neox_args=neox_args)
    lr_scheduler = get_learning_rate_scheduler(optimizer=optimizer,
                                               neox_args=neox_args)

    if neox_args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")
        if neox_args.no_load_optim:
            assert optimizer is None
            _model_params = None
            _lr_scheduler = None
        else:
            _model_params = param_groups if optimizer is None else None
            _lr_scheduler = lr_scheduler

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=neox_args,
            lr_scheduler=_lr_scheduler,
            dist_init_required=False,
            model_parameters=_model_params,
            config_params=neox_args.deepspeed_config,
            mpu=mpu if not neox_args.is_pipe_parallel else None)

        model.total_params = get_total_params(model.module)
        print_rank_0(f' > total params: {"{:,}".format(model.total_params)}')

        if neox_args.is_pipe_parallel:
            model.set_has_attention_mask(True)
            model.set_batch_fn(model.module._megatron_batch_fn)
    else:
        raise ValueError("Must be using deepspeed to run neox")

    if neox_args.load is not None:
        neox_args.iteration = load_checkpoint(neox_args=neox_args,
                                              model=model,
                                              optimizer=optimizer,
                                              lr_scheduler=lr_scheduler,
                                              inference=inference)
        print_rank_0(
            f'Loading checkpoint and starting from iteration {neox_args.iteration}'
        )
    else:
        neox_args.iteration = 0

    return model, optimizer, lr_scheduler

def _test(model, optimizer):
    engine, _, _, _ = deepspeed.initialize(model=model,
                                           optimizer=optimizer,
                                           config=config_dict)
    loss = torch.nn.BCEWithLogitsLoss()
    x = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9],
                     dtype=torch.long,
                     device=engine.device)
    offsets = torch.tensor([0, 4], dtype=torch.long, device=engine.device)
    y = torch.tensor([[1.0], [0.0]], device=engine.device)
    res = engine(x, offsets)
    with pytest.raises(AssertionError):
        engine.backward(loss(res, y))
        engine.step()

def _test_adam_fp16_zero_onecycle_compatibility(args, zero_stage, hidden_dim):
    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _go(model, hidden_dim):
    model, _, _, _ = deepspeed.initialize(model=model,
                                          model_parameters=model.parameters(),
                                          config=config_dict)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    dist.barrier()
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _test_dist_init_true(args, model, hidden_dim):
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters(),
                                          dist_init_required=True)
    data_loader = random_dataloader(model=model,
                                    total_samples=5,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def test_ext_param_return():
    setup_serial_env()

    net = DanglingExt()

    args = SimpleNamespace(local_rank=0)
    engine, optim, _, _ = deepspeed.initialize(args=args,
                                               model=net,
                                               model_parameters=net.parameters(),
                                               config=config)

    for _ in range(5):
        input = torch.rand(net.dim).to(engine.device).half()
        loss = engine(input)
        engine.backward(loss)
        engine.step()

def _test_partition_nccl_alignment(model, hidden_dim):
    # config_dict and zero_stage are presumably captured from the enclosing test scope.
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())

    # get nccl all-gather send buffers alignment factor
    nccl_start_alignment_factor = model.optimizer.nccl_start_alignment_factor

    parallel_partitioned_bit16_groups = (
        model.optimizer.parallel_partitioned_bit16_groups
        if zero_stage == 2 else model.optimizer.parallel_partitioned_fp16_groups)
    for data_parallel_partitions in parallel_partitioned_bit16_groups:
        for partition_id, partitioned_data in enumerate(data_parallel_partitions):
            # verify that data partition start locations are 4-byte aligned
            assert (partitioned_data.data_ptr() %
                    (2 * nccl_start_alignment_factor) == 0)

def _test_adamw_fp16_basic(args, model, hidden_dim, config_dict):
    optimizer = torch.optim.AdamW(params=model.parameters())
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          optimizer=optimizer,
                                          config_params=config_dict)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _test_fp16_adam_types(args, model, hidden_dim):
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=10,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for _, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _test_get_lr_before_train(args, model, hidden_dim):
    model, _, _, lr_scheduler = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.float)
    for n, batch in enumerate(data_loader):
        # get lr before training starts
        lr_scheduler.get_lr()
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _test_scheduler_optimizer_parity(args, model, hidden_dim):
    model, _, _, lr_scheduler = deepspeed.initialize(
        args=args, model=model, model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.float)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
        assert lr_scheduler.get_lr() == model.get_lr()

def _test_zero3_repeat_forward_loop(model, hidden_dim):
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=16,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for i, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _go(args, model, hidden_dim):
    optimizer = torch.optim.Adam(model.parameters())
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          optimizer=optimizer)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.bfloat16)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def checkpoint_correctness_verification(args,
                                        model,
                                        hidden_dim,
                                        load_optimizer_states=True):
    ds_model, _, _, _ = deepspeed.initialize(args=args,
                                             model=model,
                                             model_parameters=model.parameters())
    data_loader = random_dataloader(model=ds_model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=ds_model.device)
    for n, batch in enumerate(data_loader):
        loss = ds_model(batch[0], batch[1])
        ds_model.backward(loss)
        ds_model.step()

    trained_model = ds_model

    save_folder = 'saved_checkpoint'
    save_tag = '1'

    trained_model.save_checkpoint(save_folder, save_tag)

    loaded_model, _, _, _ = deepspeed.initialize(args=args,
                                                 model=model,
                                                 model_parameters=model.parameters())

    loaded_model.load_checkpoint(save_folder,
                                 save_tag,
                                 load_optimizer_states=load_optimizer_states)

    if load_optimizer_states:
        compare_optimizer_states(trained_model, loaded_model, hidden_dim)
    else:
        compare_model_states(trained_model, loaded_model)

def _test_curriculum_scheduler_fixed_linear(args, model, hidden_dim):
    # ground_truths is presumably a {step: expected_seqlen} dict captured from
    # the enclosing test scope.
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=20,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss, seqlen = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
        if n + 1 in ground_truths:
            true_seqlen = ground_truths[n + 1]
            print('at step {} the seqlen is {}'.format(n + 1, seqlen))
            assert seqlen == true_seqlen, "Incorrect curriculum schedule"

def test_already_init(self, dist_init_required):
    torch.distributed.init_process_group('nccl')
    model = SimpleModel(4)
    config_dict = {
        "train_micro_batch_size_per_gpu": 1,
        "optimizer": {
            "type": "Adam",
            "params": {}
        }
    }
    engine, *_ = deepspeed.initialize(model=model,
                                      config=config_dict,
                                      model_parameters=model.parameters(),
                                      dist_init_required=dist_init_required)

def test_overflow(self, tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01,
                "max_coeff": 0.3,
                "min_coeff": 0.01,
                "freeze_step": 2,
                "cuda_aware": False,
                "comm_backend_name": "nccl",
                "coeff_beta": 0.9,
                "factor_max": 1.0,
                "factor_min": 0.5,
                "factor_threshold": 0.1,
            },
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        },
    }
    hidden_dim = 10

    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=100,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    save_folder = os.path.join(tmpdir, "saved_checkpoint")
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        if dist.get_rank() == 0 and n >= 10:
            loss = loss * 1000000.0
        model.backward(loss)
        dist.barrier()
        model.step()
        dist.barrier()
        model.save_checkpoint(save_folder, tag=None)

def _test_flops_profiler_in_ds_training(args, model, hidden_dim):
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.half)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
        if n == 3:
            break
    assert model.flops_profiler.flops == 100
    assert model.flops_profiler.params == 110

def _test_onebitlamb_exp_avg_mask(args, model, hidden_dim):
    # optimizer_grouped_parameters and mask1 are presumably defined in the
    # enclosing test scope.
    model, optimizer, _, _ = deepspeed.initialize(
        args=args,
        model=model,
        model_parameters=optimizer_grouped_parameters)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
    # Test whether the momentum mask works
    for v in optimizer.state.values():
        if v['exp_avg'].size() == mask1.size():
            assert torch.allclose(
                v['exp_avg'],
                v['exp_avg'].mul_(mask1.to(device=v['exp_avg'].device)),
                atol=1e-07), "Momentum mask is not working properly"

def launch(self,
           model: torch.nn.Module,
           optimizer: Union[str, torch.optim.Optimizer],
           local_rank: int,
           serialization_dir: str,
           batch_size: int,
           gradient_accumulation_steps: int):
    path = self._to_temp_file(
        serialization_dir,
        train_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps)

    ds = deepspeed.initialize(args=self.build_deepspeed_args(path, local_rank),
                              model=model,
                              model_parameters=model.parameters(),
                              dist_init_required=False)
    os.remove(path)
    return ds

def test(self):
    if not bf16_required_version_check():
        pytest.skip(
            "DeepSpeed BFloat16 tests need torch >= 1.10, NCCL >= 2.10.3, "
            "CUDA >= 11.0 and HW support for BFloat16 to run correctly"
        )
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.00015
            }
        },
        "gradient_clipping": 1.0,
        "zero_optimization": {
            "stage": 2,
            "contiguous_gradients": True,
            "allgather_bucket_size": 2000000000,
            "reduce_bucket_size": 200000000,
            "overlap_comm": False,
            "reduce_scatter": False
        },
        "fp16": {
            "enabled": False
        },
        "bf16": {
            "enabled": True
        }
    }
    hidden_dim = 10

    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.bfloat16)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def _test_pld_model(args, model, hidden_dim, theta, gamma):
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for i, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

        expected_theta = (1. - theta) * np.exp(-gamma * i) + theta
        actual_theta = model.get_pld_theta()
        assert expected_theta == actual_theta

def test_ext_param_returnobj():
    setup_serial_env()
    print()
    net = ModelContainer(return_obj=True)

    args = SimpleNamespace(local_rank=0)
    engine, optim, _, _ = deepspeed.initialize(args=args,
                                               model=net,
                                               model_parameters=net.parameters(),
                                               config=config)

    for _ in range(5):
        input = torch.rand(net.dim).to(engine.device).half()
        loss = engine(input)
        assert len(net._external_params) == 1
        assert len(net.dangler._external_params) == 0
        engine.backward(loss)
        engine.step()

def get_deepspeed_model(self, model, tmpdir):
    ds_config_dict = {
        "train_micro_batch_size_per_gpu": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
    }

    dist.barrier()
    model, _, _, _ = deepspeed.initialize(model=model,
                                          model_parameters=model.parameters(),
                                          config=ds_config_dict)
    return model.cuda()

def _test_onebitlamb_checkpointing_overflow(args, model, hidden_dim):
    # tmpdir is presumably captured from the enclosing test scope.
    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=100,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    save_folder = os.path.join(tmpdir, 'saved_checkpoint')
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        if dist.get_rank() == 0 and n >= 10:
            loss = loss * 1000000.0
        model.backward(loss)
        dist.barrier()
        model.step()
        dist.barrier()
        model.save_checkpoint(save_folder, tag=None)

def _test_zero_empty_partition(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim)

    # Ensure model has 2 parameters, to cause empty partition with DP=3
    assert len(list(model.parameters())) == 2

    model, _, _, _ = deepspeed.initialize(args=args,
                                          model=model,
                                          model_parameters=model.parameters())

    # Now make sure things work..
    data_loader = random_dataloader(model=model,
                                    total_samples=1,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def __init__(self, config_filepath) -> None:
    super().__init__()
    self.config = Config(config_filepath)
    set_seed(self.config.seed)

    self.loader = SemEvalDataLoader(self.config)

    model = BertCls(self.config)
    optimiser = optim.Adam(model.parameters(), lr=self.config.lr)
    model_engine, optimizer, _, _ = deepspeed.initialize(
        config=self.config.__dict__,
        model=model,
        optimizer=optimiser,
        model_parameters=model.parameters())

    self.model_engine = model_engine
    self.model_engine = self.model_engine.to(self.config.device)
    self.optimiser = optimizer
    self.criterion = nn.BCELoss()

def _test_fused_all_overflow(args):
    hidden_dim = 1
    model = SimpleModel(hidden_dim)
    model, optim, _, _ = deepspeed.initialize(args=args,
                                              model=model,
                                              model_parameters=model.parameters())
    expected_loss_scale = 2**4

    # Ensure the dynamic loss scaler is correctly configured.
    assert optim.dynamic_loss_scale == True
    assert optim.cur_scale == expected_loss_scale

    overflow_gradients = [float('inf'), float('-inf')] + [float('nan')] * 6
    for i, value in enumerate(overflow_gradients):
        run_model_step(model, [value])
        expected_loss_scale = max(expected_loss_scale / 2, 1)
        assert optim.cur_scale == expected_loss_scale
        assert optim.cur_iter == (i + 1)

def test(self, dtype):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01,
                "max_coeff": 0.3,
                "min_coeff": 0.01,
                "freeze_step": 2,
                "cuda_aware": False,
                "comm_backend_name": "nccl",
                "coeff_beta": 0.9,
                "factor_max": 1.0,
                "factor_min": 0.5,
                "factor_threshold": 0.1,
            },
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": (dtype == torch.float16),
            "loss_scale": 0,
            "initial_scale_power": 16,
        },
    }
    hidden_dim = 10

    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=dtype)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

def test(self):
    config_dict = {
        "train_batch_size": 1,
        "steps_per_print": 1,
        "optimizer": {
            "type": "Adam",
            "params": {
                "lr": 0.001,
            }
        },
        "zero_optimization": {
            "stage": 0
        },
        "fp16": {
            "enabled": True,
        },
        "flops_profiler": {
            "enabled": True,
            "step": 1,
            "module_depth": -1,
            "top_modules": 3,
        },
    }
    hidden_dim = 10
    model = SimpleModel(hidden_dim, empty_grad=False)
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device,
                                    dtype=torch.half)
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
        if n == 3:
            break
    assert within_range(model.flops_profiler.flops, 200, tolerance=TOLERANCE)
    assert model.flops_profiler.params == 110