def _dist_init(self, local_rank, num_procs, skip_msg):
    """Initialize deepspeed.comm and execute the user function. """
    if self.set_dist_env:
        os.environ['MASTER_ADDR'] = '127.0.0.1'
        os.environ['MASTER_PORT'] = get_master_port()
        os.environ['LOCAL_RANK'] = str(local_rank)
        # NOTE: unit tests don't support multi-node so local_rank == global rank
        os.environ['RANK'] = str(local_rank)
        os.environ['WORLD_SIZE'] = str(num_procs)

    # turn off NCCL logging if set
    os.environ.pop('NCCL_DEBUG', None)

    set_cuda_visibile()

    if self.init_distributed:
        deepspeed.init_distributed(dist_backend=self.backend)
        dist.barrier()

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    try:
        self.current_test(**self.test_kwargs)
    except BaseException as e:
        if isinstance(e, Skipped):
            skip_msg.put(e.msg)
        else:
            raise e

    if self.init_distributed or dist.is_initialized():
        # make sure all ranks finish at the same time
        dist.barrier()
        # tear down after test completes
        dist.destroy_process_group()
def dist_init(local_rank, num_procs, *func_args, **func_kwargs):
    """Initialize deepspeed.comm and execute the user function. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = get_master_port()
    os.environ['LOCAL_RANK'] = str(local_rank)
    # NOTE: unit tests don't support multi-node so local_rank == global rank
    os.environ['RANK'] = str(local_rank)
    os.environ['WORLD_SIZE'] = str(num_procs)

    # turn off NCCL logging if set
    os.environ.pop('NCCL_DEBUG', None)

    set_cuda_visibile()

    # `backend` and `run_func` are free variables captured from the enclosing
    # scope of this nested helper (they are not defined in this function).
    deepspeed.init_distributed(dist_backend=backend)
    #dist.init_process_group(backend=backend)
    dist.barrier()

    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)

    run_func(*func_args, **func_kwargs)

    # make sure all ranks finish at the same time
    dist.barrier()
    # tear down after test completes
    dist.destroy_process_group()
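For context, a minimal sketch of how a nested initializer like `dist_init` is typically driven from a test. The `distributed_test` decorator name and its `world_size` argument are assumptions based on DeepSpeed's older unit-test harness; they are not defined in the snippet above.

import torch
from deepspeed import comm as dist

# `distributed_test` is assumed to come from the same test harness module that
# defines dist_init(); it forks one process per rank, each of which runs
# dist_init(...) and then the decorated body with the process group already set up.
@distributed_test(world_size=2)
def test_world_size_and_allreduce():
    assert dist.get_world_size() == 2
    t = torch.ones(1).cuda() * dist.get_rank()
    dist.all_reduce(t)                # sums the per-rank values
    assert t.item() == sum(range(2))  # 0 + 1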
def _run(inputs):
    args_defaults = {
        'num_layers': 2,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }

    model = get_gpt2_model(args_defaults, mp_size=2)
    model = self.get_deepspeed_model(model, tmpdir)

    model.eval()
    baseline = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())

    tag = 'mp_2'
    state_dict = {}
    state_dict['checkpoint_version'] = get_megatron_version()
    model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)
    dist.barrier()
    model.load_checkpoint(tmpdir,
                          tag=tag,
                          load_optimizer_states=False,
                          load_lr_scheduler_states=False)

    test = model(inputs[0].cuda(), inputs[1].cuda(), inputs[2].cuda())
    assert torch.allclose(baseline, test, rtol=1.0, atol=1e-07), \
        f"Baseline output {baseline} is not equal to save-then-load output {test}"
def _run():
    args_defaults = {
        'num_layers': 8,
        'hidden_size': 128,
        'num_attention_heads': 8,
        'max_position_embeddings': 128,
    }

    topo = self.get_topology(mp_size, pp_size, world_size)
    gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                    num_stages=pp_size,
                                    mp_size=mp_size,
                                    args_others=args_defaults,
                                    topo=topo)
    model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

    tag = 'pp_basic'
    state_dict = {}
    state_dict['checkpoint_version'] = get_megatron_version()
    model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)

    if model.is_first_stage() or model.is_last_stage():
        inputs = self.get_inputs()
        loader = RepeatingLoader([(inputs[0], 0)])
        data_iter = iter(loader)
    else:
        data_iter = None

    baseline = model.eval_batch(data_iter=data_iter,
                                compute_loss=False,
                                reduce_output=None)

    dist.barrier()
    model.load_checkpoint(tmpdir,
                          tag=tag,
                          load_optimizer_states=False,
                          load_lr_scheduler_states=False)
    dist.barrier()

    test = model.eval_batch(data_iter=data_iter,
                            compute_loss=False,
                            reduce_output=None)

    if test is not None:
        assert len(baseline) == len(test)
        # Compare outputs of each microbatch
        for mb in range(len(baseline)):
            for b, t in zip(baseline[mb], test[mb]):
                if b.is_floating_point():  # don't compare masks
                    assert torch.allclose(b, t, atol=1e-07), \
                        f"Baseline output {baseline} is not equal to save-then-load output {test}"
def _go(model, hidden_dim):
    model, _, _, _ = deepspeed.initialize(model=model,
                                          model_parameters=model.parameters(),
                                          config=config_dict)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    dist.barrier()
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()
def test_overflow(self, tmpdir):
    config_dict = {
        "train_batch_size": 2,
        "steps_per_print": 1,
        "optimizer": {
            "type": "OneBitLamb",
            "params": {
                "lr": 0.00015,
                "weight_decay": 0.01,
                "max_coeff": 0.3,
                "min_coeff": 0.01,
                "freeze_step": 2,
                "cuda_aware": False,
                "comm_backend_name": "nccl",
                "coeff_beta": 0.9,
                "factor_max": 1.0,
                "factor_min": 0.5,
                "factor_threshold": 0.1,
            },
        },
        "gradient_clipping": 1.0,
        "fp16": {
            "enabled": True,
            "loss_scale": 0,
            "initial_scale_power": 16
        },
    }
    hidden_dim = 10

    model = SimpleModel(hidden_dim)
    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=100,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    save_folder = os.path.join(tmpdir, "saved_checkpoint")
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        if dist.get_rank() == 0 and n >= 10:
            loss = loss * 1000000.0
        model.backward(loss)
        dist.barrier()
        model.step()
        dist.barrier()
        model.save_checkpoint(save_folder, tag=None)
def get_deepspeed_model(self, model, tmpdir):
    ds_config_dict = {
        "train_micro_batch_size_per_gpu": 1,
        "optimizer": {
            "type": "Lamb",
            "params": {
                "lr": 0.00015
            }
        },
    }

    dist.barrier()
    model, _, _, _ = deepspeed.initialize(model=model,
                                          model_parameters=model.parameters(),
                                          config=ds_config_dict)
    return model.cuda()
def _go(hidden_dim):
    with deepspeed.zero.Init():
        model = MyModel(hidden_dim)

    model, _, _, _ = deepspeed.initialize(model=model,
                                          model_parameters=model.parameters(),
                                          config=config_dict)
    data_loader = random_dataloader(model=model,
                                    total_samples=50,
                                    hidden_dim=hidden_dim,
                                    device=model.device)
    dist.barrier()
    for n, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        if return_type == dict:
            loss = loss['loss']
        else:
            loss = loss[1]
        model.backward(loss)
        model.step()
def cifar_trainset(fp16=False):
    torchvision = pytest.importorskip("torchvision", minversion="0.5.0")
    import torchvision.transforms as transforms

    transform_list = [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
    if fp16:
        transform_list.append(torchvision.transforms.Lambda(cast_to_half))

    transform = transforms.Compose(transform_list)

    local_rank = torch.cuda.current_device()

    # Only one rank per machine downloads.
    dist.barrier()
    if local_rank != 0:
        dist.barrier()
    trainset = torchvision.datasets.CIFAR10(root='/tmp/cifar10-data',
                                            train=True,
                                            download=True,
                                            transform=transform)
    if local_rank == 0:
        dist.barrier()
    return trainset
def check_using_norm(self, norm_group, reduce_overflow=True):
    # TODO: I don't think reduce_overflow is needed if mpu is None
    overflow = -1 in norm_group
    overflow_gpu = torch.cuda.FloatTensor([overflow])
    if self.has_moe_params:
        # In this case, we need to do an all_reduce across
        # the expert_parallel_group, so that if there was
        # an overflow due to expert weights, we detect it
        # Only need to check groups.get_largest_expert_parallel_group()
        dist.all_reduce(overflow_gpu,
                        op=dist.ReduceOp.MAX,
                        group=groups._get_max_expert_parallel_group())
    if self.mpu is not None:
        dist.all_reduce(overflow_gpu,
                        op=dist.ReduceOp.MAX,
                        group=self.mpu.get_model_parallel_group())
    elif reduce_overflow:
        dist.all_reduce(overflow_gpu, op=dist.ReduceOp.MAX)
        dist.barrier()
    overflow = overflow_gpu[0].item()
    return bool(overflow)
def torch_sim(a):
    a_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    a_compressed = scale * a_sign
    a_sign = None
    worker_error = a - a_compressed
    dist.all_reduce(a_compressed)
    a_compressed.mul_(1 / dist.get_world_size())
    a_server_sign = a_compressed.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    a_list = torch.chunk(a_compressed, chunks=dist.get_world_size())
    server_scale = [
        chunk_a.norm() / np.sqrt(chunk_a.numel()) for chunk_a in a_list
    ]
    a_sign_list = torch.chunk(a_server_sign, dist.get_world_size())
    a_server_compressed = torch.cat([
        server_scale[i] * a_sign_list[i] for i in range(dist.get_world_size())
    ])
    rank = dist.get_rank()
    server_error = a_list[rank] - server_scale[rank] * a_sign_list[rank]
    torch.cuda.synchronize()
    dist.barrier()
    return a_server_compressed, worker_error, server_error
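A small hedged sanity check (not part of the original tests) of the error-feedback property that `torch_sim` relies on: the locally compressed tensor plus the worker error reconstructs the input exactly. The helper name below is hypothetical, and it must run under an initialized process group because `torch_sim` calls `dist.all_reduce` internally.

import numpy as np
import torch

def check_worker_error(size=1024):
    # random input centered around zero, as in the 1-bit compression tests
    a = torch.rand(size, device=torch.cuda.current_device()) - 0.5
    _, worker_error, _ = torch_sim(a.clone())
    # same "zero maps to +1" sign convention used inside torch_sim
    binary_sign = a.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)
    scale = a.norm() / np.sqrt(a.numel())
    # error feedback identity: a == scale * binary_sign(a) + worker_error
    assert torch.allclose(a, scale * binary_sign + worker_error, atol=1e-6)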
def replace_transformer_layer(orig_layer_impl,
                              model,
                              policy=None,
                              micro_batch_size=-1,
                              config=None,
                              seed=-1,
                              hidden_size=-1,
                              num_attention_heads=-1,
                              mp_size=1,
                              training_mp_size=1,
                              mp_group=None,
                              ep_group=None,
                              expert_mp_group=None,
                              fp16=True,
                              local_rank=-1,
                              stochastic_mode=True,
                              training=True,
                              quantize=False,
                              quantize_settings=None,
                              triangular_masking=False,
                              return_tuple=True,
                              replace_with_kernel_inject=False,
                              linear_layer_setting=None,
                              moe=False,
                              moe_experts=1,
                              moe_type='standard',
                              checkpoint_dict=None,
                              save_mp_checkpoint_path=None):
    """ Replace bert-style transformer layers with DeepSpeed's transformer layer
    Arguments:
        orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
            e.g., transformers.modeling_bert.BertLayer.
        model (torch.nn.Module): user's nn.module representing their model
        policy: shows the policy for mapping from the orig_layer_impl to transformer parameters when
            replace_with_kernel_inject is set, otherwise, it provides the names of two linear layers as
            a tuple: (attention_output projection, transformer output projection)
        micro_batch_size (int): micro batch size per gpu used during training/eval
        config (dict): model config containing hidden size, attention heads, etc.
        seed (int): random seed value
        max_seq_length (int): max sequence length for training
        hidden_size (int): hidden dimension
        num_attention_heads (int): number of attention heads
        mp_size (int): model_parallelism degree
        mp_group : model_parallel group initialized on the modeling side
        preln (bool): does the original layer implementation do pre or post layer norm?
        fp16 (bool): fp16 or fp32
        local_rank (int): GPU rank (optional),
        stochastic_mode (bool): whether to use stochastic mode
        training (bool): specifying whether kernel-injection is done for training/inference
            (set to false for inference-mode injection)
        quantize_settings (tuple): this setting shows how we can quantize a model for running it through the
            inference kernels. It includes (quantization_scales, merge_count, mlp_extra_grouping, quantize_groups).
        return_tuple (bool): if set, transformer layer returns a tuple as the output.
            Note: this flag needs to be set for huggingface models.
        replace_with_kernel_inject (bool): injection_mode, if true, kernels will be add along with
            configuring Tensor-Parallelism
        linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear
            layers and embedding layers
        attention_params: (list of strings) [Optional]: shows the parameters in the attention part that
            needs to be adjusted based on the model-parallelism
    Returns:
        Updated nn.module with replaced transformer layers
    """
    mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group,
                                          mp_size=mp_size)  #, out_dim=0, in_dim=1)

    def replace_with_policy(child,
                            policy_cls,
                            triangular_masking,
                            inference=False,
                            layer_id=0):
        policy = policy_cls(child, inference=inference)

        if inference:
            hidden_size, num_attention_heads = policy.get_hidden_heads()
            assert num_attention_heads % mp_size == 0,\
                "To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
                "This is because the attention computation is partitioned evenly among the parallel GPUs."

        from deepspeed.moe.layer import MoE
        moe = False
        if hasattr(child, 'mlp') and isinstance(child.mlp, MoE):
            num_experts = child.mlp.num_experts
            moe = True

        attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention, megatron_v2 = policy.attention()
        if not moe or moe_type == 'standard':
            mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp()
        else:
            mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b, \
                _res_h4h_w, _res_h4h_b, _res_4hh_w, _res_4hh_b, _res_coef = policy.mlp(moe_type)

        attn_nw, attn_nb, input_nw, input_nb = policy.layerNorm()

        if quantize:
            if policy_cls is not HFBertLayerPolicy:
                qkvw = qkvw.to(torch.int8)
            dense_w = dense_w.to(torch.int8)
            _h4h_w = [moe_w1.to(torch.int8)
                      for moe_w1 in _h4h_w] if moe else _h4h_w.to(torch.int8)
            _4hh_w = [moe_w1.to(torch.int8)
                      for moe_w1 in _4hh_w] if moe else _4hh_w.to(torch.int8)
        elif fp16:
            qkvw = qkvw.half()
            dense_w = dense_w.half()
            _h4h_w = [moe_w1.half() for moe_w1 in _h4h_w] if moe else _h4h_w.half()
            _4hh_w = [moe_w1.half() for moe_w1 in _4hh_w] if moe else _4hh_w.half()
        if quantize or fp16:
            qkvb = qkvb if qkvb is None else qkvb.half()
            dense_b = dense_b if dense_b is None else dense_b.half()
            _h4h_b = [moe_b1.half() for moe_b1 in _h4h_b] if moe else _h4h_b.half()
            _4hh_b = [moe_b1.half() for moe_b1 in _4hh_b] if moe else _4hh_b.half()
            attn_nw = attn_nw if attn_nw is None else attn_nw.half()
            attn_nb = attn_nb if attn_nb is None else attn_nb.half()
            input_nw = input_nw.half()
            input_nb = input_nb.half()

        if moe and moe_type == 'residual' and fp16:
            _res_h4h_b = _res_h4h_b.half()
            _res_4hh_b = _res_4hh_b.half()
            _res_h4h_w = _res_h4h_w.half()
            _res_4hh_w = _res_4hh_w.half()
            _res_coef = _res_coef.half()

        #expert_mp_replace = ReplaceWithTensorSlicing(mp_group=expert_mp_group)

        if inference:
            if moe:
                ep_world_size = dist.get_world_size()
                local_ep_size = 1 if num_experts < ep_world_size else num_experts // ep_world_size

                transformer_config = transformer_inference.DeepSpeedMoEInferenceConfig(
                    hidden_size=hidden_size,
                    heads=num_attention_heads,
                    layer_norm_eps=config.layer_norm_eps if hasattr(
                        config,
                        'layer_norm_eps') else 1e-12,
                    fp16=fp16,
                    pre_layer_norm=policy.pre_attn_norm,
                    mp_size=mp_size,
                    q_int8=quantize,
                    moe_experts=local_ep_size,
                    global_experts=num_experts,
                    mlp_type=moe_type)
            else:
                rotary_dim = config.rotary_dim if hasattr(config, 'rotary_dim') else child.attention.rotary_ndims \
                    if hasattr(child, 'attention') and hasattr(child.attention, 'rotary_ndims') else -1
                bigscience_bloom = policy_cls is BLOOMLayerPolicy
                transformer_config = transformer_inference.DeepSpeedInferenceConfig(
                    hidden_size=hidden_size,
                    heads=num_attention_heads,
                    layer_norm_eps=config.layer_norm_eps if hasattr(
                        config,
                        'layer_norm_eps') else
                    (config.layer_norm_epsilon
                     if hasattr(config, 'layer_norm_epsilon') else
                     config.layernorm_epsilon
                     if hasattr(config, 'layernorm_epsilon') else 1.0e-12),
                    fp16=fp16,
                    pre_layer_norm=policy.pre_attn_norm,
                    mp_size=mp_size,
                    q_int8=quantize,
                    return_tuple=(return_tuple or (policy_cls is HFBertLayerPolicy)),
                    triangular_masking=(policy_cls is not HFBertLayerPolicy),
                    local_attention=((config.attention_layers[layer_id] == "local")
                                     if hasattr(config,
                                                'attention_layers') else False),
                    window_size=(config.window_size if hasattr(config,
                                                               'window_size') else 1),
                    rotary_dim=rotary_dim,
                    mlp_after_attn=(rotary_dim is None or rotary_dim < 0),
                    mlp_act_func_type=policy.mlp_act_func_type,
                    training_mp_size=training_mp_size,
                    bigscience_bloom=bigscience_bloom)

            if quantize and quantize_settings is not None:
                (quantization_scales,
                 merge_count,
                 mlp_extra_grouping,
                 quantize_groups) = quantize_settings
                if moe:
                    new_module = transformer_inference.DeepSpeedMoEInference(
                        transformer_config,
                        mp_group=mp_group,
                        ep_group=None if ep_group is None else ep_group[num_experts],
                        expert_mp_group=None
                        if expert_mp_group is None else expert_mp_group[num_experts],
                        quantize_scales=quantization_scales[layer_id],
                        quantize_groups=quantize_groups,
                        merge_count=merge_count,
                        mlp_extra_grouping=mlp_extra_grouping,
                        qkv_merging=(policy_cls is HFBertLayerPolicy))
                else:
                    new_module = transformer_inference.DeepSpeedTransformerInference(
                        transformer_config,
                        mp_group=mp_group,
                        quantize_scales=quantization_scales[layer_id],
                        quantize_groups=quantize_groups,
                        merge_count=merge_count,
                        mlp_extra_grouping=mlp_extra_grouping,
                        qkv_merging=(policy_cls is HFBertLayerPolicy))

                if quantize and qkvw.dtype != torch.int8:
                    quantize_bits = 8
                    quantizer = WeightQuantization()
                    if policy_cls is HFBertLayerPolicy:
                        data_quantized, _ = quantizer.quantize_data(
                            qkvw.data, quantize_bits, quantize_groups * 3)
                    else:
                        data_quantized, _ = quantizer.quantize_data(
                            qkvw.data, quantize_bits, quantize_groups)
                    qkvw.data.copy_(data_quantized)
                    qkvw.data = qkvw.data.to(torch.int8)
            else:
                if moe:
                    new_module = transformer_inference.DeepSpeedMoEInference(
                        transformer_config,
                        mp_group=mp_group,
                        ep_group=None if ep_group is None else ep_group[num_experts],
                        expert_mp_group=None
                        if expert_mp_group is None else expert_mp_group[num_experts],
                    )
                else:
                    new_module = transformer_inference.DeepSpeedTransformerInference(
                        transformer_config,
                        mp_group=mp_group,
                    )
            new_module.config.scale_attention = scale_attention

            # we want the weights in [input, output] shape
            # linear layer is created with [input, output] shape
            # transpose it here to reduce inference cost!
            def transpose(data):
                # temp move to cpu to avoid requiring extra GPU memory during the reshape
                data = data.to('cpu')
                data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
                data = data.reshape(data.shape[-1], data.shape[-2])
                data.to(torch.cuda.current_device())
                return data

            attn_block = new_module.attention
            mpl_block = new_module.mlp

            if attn_linear_layer:
                if qkvw.numel() == 0 or qkvw.is_meta:
                    if qkvw.is_meta or qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel():
                        pass
                    else:
                        with GatheredParameters([qkvw, dense_w, qkvb, dense_b],
                                                modifier_rank=0):
                            qkvw = transpose(qkvw.data)
                            dense_w = transpose(dense_w.data)
                            qkvb = qkvb.data
                            dense_b = dense_b.data
                else:
                    qkvw.data = transpose(qkvw.data)
                    dense_w.data = transpose(dense_w.data)

            def _transpose(x):
                num_attention_heads_per_partition = transformer_config.heads // transformer_config.mp_size
                attention_head_size = x.shape[-1] // num_attention_heads_per_partition
                new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition,
                                               attention_head_size)
                x_1 = x.view(*new_x_shape)
                (q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=(x_1.dim() - 1))
                if len(q.shape) > 2:
                    return torch.cat((q.reshape(q.shape[0], -1),
                                      k.reshape(q.shape[0], -1),
                                      v.reshape(q.shape[0], -1)),
                                     dim=-1).reshape(x.shape)
                else:
                    return torch.cat((q.reshape(-1), k.reshape(-1), v.reshape(-1)),
                                     dim=-1).reshape(x.shape)

            if megatron_v2:
                new_module.config.rotate_half = True
                new_module.config.rotate_every_two = False

                # Note: this part needs to be added for BLOOM architecture
                qkvw = torch.nn.parameter.Parameter(_transpose(qkvw).contiguous())
                qkvb = torch.nn.parameter.Parameter(_transpose(qkvb).contiguous())

            # NOTE: This part caused instability in the multi-GPU inference!
            # TODO: This needs to be incorporated in the kernels.
            #dense_b = dense_b if dense_b is None else dense_b * (
            #    transformer_config.training_mp_size / transformer_config.mp_size)
            #_4hh_b = _4hh_b * (transformer_config.training_mp_size /
            #                   transformer_config.mp_size)

            if mlp_linear_layer:
                if not moe and (_4hh_w.numel() == 0 or _4hh_w.is_meta):
                    if _4hh_w.is_meta or _4hh_w.ds_tensor.numel() < mpl_block.inter_w.numel():
                        pass
                    else:
                        with GatheredParameters([_h4h_w, _4hh_w, _4hh_b, _h4h_b],
                                                modifier_rank=0):
                            _h4h_w = transpose(_h4h_w.data)
                            _4hh_w = transpose(_4hh_w.data)
                            _h4h_b = _h4h_b.data
                            _4hh_b = _4hh_b.data
                else:
                    _h4h_w = [transpose(moe_w1.data)
                              for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data)
                    _4hh_w = [transpose(moe_w1.data)
                              for moe_w1 in _4hh_w] if moe else transpose(_4hh_w.data)

            if moe and moe_type == 'residual':
                _res_h4h_w.data = transpose(_res_h4h_w.data)
                _res_4hh_w.data = transpose(_res_4hh_w.data)
                _res_coef.data = transpose(_res_coef.data)

            if qkvw.is_meta or qkvw.numel() == 0 or qkvw.is_meta:
                if qkvw.is_meta or qkvw.ds_tensor.numel() < attn_block.attn_qkvw.numel():
                    pass
                else:
                    with GatheredParameters([
                            attn_block.attn_qkvw,
                            attn_block.attn_qkvb,
                            attn_block.attn_ow,
                            attn_block.attn_ob
                    ],
                                            modifier_rank=0):
                        attn_block.attn_qkvw = mp_replace.copy(
                            attn_block.attn_qkvw, qkvw)
                        attn_block.attn_qkvb = mp_replace.copy(
                            attn_block.attn_qkvb, qkvb)
                        attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w)
                        attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b)
            else:
                if bigscience_bloom:
                    attn_block.attn_qkvw = mp_replace.copy(attn_block.attn_qkvw, qkvw)
                    attn_block.attn_qkvb = mp_replace.copy(attn_block.attn_qkvb, qkvb)
                else:
                    attn_block.attn_qkvw = mp_replace.qkv_copy(
                        attn_block.attn_qkvw, qkvw)
                    attn_block.attn_qkvb = mp_replace.qkv_copy(
                        attn_block.attn_qkvb, qkvb)

                attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w)
                attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b)

            if moe:
                gpu_index = dist.get_rank()
                gpu_index = 0
                for ep_index in range(local_ep_size):
                    mpl_block[ep_index].inter_w.data = _h4h_w[
                        gpu_index * local_ep_size + ep_index].to(
                            torch.cuda.current_device())
                    mpl_block[ep_index].inter_b.data = _h4h_b[
                        gpu_index * local_ep_size + ep_index].to(
                            torch.cuda.current_device())
                    mpl_block[ep_index].output_w.data = _4hh_w[
                        gpu_index * local_ep_size + ep_index].to(
                            torch.cuda.current_device())
                    mpl_block[ep_index].output_b.data = _4hh_b[
                        gpu_index * local_ep_size + ep_index].to(
                            torch.cuda.current_device())
                new_module.attn_nw.data = attn_nw.to(torch.cuda.current_device())
                new_module.attn_nb.data = attn_nb.to(torch.cuda.current_device())
                if moe_type == 'residual':
                    new_module.res_mlp.inter_w.data = _res_h4h_w.to(
                        torch.cuda.current_device())
                    new_module.res_mlp.inter_b.data = _res_h4h_b.to(
                        torch.cuda.current_device())
                    new_module.res_mlp.output_w.data = _res_4hh_w.to(
                        torch.cuda.current_device())
                    new_module.res_mlp.output_b.data = _res_4hh_b.to(
                        torch.cuda.current_device())
                    new_module.res_coef.data = _res_coef.to(torch.cuda.current_device())
            else:
                if _4hh_w.numel() == 0 or _4hh_w.is_meta:
                    if _4hh_w.is_meta or _4hh_w.ds_tensor.numel() < mpl_block.inter_w.numel():
                        pass
                    else:
                        with GatheredParameters([_h4h_w, _4hh_w, _4hh_w, _4hh_b],
                                                modifier_rank=0):
                            mpl_block.inter_w = mp_replace.copy(
                                mpl_block.inter_w, _h4h_w)
                            mpl_block.inter_b = mp_replace.copy(
                                mpl_block.inter_b, _h4h_b)
                            mpl_block.output_w = mp_replace.copy(
                                mpl_block.output_w, _4hh_w)
                            mpl_block.output_b = mp_replace.copy(
                                mpl_block.output_b, _4hh_b)
                else:
                    mpl_block.inter_w = mp_replace.copy(mpl_block.inter_w, _h4h_w)
                    mpl_block.inter_b = mp_replace.copy(mpl_block.inter_b, _h4h_b)
                    mpl_block.output_w = mp_replace.copy(mpl_block.output_w, _4hh_w)
                    mpl_block.output_b = mp_replace.copy(mpl_block.output_b, _4hh_b)

                if attn_nw is None:
                    new_module.mlp.attn_nw = attn_nw
                    new_module.mlp.attn_nb = attn_nb
                else:
                    if attn_nw.is_meta or attn_nw.numel() == 0:
                        if attn_nw.is_meta or attn_nw.ds_tensor.numel() < new_module.mlp.attn_nw.numel():
                            pass
                        else:
                            with GatheredParameters([attn_nw, attn_nb],
                                                    modifier_rank=0):
                                new_module.mlp.attn_nw.data.copy_(
                                    attn_nw.to(torch.cuda.current_device()))
                                new_module.mlp.attn_nb.data.copy_(
                                    attn_nb.to(torch.cuda.current_device()))
                    else:
                        new_module.mlp.attn_nw.data.copy_(
                            attn_nw.to(torch.cuda.current_device()))
                        new_module.mlp.attn_nb.data.copy_(
                            attn_nb.to(torch.cuda.current_device()))

            if input_nw.is_meta or input_nw.numel() == 0:
                if input_nw.is_meta or input_nw.ds_tensor.numel() < new_module.norm_w.numel():
                    pass
                else:
                    with GatheredParameters([input_nw, input_nb], modifier_rank=0):
                        new_module.norm_w.data.copy_(
                            input_nw.to(torch.cuda.current_device()))
                        new_module.norm_b.data.copy_(
                            input_nb.to(torch.cuda.current_device()))
            else:
                new_module.norm_w.data.copy_(input_nw.to(torch.cuda.current_device()))
                new_module.norm_b.data.copy_(input_nb.to(torch.cuda.current_device()))
        else:
            transformer_config = deepspeed.DeepSpeedTransformerConfig(
                batch_size=micro_batch_size if micro_batch_size > 0 else 1,
                hidden_size=config.hidden_size,
                heads=config.num_attention_heads,
                attn_dropout_ratio=config.attention_probs_dropout_prob,
                hidden_dropout_ratio=config.hidden_dropout_prob,
                num_hidden_layers=config.num_hidden_layers,
                initializer_range=config.initializer_range,
                layer_norm_eps=config.layer_norm_eps if hasattr(
                    config,
                    'layer_norm_eps') else 1e-12,
                seed=seed,
                fp16=fp16,
                pre_layer_norm=policy.pre_attn_norm,
                return_tuple=return_tuple,
                local_rank=local_rank,
                stochastic_mode=stochastic_mode,
                normalize_invertible=True,
                training=training)
            new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config)
            new_module.attn_qkvw.data = qkvw
            new_module.attn_qkvb.data = qkvb
            new_module.attn_ow.data = dense_w
            new_module.attn_ob.data = dense_b

            new_module.attn_nw.data = attn_nw
            new_module.attn_nb.data = attn_nb
            new_module.norm_w.data = input_nw
            new_module.norm_b.data = input_nb

            new_module.inter_w.data = _h4h_w
            new_module.inter_b.data = _h4h_b
            new_module.output_w.data = _4hh_w
            new_module.output_b.data = _4hh_b
        return new_module

    def replace_wo_policy(module, all_reduce_linears):
        def _replace(child, name, conv_linear_layer):
            mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
            z_inference = (len(list(child.parameters())) > 0) and (list(
                child.parameters())[0].numel() == 0)
            if z_inference:
                weight_shape = child.weight.ds_shape
            else:
                weight_shape = child.weight.shape
            if name in all_reduce_linears:
                new_weight = torch.empty((
                    weight_shape[1] if conv_linear_layer else weight_shape[0],
                    (weight_shape[0] if conv_linear_layer else weight_shape[1]) //
                    mp_size,
                ),
                                         device=child.weight.device,
                                         dtype=child.weight.dtype)
                if z_inference:
                    with deepspeed.zero.GatheredParameters(child.weight,
                                                           modifier_rank=0):
                        data = child.weight.data.to(new_weight.device)
                        if conv_linear_layer:
                            data = data.transpose(-1, -2).contiguous()
                        data = mp_replace.copy(new_weight, data)
                    child.weight.ds_tensor = torch.empty(1)
                else:
                    if conv_linear_layer:
                        child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
                    data = mp_replace.copy(new_weight, child.weight.data)

                new_bias = torch.empty((weight_shape[0]),
                                       device=child.weight.device,
                                       dtype=child.weight.dtype)
                if z_inference:
                    with deepspeed.zero.GatheredParameters(child.bias,
                                                           modifier_rank=0):
                        new_bias.data.copy_(child.bias.data)
                elif child.bias is not None:
                    new_bias.data.copy_(child.bias.data)
                return LinearAllreduce(data, child.bias if child.bias is None else \
                            torch.nn.parameter.Parameter(new_bias.to(torch.cuda.current_device())), mp_group)
            else:
                new_weight = torch.empty((
                    (weight_shape[1] if conv_linear_layer else weight_shape[0]) //
                    mp_size,
                    weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1],
                ),
                                         device=child.weight.device,
                                         dtype=child.weight.dtype)
                if z_inference:
                    with deepspeed.zero.GatheredParameters(child.weight,
                                                           modifier_rank=0):
                        data = child.weight.data.to(new_weight.device)
                        if conv_linear_layer:
                            data = data.transpose(-1, -2).contiguous()
                        data = mp_replace.copy(new_weight, data)
                    child.weight.ds_tensor = torch.empty(1)
                else:
                    if conv_linear_layer:
                        child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
                    data = mp_replace.copy(new_weight, child.weight.data)

                new_bias = torch.empty((weight_shape[0] // mp_size),
                                       device=child.weight.device,
                                       dtype=child.weight.dtype)
                if z_inference:
                    with deepspeed.zero.GatheredParameters(child.bias,
                                                           modifier_rank=0):
                        bias_data = None if child.bias is None else mp_replace.copy(
                            new_bias,
                            child.bias.data).to(torch.cuda.current_device())
                else:
                    bias_data = None if child.bias is None else mp_replace.copy(
                        new_bias,
                        child.bias.data).to(torch.cuda.current_device())
                return LinearLayer(weight=data.to(torch.cuda.current_device()),
                                   bias=bias_data)

        def _slice_embedding(child, name, conv_linear_layer):
            mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
            new_weight = torch.empty((child.weight.shape[0],
                                      child.weight.shape[1] // mp_size),
                                     device=child.weight.device,
                                     dtype=child.weight.dtype)
            data = mp_replace.copy(new_weight,
                                   child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \
                                   child.weight.data)
            new_embedding = nn.Embedding(child.weight.shape[0],
                                         child.weight.shape[1] // mp_size)
            new_embedding.weight.data.copy_(data)
            return new_embedding

        def update_mp_params(child):
            if hasattr(child, 'n_heads'):
                child.n_heads = child.n_heads // mp_size
            if hasattr(child, 'inner_dim'):
                child.inner_dim = child.inner_dim // mp_size
            if hasattr(child, 'num_heads'):
                child.num_heads = child.num_heads // mp_size
            if hasattr(child, 'num_attention_heads'):
                child.num_attention_heads = child.num_attention_heads // mp_size
            if hasattr(child, 'all_head_size'):
                child.all_head_size = child.all_head_size // mp_size
            if hasattr(child, 'embed_dim'):
                child.embed_dim = child.embed_dim // mp_size
            if hasattr(child, 'hidden_size'):
                child.hidden_size = child.hidden_size // mp_size

        conv_linear_layer = False
        if linear_layer_setting is not None:
            linear_policies = {linear_layer_setting[0]: _replace}
            if len(linear_layer_setting) == 2:
                linear_policies.update({linear_layer_setting[1]: _slice_embedding})
        else:
            if orig_layer_impl is HFGPT2LayerPolicy._orig_layer_class:
                try:
                    import transformers
                    conv_linear_layer = True
                    linear_policies = {transformers.model_utils.Conv1D: _replace}
                except ImportError:
                    linear_policies = {nn.Linear: _replace}
            else:
                linear_policies = {nn.Linear: _replace, nn.Embedding: _slice_embedding}

        def _replace_module(r_module, prev_name=''):
            for name, child in r_module.named_children():
                if child.__class__ in linear_policies:
                    setattr(
                        r_module,
                        name,
                        linear_policies[child.__class__](child,
                                                         prev_name + '.' + name,
                                                         conv_linear_layer))
                else:
                    update_mp_params(child)
                    _replace_module(child, name)
            return r_module

        return _replace_module(module)

    def replace_fn(child, _policy, layer_id=0):
        if training:
            # copy relevant state from child -> new module
            new_module = replace_with_policy(child, _policy, triangular_masking)
        else:
            # copy relevant state from child -> new module
            if replace_with_kernel_inject:
                new_module = replace_with_policy(child,
                                                 _policy,
                                                 triangular_masking,
                                                 inference=True,
                                                 layer_id=layer_id)
            else:
                new_module = replace_wo_policy(child, _policy)

        return new_module

    replaced_module = replace_module(model=model,
                                     orig_class=orig_layer_impl,
                                     replace_fn=replace_fn,
                                     _replace_policy=policy)

    world_size = dist.get_world_size() if dist.is_initialized() else 1
    rank = dist.get_rank() if dist.is_initialized() else 0
    if checkpoint_dict is not None:
        start_time = time.time()
        checkpoint = checkpoint_dict['checkpoints']
        ckpt_type = checkpoint_dict.get('parallelization', 'pp')
        ckpt_mp_size = checkpoint_dict.get('mp_size', mp_size)
        base_dir = checkpoint_dict.get('base_dir', '')

        if ckpt_type == 'pp':
            pbar = tqdm.tqdm(total=len(checkpoint),
                             desc=f"Loading {len(checkpoint)} checkpoint shards")
            for i in range(len(checkpoint)):
                if not deepspeed.comm.is_initialized() or deepspeed.comm.get_rank() == 0:
                    pbar.update(1)
                sd = torch.load(checkpoint[i], map_location='cpu')
                load_model_with_checkpoint(replaced_module, sd, mp_replace, ckpt_type)
        else:
            num_checkpoints = len(checkpoint) // ckpt_mp_size
            assert world_size >= ckpt_mp_size,\
                "Currently, merging checkpoints is not supported (when world_size is smaller than #checkpoints)!"
            checkpoint_stride = world_size // ckpt_mp_size
            if not deepspeed.comm.is_initialized() or deepspeed.comm.get_rank() == 0:
                pbar = tqdm.tqdm(total=num_checkpoints,
                                 desc=f"Loading {num_checkpoints} checkpoint shards")
            for i in range(num_checkpoints):
                if not deepspeed.comm.is_initialized() or deepspeed.comm.get_rank() == 0:
                    pbar.update(1)

                ckpt_index = i * ckpt_mp_size + (rank // checkpoint_stride)
                ckpt_file = os.path.join(
                    base_dir,
                    checkpoint[ckpt_index]) if base_dir else checkpoint[ckpt_index]
                sd = torch.load(ckpt_file, map_location='cpu')
                load_model_with_checkpoint(replaced_module,
                                           sd,
                                           mp_replace,
                                           ckpt_type,
                                           rank % (world_size // ckpt_mp_size))

        print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec")

    if save_mp_checkpoint_path is not None:
        from collections import OrderedDict
        import json

        if checkpoint_dict is None:
            ckpt_name = "ds_model"
            try:
                from transformers.models.bloom.modeling_bloom import BloomForCausalLM
                if isinstance(model, BloomForCausalLM):
                    ckpt_name = "bloom"
            except ImportError:
                ckpt_name = "ds_model"
        else:
            ckpt_name = checkpoint_dict['type']
        if dist.is_initialized():
            dist.barrier()
        transformer_name = get_transformer_name(replaced_module)
        non_tp_ckpt_name = f'{ckpt_name}-non-tp.pt'
        ckpt_files = [non_tp_ckpt_name] * world_size
        os.makedirs(save_mp_checkpoint_path, exist_ok=True)
        if not dist.is_initialized() or dist.get_rank() == 0:
            print("Saving tp-sharded checkpoints")
            torch.save(
                OrderedDict({
                    k: v
                    for k, v in dict(replaced_module.state_dict()).items()
                    if transformer_name not in k
                }),
                f'{save_mp_checkpoint_path}/{non_tp_ckpt_name}')
            ckpt_files += [f'{ckpt_name}-tp_{r:0>2d}.pt' for r in range(world_size)]
            config = json.dumps({
                'type': ckpt_name,
                'base_dir': f'{save_mp_checkpoint_path}',
                'checkpoints': ckpt_files,
                'version': 1.0,
                'parallelization': 'tp',
                'mp_size': world_size
            })
            with open(f"{save_mp_checkpoint_path}/{ckpt_name}_ds-inference_config.json",
                      "w") as cfg:
                cfg.write(config)

        torch.save(
            OrderedDict({
                k: v
                for k, v in dict(replaced_module.state_dict()).items()
                if transformer_name in k
            }),
            f'{save_mp_checkpoint_path}/{ckpt_name}-tp_{rank:0>2d}.pt')

    return replaced_module
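For orientation, a hedged sketch of how this replacement path is normally reached: users call `deepspeed.init_inference` on a Hugging Face model, which dispatches to `replace_transformer_layer` with a policy matched to the model's layer class. The exact keyword names follow the DeepSpeed-Inference examples of this era and should be treated as assumptions for your installed version.

import torch
import deepspeed
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")
# Kernel injection plus tensor parallelism across 2 GPUs; internally this ends up
# calling replace_transformer_layer() on every matching transformer block.
engine = deepspeed.init_inference(model,
                                  mp_size=2,
                                  dtype=torch.half,
                                  replace_with_kernel_inject=True)
injected_model = engine.module  # now contains DeepSpeedTransformerInference layers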
def _test_zero_to_fp32():
    class MyModel(torch.nn.Module):
        def __init__(self, hidden_dim, n_layers):
            super().__init__()
            self.ll = torch.nn.ModuleList(
                torch.nn.Linear(hidden_dim,
                                hidden_dim) for i in range(n_layers))
            self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, y):
            hidden = x
            for l in self.ll:
                hidden = l(hidden)
            return self.cross_entropy_loss(hidden, y)

    hidden_dim = 3
    world_size = dist.get_world_size()
    n_layers = world_size * 2
    model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers)

    optim_groups = [
        {
            "params": [l.weight for l in model.ll],
            "weight_decay": 0.01,
        },
        {
            "params": [l.bias for l in model.ll],
            "weight_decay": 0.0
        },
    ]
    optim = torch.optim.SGD(optim_groups, lr=0.1)

    model, _, _, _ = deepspeed.initialize(model=model,
                                          model_parameters=model.parameters(),
                                          optimizer=optim,
                                          config=config_dict)
    data_loader = random_dataloader(model=model,
                                    total_samples=16,
                                    hidden_dim=hidden_dim,
                                    device=model.device)

    for i, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

    model.save_checkpoint(tmpdir)

    # make sure all sides saved it
    dist.barrier()

    if zero_stage == 3:
        with deepspeed.zero.GatheredParameters(list(
                model.module.parameters(recurse=True)),
                                               modifier_rank=None):
            pass  # this forces gathering the model
    #dump_state_dict(model)

    orig_state_dict = {}
    for name, param in model.module.named_parameters():
        orig_state_dict[name] = param.detach().cpu()

    if dist.get_rank() == 0:
        fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir)
        #dump_state_dict(fp32_model)

        fp32_state_dict = fp32_model.state_dict()
        for name in orig_state_dict.keys():
            # float() workaround for torch<1.6
            assert torch.allclose(orig_state_dict[name].float(),
                                  fp32_state_dict[name].float())
def _test_zero_to_fp32():
    class MyModel(torch.nn.Module):
        def __init__(self, hidden_dim, n_layers):
            super().__init__()
            # to reproduce https://github.com/microsoft/DeepSpeed/pull/1372 it is important that
            # the number of total elements is uneven:
            # (1) 4 layers of 3*(3+1)=12 elements each, 48 in total
            self.ll = torch.nn.ModuleList(
                torch.nn.Linear(hidden_dim,
                                hidden_dim) for i in range(n_layers))
            # (2) the following adds 4+1=5 elements
            self.classifier = torch.nn.Linear(4, 1)
            # total 48+5=53 (uneven as desired) elements
            self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

        def forward(self, x, y):
            hidden = x
            for l in self.ll:
                hidden = l(hidden)
            return self.cross_entropy_loss(hidden, y)

    hidden_dim = 3  # do not change

    world_size = dist.get_world_size()
    # we want at least 2x layers as there are gpus to trigger round_robin_fp16_groups reshuffle in zero2
    n_layers = world_size * 2
    model = MyModel(hidden_dim=hidden_dim, n_layers=n_layers)

    model, _, _, _ = deepspeed.initialize(config=config_dict,
                                          model=model,
                                          model_parameters=model.parameters())
    data_loader = random_dataloader(model=model,
                                    total_samples=16,
                                    hidden_dim=hidden_dim,
                                    device=model.device)

    for i, batch in enumerate(data_loader):
        loss = model(batch[0], batch[1])
        model.backward(loss)
        model.step()

    model.save_checkpoint(tmpdir)

    # make sure all sides saved it
    dist.barrier()

    if zero_stage == 3:
        with deepspeed.zero.GatheredParameters(list(
                model.module.parameters(recurse=True)),
                                               modifier_rank=None):
            pass  # this forces gathering the model
    #dump_state_dict(model)

    orig_state_dict = {}
    for name, param in model.module.named_parameters():
        orig_state_dict[name] = param.detach().cpu()

    if dist.get_rank() == 0:
        fp32_model = load_state_dict_from_zero_checkpoint(model.module, tmpdir)
        #dump_state_dict(fp32_model)

        fp32_state_dict = fp32_model.state_dict()
        for name in orig_state_dict.keys():
            # float() workaround for torch<1.6
            assert torch.allclose(orig_state_dict[name].float(),
                                  fp32_state_dict[name].float())
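As a companion to the in-process check above, a hedged sketch of the offline consolidation path exercised by the same `zero_to_fp32` machinery: the documented helpers can rebuild a full fp32 state dict directly from the saved ZeRO shards without constructing an engine. The checkpoint directory below is a placeholder.

from deepspeed.utils.zero_to_fp32 import (
    get_fp32_state_dict_from_zero_checkpoint,
    load_state_dict_from_zero_checkpoint,
)

# Consolidate the sharded ZeRO checkpoint written by model.save_checkpoint(tmpdir)
# into a single fp32 state dict (runs on CPU, no process group required).
fp32_state_dict = get_fp32_state_dict_from_zero_checkpoint("path/to/checkpoint_dir")
for name, tensor in fp32_state_dict.items():
    print(name, tuple(tensor.shape))

# Or load the consolidated fp32 weights straight into an existing module:
# model = load_state_dict_from_zero_checkpoint(model, "path/to/checkpoint_dir")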