def append(self, raw_name, record_name, latency, msg_size):
    import deepspeed.comm as dist
    algbw, busbw = calc_bw_log(raw_name, msg_size, latency)
    if record_name in self.comms_dict.keys():
        # If this comm_op has already been logged with this message size, just add to the existing record
        if msg_size in self.comms_dict[record_name].keys():
            self.comms_dict[record_name][msg_size][0] += 1
            self.comms_dict[record_name][msg_size][1].append(latency)
            self.comms_dict[record_name][msg_size][2].append(algbw)
            self.comms_dict[record_name][msg_size][3].append(busbw)
        # If this is a new message size for this comm_op, add a new record under the existing comm_op
        else:
            self.comms_dict[record_name][msg_size] = [1, [latency], [algbw], [busbw]]
    else:
        # Create an entirely new record
        self.comms_dict[record_name] = {msg_size: [1, [latency], [algbw], [busbw]]}
    # If verbose, print every comm op
    # TODO: Add to tensorboard
    if self.verbose:
        log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format(latency)
        log_str += " | msg size: " + convert_size(msg_size)
        log_str += " | algbw (Gbps): {:.2f} ".format(algbw)
        log_str += " | busbw (Gbps): {:.2f} ".format(busbw)
        log_dist(log_str, [0])
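# Illustrative sketch (not part of the logger): the comms_dict built by append() maps
# record_name -> msg_size -> [call_count, [latencies], [algbws], [busbws]]. The numbers and the
# "all_reduce" / 1048576 keys below are made up; only the nesting mirrors the code above.
example_comms_dict = {}
for latency, algbw, busbw in [(1.2, 80.0, 70.0), (1.1, 85.0, 74.0)]:
    record = example_comms_dict.setdefault("all_reduce", {}).setdefault(1048576, [0, [], [], []])
    record[0] += 1
    record[1].append(latency)
    record[2].append(algbw)
    record[3].append(busbw)
# example_comms_dict == {"all_reduce": {1048576: [2, [1.2, 1.1], [80.0, 85.0], [70.0, 74.0]]}}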
def valid_step(self, batch_itr):
    if self.model.global_steps % self.fs_args.validate_interval_updates != 0:
        return
    with torch.no_grad():
        self.model.eval()
        for subset in batch_itr.valid_dataset():
            with metrics.aggregate(new_root=True) as agg:
                for batch, is_dummy_batch in batch_itr.valid_batch():
                    _, sample_size, logging_output = self.task.valid_step(
                        batch, self.model.module.model, self.model.module.criterion)
                    logging_outputs = [logging_output]
                    if is_dummy_batch:
                        if torch.is_tensor(sample_size):
                            sample_size.zero_()
                        else:
                            sample_size *= 0.0
                    logging_outputs, (sample_size, ) = torch_reduce_sum(
                        self.model.device,
                        logging_outputs,
                        sample_size,
                        ignore=is_dummy_batch,
                    )
                    logging_output = self.reduce_log(logging_outputs, sample_size)
            log_dist(
                "Valid on step: {}, dataset: {}. {}".format(
                    self.model.global_steps,
                    subset,
                    view_log(agg.get_smoothed_values()),
                ),
                ranks=[0],
            )
def __init__(self, theta=0.5, gamma=0.001):
    super().__init__()
    self.theta = theta
    self.gamma = gamma
    self.current_theta = 1.0
    log_dist(f'Enabled progressive layer dropping (theta = {self.theta})', ranks=[0])
def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu):
    """
    Create expert and data parallel groups based on MPU (model parallel) group.

    Note: Caller of this function is responsible to check if the groups already exist.

    Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4  # number of experts in same group
        mp_group = [0, 1], [2, 3], [4, 5] ...
        data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14], [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
    """
    assert torch.distributed.is_initialized(), "torch distributed is not initialized"
    assert mpu.model_parallel_is_initialized(), "model parallel group is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    log_dist(
        f"Creating deepspeed groups with model parallel size {model_parallel_size_}, "
        f"expert parallel size {expert_parallel_size_}, world size {world_size}, "
        f"dp world size {dp_world_size}",
        [0])

    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    expert_parallel_size_ = min(expert_parallel_size_, dp_world_size)
    _ensure_divisibility(world_size, expert_parallel_size_)

    group_name = f"ep_size_{expert_parallel_size_}"

    # Only create groups if they don't already exist.
    # The condition has to be checked outside the group-creation loop because of the way
    # torch.distributed group creation works (every rank must create every group).
    if group_name not in _EXPERT_DATA_PARALLEL_GROUP and group_name not in _EXPERT_PARALLEL_GROUP:
        for j in range(model_parallel_size_):
            for i in range(expert_parallel_size_):
                ranks = range(i * model_parallel_size_ + j,
                              world_size,
                              expert_parallel_size_ * model_parallel_size_)
                group = torch.distributed.new_group(ranks)
                if rank in list(ranks):
                    _EXPERT_DATA_PARALLEL_GROUP[group_name] = group

            for i in range(dp_world_size // expert_parallel_size_):
                ranks = range(i * expert_parallel_size_ * model_parallel_size_ + j,
                              (i + 1) * expert_parallel_size_ * model_parallel_size_,
                              model_parallel_size_)
                group = torch.distributed.new_group(ranks)
                if rank in list(ranks):
                    _EXPERT_PARALLEL_GROUP[group_name] = group
def __init__(self,
             hidden_size,
             expert,
             num_experts=1,
             k=1,
             output_dropout_prob=0.0,
             capacity_factor=1.,
             eval_capacity_factor=1.,
             min_capacity=4,
             noisy_gate_policy: typing.Optional[str] = None):
    """Initialize an MoE layer.

    Arguments:
        hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
        expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
        num_experts (int, optional): default=1, the total number of experts per layer.
        k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
        output_dropout_prob (float, optional): default=0.0, output dropout probability.
        capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
        eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
        min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
        noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
    """
    super(MoE, self).__init__()

    assert groups.is_initialized(), \
        'Please call deepspeed.utils.groups.initialize() before using MoE layers'
    assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
        'Unsupported noisy_gate_policy: ' + noisy_gate_policy

    num_local_experts = num_experts // groups.get_expert_parallel_world_size()

    log_dist(
        f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | '
        f'expert_parallel_size: {groups.get_expert_parallel_world_size()}',
        [0])

    self.num_experts = num_experts
    experts = Experts(expert, num_local_experts)
    self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                           num_experts,
                                           k,
                                           capacity_factor,
                                           eval_capacity_factor,
                                           min_capacity,
                                           noisy_gate_policy),
                                  experts,
                                  num_local_experts,
                                  group=groups.get_expert_parallel_group())
    self.dropout = torch.nn.Dropout(output_dropout_prob)
def initialize(ep_size=1, mpu=None, num_ep_list=None):
    """
    Process groups initialization supporting expert (E), data (D), and model (M) parallelism.
    DeepSpeed considers the following scenarios w.r.t. process group creation.

    * S1: There is no expert parallelism or model parallelism, only data (D)::

        model = my_model(args)
        engine = deepspeed.initialize(model)  # initialize groups without mpu

    * S2: There is expert parallelism but no model parallelism (E+D)::

        deepspeed.utils.groups.initialize(ep_size)  # groups will be initialized here
        model = my_model(args)
        engine = deepspeed.initialize(model)

    * S3: There is model parallelism but no expert parallelism (M)::

        mpu.init()  # client initializes its model parallel unit
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu)  # init w. mpu but ep_size = dp_world_size

    * S4: There is model, data, and expert parallelism (E+D+M)::

        mpu.init()  # client initializes its model parallel unit
        deepspeed.utils.groups.initialize(ep_size, mpu)  # initialize expert groups wrt mpu
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu)  # passing mpu is optional in this case

    Arguments:
        ep_size (int, optional): default=1, maximum expert parallel size, which should divide or
            be divided by each element in num_ep_list.
        mpu (module, optional): default=None, model parallel unit (e.g., from Megatron)
            that describes model/data parallel ranks.
        num_ep_list (list, optional): default=None, list of expert parallel sizes, one per MoE layer.
    """
    if num_ep_list is None:
        num_ep_list = [ep_size]
    assert max(num_ep_list) >= ep_size, \
        f"ep_size={ep_size} is larger than the largest entry in num_ep_list={max(num_ep_list)}, " \
        "you should reduce the expert parallel size"
    num_ep_list = list(set(num_ep_list))  # remove duplicates
    num_ep_list.sort()  # sort in ascending order
    for num_ep in num_ep_list:
        assert num_ep > 0, 'num_ep must be positive'
        assert num_ep % ep_size == 0 or ep_size % num_ep == 0, \
            'num_ep must divide or be divided by ep_size'
    if mpu is not None:
        log_dist(message="initializing deepspeed groups using mpu", ranks=[0])
        initialize_model_and_expert_parallel(ep_size, mpu, num_ep_list)
    else:
        log_dist(message="initializing deepspeed groups", ranks=[0])
        initialize_model_parallel(1)
        initialize_expert_parallel(ep_size, num_ep_list)
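# Hedged usage sketch for the initialize() signature above (scenario S2: E+D, no mpu). The sizes
# are made up for illustration; ep_size=2 divides both entries of num_ep_list, so the
# divisibility asserts above hold. torch.distributed must already be initialized by the caller.
import deepspeed.utils.groups as groups

def setup_expert_groups_example():
    groups.initialize(ep_size=2, num_ep_list=[2, 4])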
def initialize(
    args,
    model,
    optimizer=None,
    model_parameters=None,
    training_data=None,
    lr_scheduler=None,
    mpu=None,
    dist_init_required=None,
    collate_fn=None,
    config_params=None,
):
    log_dist(
        "DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
            __version__, __git_hash__, __git_branch__),
        ranks=[0],
    )

    if not isinstance(model, PipelineModule):
        engine = DSEngine(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=model_parameters,
            training_data=training_data,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=dist_init_required,
            collate_fn=collate_fn,
            config_params=config_params,
        )
    else:
        assert mpu is None, "mpu must be None with pipeline parallelism"
        engine = PipelineEngine(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=model_parameters,
            training_data=training_data,
            lr_scheduler=lr_scheduler,
            mpu=model.mpu(),
            dist_init_required=dist_init_required,
            collate_fn=collate_fn,
            config_params=config_params,
        )

    return_items = [
        engine,
        engine.optimizer,
        engine.training_dataloader,
        engine.lr_scheduler,
    ]
    return tuple(return_items)
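# Hedged usage sketch of the four-tuple returned by initialize(); the argparse namespace, model,
# and dataset are assumed to be provided by the caller (the names here are placeholders).
import deepspeed

def build_engine(args, net, train_dataset):
    engine, optimizer, train_loader, lr_scheduler = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=net.parameters(),
        training_data=train_dataset,  # optional; enables engine.training_dataloader
    )
    return engine, optimizer, train_loader, lr_scheduler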
def tmp():
    fs_args, ds_config = gen_ds_fairseq_arg()
    set_seed(fs_args.seed)
    task = tasks.setup_task(fs_args)
    trainer = DsFairseqTrainer(fs_args, ds_config, task)
    batch_itr = BatchIterator(fs_args, task)
    for epoch in batch_itr.train_epoch():
        train(batch_itr, trainer)
        log_dist(
            f'Finished epoch {epoch}, '
            f'{view_log(metrics.get_smoothed_values("train"))}',
            [0],
        )
        metrics.reset_meters("train")
def initialize_model_parallel(model_parallel_size_):
    """
    Initialize model and data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize the model.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we use 2 GPUs
    to parallelize the model. The present function will create 4 model parallel
    groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks are on
    the same DGX box. For example, if we are using 2 DGX-1 boxes with a total of
    16 GPUs, ranks 0 to 7 belong to the first box and ranks 8 to 15 belong to
    the second box.
    """
    log_dist(
        'initializing deepspeed model parallel group with size {}'.format(
            model_parallel_size_),
        [0])
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size = torch.distributed.get_world_size()
    model_parallel_size = min(model_parallel_size_, world_size)
    ensure_divisibility(world_size, model_parallel_size)
    rank = torch.distributed.get_rank()

    # Build the data parallel groups.
    global _DATA_PARALLEL_GROUP
    assert _DATA_PARALLEL_GROUP is None, \
        'data parallel group is already initialized'
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group

    # Build the model parallel groups.
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, \
        'model parallel group is already initialized'
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group
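# Standalone sketch (no torch.distributed needed) of the rank arithmetic used above, reproducing
# the 8-GPU / model_parallel_size=2 example from the docstring; the function name is illustrative.
def _group_layout_example(world_size=8, model_parallel_size=2):
    data_parallel_groups = [list(range(i, world_size, model_parallel_size))
                            for i in range(model_parallel_size)]
    model_parallel_groups = [list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
                             for i in range(world_size // model_parallel_size)]
    return data_parallel_groups, model_parallel_groups

# _group_layout_example() -> ([[0, 2, 4, 6], [1, 3, 5, 7]], [[0, 1], [2, 3], [4, 5], [6, 7]])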
def _create_model_parallel(model_parallel_size_):
    """
    Initialize model and data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize the model.

    Returns:
        Tuple of the data parallel group and the model parallel group.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we use 2 GPUs
    to parallelize the model. The present function will create 4 model parallel
    groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks are on
    the same DGX box. For example, if we are using 2 DGX-1 boxes with a total of
    16 GPUs, ranks 0 to 7 belong to the first box and ranks 8 to 15 belong to
    the second box.
    """
    log_dist(f'Creating model parallel group with size {model_parallel_size_}',
             ranks=[0])
    # Get world size and rank. Ensure some consistencies.
    assert dist.is_initialized()
    world_size = dist.get_world_size()
    model_parallel_size = min(model_parallel_size_, world_size)
    _ensure_divisibility(world_size, model_parallel_size)
    rank = dist.get_rank()

    _DATA_PARALLEL_GROUP = None
    _MODEL_PARALLEL_GROUP = None

    # Build the data parallel groups.
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = dist.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group

    # Build the model parallel groups.
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = dist.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group

    return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
def initialize(ep_size=1, mpu=None):
    """
    Process groups initialization supporting expert (E), data (D), and model (M) parallelism.
    DeepSpeed considers the following scenarios w.r.t. process group creation.

    * S1: There is no expert parallelism or model parallelism, only data (D)::

        model = my_model(args)
        engine = deepspeed.initialize(model)  # initialize groups without mpu

    * S2: There is expert parallelism but no model parallelism (E+D)::

        deepspeed.utils.groups.initialize(ep_size)  # groups will be initialized here
        model = my_model(args)
        engine = deepspeed.initialize(model)

    * S3: There is model parallelism but no expert parallelism (M)::

        mpu.init()  # client initializes its model parallel unit
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu)  # init w. mpu but ep_size = dp_world_size

    * S4: There is model, data, and expert parallelism (E+D+M)::

        mpu.init()  # client initializes its model parallel unit
        deepspeed.utils.groups.initialize(ep_size, mpu)  # initialize expert groups wrt mpu
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu)  # passing mpu is optional in this case

    Arguments:
        ep_size (int, optional): default=1, expert parallel size
        mpu (module, optional): default=None, model parallel unit (e.g., from Megatron)
            that describes model/data parallel ranks.
    """
    if mpu is not None:
        log_dist(message="initializing deepspeed groups using mpu", ranks=[0])
        initialize_model_and_expert_parallel(ep_size, mpu)
    else:
        log_dist(message="initializing deepspeed groups", ranks=[0])
        initialize_model_parallel(1)
        initialize_expert_parallel(ep_size)
def __init__(self,
             verbose=False,
             max_iter=100,
             tol=1e-2,
             stability=0,
             gas_boundary_resolution=1,
             layer_name='',
             layer_num=0):
    super().__init__()
    self.verbose = verbose
    self.max_iter = max_iter
    self.tol = tol
    self.stability = stability
    self.gas_boundary_resolution = gas_boundary_resolution
    self.layer_name = layer_name
    self.layer_num = layer_num

    assert len(self.layer_name) > 0 and layer_num > 0

    log_dist(
        f'enabled eigenvalue with verbose={verbose}, max_iter={max_iter}, tol={tol}, '
        f'stability={stability}, gas_boundary_resolution={gas_boundary_resolution}, '
        f'layer_name={layer_name}, layer_num={layer_num}',
        ranks=[0])
def train_step(self, sample, is_dummy_batch):
    self.model.train()
    self.model.zero_grad()
    loss, sample_size, logging_output = self.model(sample)

    if is_dummy_batch:
        if torch.is_tensor(sample_size):
            sample_size.zero_()
        else:
            sample_size *= 0.0
        loss *= 0.0

    if torch.is_tensor(sample_size):
        sample_size = sample_size.float()
    else:
        sample_size = float(sample_size)

    logging_outputs, (sample_size, ) = torch_reduce_sum(
        self.model.device, [logging_output], sample_size, ignore=is_dummy_batch)

    # Rescale so that gradient averaging across ranks yields a gradient normalized by the
    # global sample size.
    final_loss = loss * (dist.get_world_size() / sample_size)
    self.model.backward(final_loss)
    self.model.step()

    logging_output = self.reduce_log(logging_outputs, sample_size)

    if self.model.global_steps % self.model.steps_per_print() != 0:
        return
    log_dist(
        f'Step: {self.model.global_steps}, '
        f'{view_log(metrics.get_smoothed_values("train_inner"))}',
        [0],
    )
    metrics.reset_meters("train_inner")
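# Minimal numeric check (a hedged sketch, not part of the trainer) of why the loss is scaled by
# world_size / sample_size before backward(): when the engine averages gradients across ranks,
# the rescaling makes the averaged gradient equal to the sum of per-rank gradients divided by
# the global sample_size. The sizes and the toy per-rank losses below are made up.
import torch

def _check_loss_rescaling(world_size=4, sample_size=32.0):
    per_rank_grads = []
    for rank in range(world_size):
        w = torch.tensor(2.0, requires_grad=True)      # one scalar parameter, identical on every "rank"
        per_rank_loss = (rank + 1.0) * w               # stand-in for the summed token losses on this rank
        final_loss = per_rank_loss * (world_size / sample_size)
        final_loss.backward()
        per_rank_grads.append(w.grad.item())
    averaged = sum(per_rank_grads) / world_size        # what gradient averaging produces
    expected = sum(rank + 1.0 for rank in range(world_size)) / sample_size
    assert abs(averaged - expected) < 1e-6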
def _create_expert_and_data_parallel(ep_size):
    """
    Create expert and data parallel groups.

    Note: Caller of this function is responsible to check if the groups already exist.

    Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2  # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0,1], [2,3], [4,5], [6,7], [8,9], ..., [14,15] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert torch.distributed.is_initialized()

    log_dist(f'Creating expert and data parallel groups with size {ep_size}',
             ranks=[0])
    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()

    expert_parallel_size_ = min(ep_size, world_size)
    _ensure_divisibility(world_size, expert_parallel_size_)

    group_name = f"ep_size_{expert_parallel_size_}"

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP

    # Only create the group if it does not already exist
    if group_name not in _EXPERT_DATA_PARALLEL_GROUP:
        for i in range(expert_parallel_size_):
            ranks = range(i, world_size, expert_parallel_size_)
            group = torch.distributed.new_group(ranks)
            log_dist(
                f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}',
                [0])
            if i == (rank % expert_parallel_size_):
                _EXPERT_DATA_PARALLEL_GROUP[group_name] = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP

    # Only create the group if it does not already exist
    if group_name not in _EXPERT_PARALLEL_GROUP:
        for i in range(world_size // expert_parallel_size_):
            ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_)
            group = torch.distributed.new_group(ranks)
            log_dist(
                f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}',
                [0])
            if i == (rank // expert_parallel_size_):
                _EXPERT_PARALLEL_GROUP[group_name] = group
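# Standalone sketch of the E+D rank arithmetic above, reproducing the world_size=16 /
# expert_parallel_size=2 example from the docstring; no process groups are created and the
# function name is illustrative.
def _expert_group_layout_example(world_size=16, ep_size=2):
    expert_data_parallel = [list(range(i, world_size, ep_size)) for i in range(ep_size)]
    expert_parallel = [list(range(i * ep_size, (i + 1) * ep_size))
                       for i in range(world_size // ep_size)]
    return expert_data_parallel, expert_parallel

# expert_data_parallel -> [[0, 2, 4, ..., 14], [1, 3, 5, ..., 15]]
# expert_parallel      -> [[0, 1], [2, 3], ..., [14, 15]]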
def initialize_expert_parallel(expert_parallel_size_):
    """
    Initialize expert plus data parallel groups.

    Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2  # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0,1], [2,3], [4,5], [6,7], [8,9], ..., [14,15] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert torch.distributed.is_initialized()
    log_dist(
        'initializing deepspeed expert parallel group with size {}'.format(
            expert_parallel_size_),
        [0])
    world_size = get_data_parallel_world_size()
    rank = get_data_parallel_rank()

    expert_parallel_size_ = min(expert_parallel_size_, world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    for i in range(expert_parallel_size_):
        ranks = range(i, world_size, expert_parallel_size_)
        group = torch.distributed.new_group(ranks)
        # TODO: remove
        log_dist(
            f'creating expert data parallel process group with ranks: {list(ranks)}',
            [0])
        if i == (rank % expert_parallel_size_):
            _EXPERT_DATA_PARALLEL_GROUP = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'
    for i in range(world_size // expert_parallel_size_):
        ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_)
        group = torch.distributed.new_group(ranks)
        # TODO: remove
        log_dist(
            f'creating expert parallel process group with ranks: {list(ranks)}',
            [0])
        if i == (rank // expert_parallel_size_):
            _EXPERT_PARALLEL_GROUP = group
def flatten_dense_tensors_sub_partition_aligned(tensor_list,
                                                dp,
                                                max_elements_per_comm,
                                                pg):
    assert max_elements_per_comm >= dp, \
        f"max_elements_per_comm {max_elements_per_comm} < dp {dp}"

    num_elements = sum(t.numel() for t in tensor_list)
    log_dist(
        "Total number of elements in model: {}, max elements per comm: {}".format(
            num_elements, max_elements_per_comm),
        ranks=[0])

    # Compute aligned partition size based on parameter count
    aligned_param_partition_size = math.ceil(num_elements / dp)

    # Compute aligned partition size based on communication size
    aligned_comm_partition_size = int(max_elements_per_comm // dp)

    if aligned_param_partition_size <= aligned_comm_partition_size:
        sub_partition_count = 1
        sub_partition_size = aligned_param_partition_size
    else:
        sub_partition_count = math.ceil(aligned_param_partition_size /
                                        aligned_comm_partition_size)
        sub_partition_size = aligned_comm_partition_size

    # Compute required padding for alignment to dp and max_elements_per_comm
    padding = (sub_partition_count * sub_partition_size * dp) - num_elements

    log_dist(
        f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}",
        ranks=[0])
    log_dist(
        f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}",
        ranks=[0])

    if padding == 0:
        aligned_tensor_list = tensor_list
    else:
        pad_tensor = torch.zeros(padding,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        aligned_tensor_list = tensor_list + [pad_tensor]

    return _flatten_dense_tensors(aligned_tensor_list)
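# Worked example (made-up sizes) of the padding arithmetic above: with 1000 parameters, dp=4,
# and max_elements_per_comm=400, each rank needs ceil(1000/4)=250 elements but each
# communication sub-partition may hold only 400//4=100, so we get 3 sub-partitions of 100 per
# rank and 3*100*4 - 1000 = 200 padding elements.
import math

def _sub_partition_example(num_elements=1000, dp=4, max_elements_per_comm=400):
    param_part = math.ceil(num_elements / dp)                        # 250
    comm_part = max_elements_per_comm // dp                          # 100
    if param_part <= comm_part:
        count, size = 1, param_part
    else:
        count, size = math.ceil(param_part / comm_part), comm_part   # 3, 100
    padding = count * size * dp - num_elements                       # 200
    return count, size, padding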
def step(self, closure=None):
    """
    Not supporting closure.
    """
    if self.fused_adam_legacy:
        return self.step_fused_adam()

    COMPUTE_NORM = "compute_norm"
    OVERFLOW_CHECK = 'overflow_check'
    OVERFLOW_TIMERS = [COMPUTE_NORM, OVERFLOW_CHECK]
    UNSCALE_AND_CLIP = 'unscale_and_clip'
    BASIC_STEP = 'basic_step'
    UPDATE_FP16 = 'update_fp16'
    STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

    # First determine if there is overflow.
    self.start_timers([OVERFLOW_CHECK])
    fp16_params = []
    for i, group in enumerate(self.fp16_groups):
        fp16_params.extend([p for p in group if p.grad is not None])
    self.overflow = self.overflow_checker.has_overflow(fp16_params)
    self.stop_timers([OVERFLOW_CHECK])

    prev_scale = self.cur_scale
    self._update_scale(self.overflow)
    if self.overflow:
        if self.verbose:
            log_dist(
                "Overflow detected. Skipping step. Attempted loss "
                f"scale: {prev_scale}, reducing to {self.cur_scale}",
                ranks=[0])
        # Clear gradients
        for i, group in enumerate(self.fp16_groups):
            for p in group:
                p.grad = None
        self.log_timers(OVERFLOW_TIMERS)
        return self.overflow

    grads_groups_flat = []
    for i, group in enumerate(self.fp16_groups):
        data_type = self.fp32_groups_flat[i].dtype
        grads_groups_flat.append(
            _flatten_dense_tensors([
                torch.zeros(p.size(), dtype=data_type, device=p.device)
                if p.grad is None else p.grad.to(data_type) for p in group
            ]))
        for p in group:
            p.grad = None
        self.fp32_groups_flat[i].grad = grads_groups_flat[i]

    self.start_timers([COMPUTE_NORM])
    all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
    self.stop_timers([COMPUTE_NORM])

    self.start_timers([UNSCALE_AND_CLIP])
    self.unscale_and_clip_grads(grads_groups_flat, [all_groups_norm])
    self.stop_timers([UNSCALE_AND_CLIP])

    self.start_timers([BASIC_STEP])
    self.optimizer.step()
    self.stop_timers([BASIC_STEP])

    # Get rid of the fp32 gradients. Not needed anymore.
    for group in self.fp32_groups_flat:
        group.grad = None

    self.start_timers([UPDATE_FP16])
    for i in range(len(self.fp16_groups)):
        updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data.copy_(q.data)
    self.stop_timers([UPDATE_FP16])

    self.log_timers(STEP_TIMERS)

    return self.overflow
def initialize_model_and_expert_parallel(expert_parallel_size_, mpu, num_ep_list_=None):
    """
    Initialize expert groups based on MPU groups.

    Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4  # number of experts in same group
        mp_group = [0, 1], [2, 3], [4, 5] ...
        data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14], [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
    """
    assert torch.distributed.is_initialized(), "torch distributed is not initialized"
    assert mpu.model_parallel_is_initialized(), "model parallel group is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    global _MAX_EP_SIZE
    global _MAX_EP_SIZE_NAME
    _MAX_EP_SIZE = expert_parallel_size_
    _MAX_EP_SIZE_NAME = f"ep_size_{expert_parallel_size_}"

    if num_ep_list_ is None:
        num_ep_list_ = [expert_parallel_size_]

    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    log_dist(
        f"Initializing deepspeed groups with model parallel size {model_parallel_size_}, "
        f"expert parallel size {expert_parallel_size_}, world size {world_size}, "
        f"dp world size {dp_world_size}",
        [0])

    global _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    expert_parallel_size_ = min(expert_parallel_size_, dp_world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    # Build the expert parallel groups.
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'

    _EXPERT_DATA_PARALLEL_GROUP = {}
    _EXPERT_PARALLEL_GROUP = {}

    for num_ep in num_ep_list_:
        for j in range(model_parallel_size_):
            # Expert data parallel groups.
            # As in initialize_expert_parallel, two cases need to be handled.
            if num_ep >= expert_parallel_size_:
                # TODO: refactor to check f"ep_size_{expert_parallel_size_}" not in
                # _EXPERT_DATA_PARALLEL_GROUP in an outer loop instead of recreating the groups.
                for i in range(expert_parallel_size_):
                    ranks = range(i * model_parallel_size_ + j,
                                  world_size,
                                  expert_parallel_size_ * model_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if rank in list(ranks):
                        _EXPERT_DATA_PARALLEL_GROUP[f"ep_size_{expert_parallel_size_}"] = group
            else:
                for i in range(num_ep):
                    ranks = range(i * model_parallel_size_ + j,
                                  world_size,
                                  num_ep * model_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if rank in list(ranks):
                        _EXPERT_DATA_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group

            # Expert parallel groups.
            if num_ep >= expert_parallel_size_:
                # TODO: refactor to check f"ep_size_{expert_parallel_size_}" not in
                # _EXPERT_PARALLEL_GROUP in an outer loop instead of recreating the groups.
                for i in range(dp_world_size // expert_parallel_size_):
                    ranks = range(i * expert_parallel_size_ * model_parallel_size_ + j,
                                  (i + 1) * expert_parallel_size_ * model_parallel_size_,
                                  model_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if rank in list(ranks):
                        _EXPERT_PARALLEL_GROUP[f"ep_size_{expert_parallel_size_}"] = group
            else:
                for i in range(dp_world_size // num_ep):
                    ranks = range(i * num_ep * model_parallel_size_ + j,
                                  (i + 1) * num_ep * model_parallel_size_,
                                  model_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if rank in list(ranks):
                        _EXPERT_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group
def __init__(self,
             hidden_size,
             expert,
             num_experts=1,
             k=1,
             capacity_factor=1.,
             eval_capacity_factor=1.,
             min_capacity=4,
             noisy_gate_policy: typing.Optional[str] = None,
             drop_tokens: bool = True,
             use_rts=True,
             use_tutel: bool = False):
    """Initialize an MoE layer.

    Arguments:
        hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
        expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
        num_experts (int, optional): default=1, the total number of experts per layer.
        k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
        capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
        eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
        min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
        noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        drop_tokens (bool, optional): default=True, whether to drop tokens (setting to False is equivalent to infinite capacity).
        use_rts (bool, optional): default=True, whether to use Random Token Selection.
        use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
    """
    super(MoE, self).__init__()

    assert groups.is_initialized(), \
        'Please call deepspeed.utils.groups.initialize() before using MoE layers'
    assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
        'Unsupported noisy_gate_policy: ' + noisy_gate_policy

    num_local_experts = num_experts // groups.get_expert_parallel_world_size()

    log_dist(
        f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | '
        f'expert_parallel_size: {groups.get_expert_parallel_world_size()}',
        [0])

    self.num_experts = num_experts
    experts = Experts(expert, num_local_experts)
    self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                           num_experts,
                                           k,
                                           capacity_factor,
                                           eval_capacity_factor,
                                           min_capacity,
                                           noisy_gate_policy,
                                           drop_tokens,
                                           use_rts),
                                  experts,
                                  num_local_experts,
                                  group=groups.get_expert_parallel_group(),
                                  use_tutel=use_tutel)
def __init__(self,
             hidden_size,
             expert,
             num_experts=1,
             ep_size=1,
             k=1,
             capacity_factor=1.,
             eval_capacity_factor=1.,
             min_capacity=4,
             use_residual=False,
             noisy_gate_policy: typing.Optional[str] = None,
             drop_tokens: bool = True,
             use_rts=True,
             use_tutel: bool = False):
    """Initialize an MoE layer.

    Arguments:
        hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
        expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
        num_experts (int, optional): default=1, the total number of experts per layer.
        ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
        k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
        capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
        eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
        min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
        use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
        noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        drop_tokens (bool, optional): default=True, whether to drop tokens (setting to False is equivalent to infinite capacity).
        use_rts (bool, optional): default=True, whether to use Random Token Selection.
        use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
    """
    super(MoE, self).__init__()

    self.use_residual = use_residual
    # The expert parallel size should not exceed the number of experts.
    self.ep_size = min(ep_size, num_experts)
    self.expert_group_name = f"ep_size_{self.ep_size}"
    self.num_experts = num_experts
    self.num_local_experts = 1 if num_experts < ep_size else num_experts // ep_size

    log_dist(
        f'Creating MoE layer with num_experts: {num_experts} | '
        f'num_local_experts: {self.num_local_experts} | expert_parallel_size: {ep_size}',
        [0])

    assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
        'Unsupported noisy_gate_policy: ' + noisy_gate_policy

    experts = Experts(expert, self.num_local_experts, self.expert_group_name)
    self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                           num_experts,
                                           k,
                                           capacity_factor,
                                           eval_capacity_factor,
                                           min_capacity,
                                           noisy_gate_policy,
                                           drop_tokens,
                                           use_rts),
                                  experts,
                                  self.expert_group_name,
                                  self.ep_size,
                                  self.num_local_experts,
                                  use_tutel=use_tutel)
    if self.use_residual:
        self.mlp = expert
        # The coefficient is used for a weighted sum of the expert output and the MLP output.
        self.coefficient = torch.nn.Linear(hidden_size, 2)
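# Hedged usage sketch for the constructor above; the expert module and sizes are illustrative,
# and the surrounding DeepSpeed/distributed setup is assumed to be done elsewhere. With
# num_experts=8 and ep_size=2, each of the two expert-parallel ranks hosts 4 local experts.
import torch
from deepspeed.moe.layer import MoE

def build_moe_layer(hidden_size=1024):
    expert_mlp = torch.nn.Sequential(
        torch.nn.Linear(hidden_size, 4 * hidden_size),
        torch.nn.GELU(),
        torch.nn.Linear(4 * hidden_size, hidden_size),
    )
    return MoE(hidden_size=hidden_size, expert=expert_mlp, num_experts=8, ep_size=2, k=1)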
def create(self, tag): log_dist(f"[Nebula] Start Checkpoint for tag:{tag}", ranks=[0]) # -2 means: customer needs to explicitly tell nebula # current checkpoint is complete by commit methond. self.checkpoint = torch_nebula.Checkpoint(tag, -2)
def step(self, closure=None):
    """
    Not supporting closure.
    """
    if self.fused_adam_legacy:
        return self.step_fused_adam()

    COMPUTE_NORM = "compute_norm"
    OVERFLOW_CHECK = 'overflow_check'
    OVERFLOW_TIMERS = [COMPUTE_NORM, OVERFLOW_CHECK]
    UNSCALE_AND_CLIP = 'unscale_and_clip'
    BASIC_STEP = 'basic_step'
    UPDATE_FP16 = 'update_fp16'
    STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

    # First determine if there is overflow.
    self.start_timers([OVERFLOW_CHECK])
    fp16_params = []
    for i, group in enumerate(self.fp16_groups):
        fp16_params.extend([p for p in group if p.grad is not None])
    self.overflow = self.overflow_checker.has_overflow(fp16_params)
    self.stop_timers([OVERFLOW_CHECK])

    prev_scale = self.cur_scale
    self._update_scale(self.overflow)
    if self.overflow:
        if self.verbose:
            log_dist(
                "Overflow detected. Skipping step. Attempted loss "
                f"scale: {prev_scale}, reducing to {self.cur_scale}",
                ranks=[0])
        # Clear gradients
        for i, group in enumerate(self.fp16_groups):
            for p in group:
                p.grad = None
        self.log_timers(OVERFLOW_TIMERS)
        return self.overflow

    grads_groups_flat = []
    for i, group in enumerate(self.fp16_groups):
        data_type = self.fp32_groups_flat[i].dtype
        grads_groups_flat.append(
            _flatten_dense_tensors([
                torch.zeros(p.size(), dtype=data_type, device=p.device)
                if p.grad is None else p.grad.to(data_type) for p in group
            ]))
        for p in group:
            p.grad = None
        self.fp32_groups_flat[i].grad = grads_groups_flat[i]

    self.start_timers([COMPUTE_NORM])
    all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
    # Need to allreduce (avg) the norms across the data parallel ranks because MoE params
    # will not be synced during the regular allreduce.
    if self.using_pipeline:
        pg = self.deepspeed.mpu.get_data_parallel_group()
    else:
        pg = groups._get_data_parallel_group()
    scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=pg))
    scaled_norm_tensor = torch.tensor(scaled_norm,
                                      device=self.fp32_groups_flat[i].device,
                                      dtype=torch.float)
    dist.all_reduce(scaled_norm_tensor, group=pg)
    all_groups_norm = scaled_norm_tensor.item()
    self.stop_timers([COMPUTE_NORM])

    self._global_grad_norm = get_global_norm(norm_list=[all_groups_norm])

    self.start_timers([UNSCALE_AND_CLIP])
    self.unscale_and_clip_grads(grads_groups_flat, self._global_grad_norm)
    self.stop_timers([UNSCALE_AND_CLIP])

    self.start_timers([BASIC_STEP])
    self.optimizer.step()
    self.stop_timers([BASIC_STEP])

    # Get rid of the fp32 gradients. Not needed anymore.
    for group in self.fp32_groups_flat:
        group.grad = None

    self.start_timers([UPDATE_FP16])
    for i in range(len(self.fp16_groups)):
        updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                  self.fp16_groups[i])
        for p, q in zip(self.fp16_groups[i], updated_params):
            p.data.copy_(q.data)
    self.stop_timers([UPDATE_FP16])

    self.log_timers(STEP_TIMERS)

    return self.overflow
def compute_eigenvalue(self, module, device=None, scale=1.0):
    block_eigenvalue = []
    param_keys = []
    layers = self.get_layers(module)

    for block in range(self.layer_num):
        model_block = layers[block]

        # We found this randn() has an obvious accuracy impact in some cases, so save/restore
        # the random state here.
        rng_state = torch.random.get_rng_state()
        if device is None:
            v = [
                torch.randn(p.size()) for p in model_block.parameters()
                if p.grad is not None and p.grad.grad_fn is not None
            ]
        else:
            v = [
                torch.randn(p.size(), device=device) for p in model_block.parameters()
                if p.grad is not None and p.grad.grad_fn is not None
            ]
        torch.random.set_rng_state(rng_state)

        grads = [
            param.grad for param in model_block.parameters()
            if param.grad is not None and param.grad.grad_fn is not None
        ]
        params = [
            param for param in model_block.parameters()
            if param.grad is not None and param.grad.grad_fn is not None
        ]

        layer_keys = [id(p) for p in model_block.parameters()]
        param_keys.append(layer_keys)

        v = self.normalize(v)

        # Disable eigenvalue if the model doesn't support second order gradients computation,
        # e.g. when enabling DS transformer kernel.
        if len(grads) == 0 or len(params) == 0:
            log_dist(f'The model does NOT support eigenvalue computation.',
                     ranks=[0],
                     level=logging.WARNING)
            return []

        i = 0
        eigenvalue_current, eigenvalue_previous = 1., 0.

        # Power iteration; stop when the relative change of the eigenvalue estimate drops below tol.
        while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs(
            (eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= self.tol):
            eigenvalue_previous = eigenvalue_current

            Hv = torch.autograd.grad(grads,
                                     params,
                                     grad_outputs=v,
                                     only_inputs=True,
                                     retain_graph=True)
            Hv = [self.nan_to_num(hv).float() for hv in Hv]

            eigenvalue_current = self.inner_product(Hv, v).item()

            v = self.normalize(Hv)
            v = [x / scale for x in v]
            i += 1

        eigenvalue_current *= scale
        block_eigenvalue.append(eigenvalue_current)

        if self.verbose:
            log_dist(
                f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}',
                ranks=[0])

    block_eigenvalue = self.post_process(block_eigenvalue)

    if self.verbose:
        log_dist(f'post processed block_eigenvalue: {block_eigenvalue}', ranks=[0])

    # {param_id: (eigenvalue, layer_id)}
    ev_dict = {}
    for i, (layer_keys, value) in enumerate(zip(param_keys, block_eigenvalue)):
        ev_dict.update(dict.fromkeys(layer_keys, (value, i)))

    return ev_dict
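# Standalone sketch of the power-iteration scheme used above, applied to an explicit symmetric
# matrix instead of Hessian-vector products; the matrix and tolerances are made up. The stopping
# rule mirrors compute_eigenvalue: relative change of the Rayleigh-quotient estimate below tol.
import torch

def _power_iteration_example(max_iter=100, tol=1e-2):
    H = torch.tensor([[2.0, 1.0], [1.0, 3.0]])   # stand-in for the (implicit) Hessian
    v = torch.randn(2)
    v = v / v.norm()
    current, previous = 1.0, 0.0
    i = 0
    while i < max_iter and abs(current) > 0 and abs((current - previous) / current) >= tol:
        previous = current
        Hv = H @ v                                # analogous to torch.autograd.grad(grads, params, grad_outputs=v)
        current = torch.dot(Hv, v).item()         # Rayleigh quotient estimate of the dominant eigenvalue
        v = Hv / Hv.norm()
        i += 1
    return current                                # ~3.618 for this matrix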
def initialize_model_and_expert_parallel(expert_parallel_size_, mpu):
    """
    Initialize expert groups based on MPU groups.

    Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4  # number of experts in same group
        mp_group = [0, 1], [2, 3], [4, 5] ...
        data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14], [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14], [1,9],[3,11],[5,13],[7,15]
    """
    assert torch.distributed.is_initialized(), "torch distributed is not initialized"
    assert mpu.model_parallel_is_initialized(), "model parallel group is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    log_dist(
        f"Initializing deepspeed groups with model parallel size {model_parallel_size_}, "
        f"expert parallel size {expert_parallel_size_}, world size {world_size}, "
        f"and data parallel size {dp_world_size}",
        [0])

    global _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    expert_parallel_size_ = min(expert_parallel_size_, dp_world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    # Build the expert parallel groups.
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'

    for j in range(model_parallel_size_):
        for i in range(expert_parallel_size_):
            ranks = range(i * model_parallel_size_ + j,
                          world_size,
                          expert_parallel_size_ * model_parallel_size_)
            group = torch.distributed.new_group(ranks)
            # TODO: remove
            log_dist(
                f'creating expert data parallel process group with ranks: {list(ranks)}',
                [0])
            if rank in list(ranks):
                _EXPERT_DATA_PARALLEL_GROUP = group

        for i in range(dp_world_size // expert_parallel_size_):
            ranks = range(i * expert_parallel_size_ * model_parallel_size_ + j,
                          (i + 1) * expert_parallel_size_ * model_parallel_size_,
                          model_parallel_size_)
            group = torch.distributed.new_group(ranks)
            # TODO: remove
            log_dist(
                f'creating expert parallel process group with ranks: {list(ranks)}',
                [0])
            if rank in list(ranks):
                _EXPERT_PARALLEL_GROUP = group
def create(self, tag): log_dist(f"[Torch] Checkpoint {tag} is begin to save!", ranks=[0])
def initialize_expert_parallel(expert_parallel_size_, num_ep_list_=None):
    """
    Initialize expert plus data parallel groups.

    Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2  # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0,1], [2,3], [4,5], [6,7], [8,9], ..., [14,15] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert torch.distributed.is_initialized()

    global _MAX_EP_SIZE
    global _MAX_EP_SIZE_NAME
    _MAX_EP_SIZE = expert_parallel_size_
    _MAX_EP_SIZE_NAME = f"ep_size_{expert_parallel_size_}"

    if num_ep_list_ is None:
        num_ep_list_ = [expert_parallel_size_]

    log_dist(
        'initializing deepspeed expert parallel group with max size {} for expert number list {}'
        .format(expert_parallel_size_, num_ep_list_),
        [0])

    world_size = get_data_parallel_world_size()
    rank = get_data_parallel_rank()

    expert_parallel_size_ = min(expert_parallel_size_, world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    _EXPERT_DATA_PARALLEL_GROUP = {}

    for num_ep in num_ep_list_:
        # Build the data parallel groups for each num_ep. There are two cases:
        #   1. num_ep >= expert_parallel_size_: every such num_ep can share the group built
        #      for expert_parallel_size_.
        #   2. num_ep < expert_parallel_size_: a new group has to be created.
        if num_ep >= expert_parallel_size_:
            if f"ep_size_{expert_parallel_size_}" not in _EXPERT_DATA_PARALLEL_GROUP:
                for i in range(expert_parallel_size_):
                    # generate all groups
                    ranks = range(i, world_size, expert_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if i == (rank % expert_parallel_size_):
                        # keep the group this rank belongs to
                        _EXPERT_DATA_PARALLEL_GROUP[f"ep_size_{expert_parallel_size_}"] = group
        else:
            for i in range(num_ep):
                ranks = range(i, world_size, num_ep)
                group = torch.distributed.new_group(ranks)
                if i == (rank % num_ep):
                    _EXPERT_DATA_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'
    _EXPERT_PARALLEL_GROUP = {}

    for num_ep in num_ep_list_:
        # Similar to the above, two cases need to be handled.
        if num_ep >= expert_parallel_size_:
            if f"ep_size_{expert_parallel_size_}" not in _EXPERT_PARALLEL_GROUP:
                for i in range(world_size // expert_parallel_size_):
                    ranks = range(i * expert_parallel_size_, (i + 1) * expert_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if i == (rank // expert_parallel_size_):
                        _EXPERT_PARALLEL_GROUP[f"ep_size_{expert_parallel_size_}"] = group
        else:
            for i in range(world_size // num_ep):
                ranks = range(i * num_ep, (i + 1) * num_ep)
                group = torch.distributed.new_group(ranks)
                if i == (rank // num_ep):
                    _EXPERT_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group