def __init__(self,
             hidden_size,
             expert,
             num_experts=1,
             k=1,
             output_dropout_prob=0.0,
             capacity_factor=1.,
             eval_capacity_factor=1.,
             min_capacity=4,
             noisy_gate_policy: typing.Optional[str] = None):
    """Initialize an MoE layer.

    Arguments:
        hidden_size (int): the hidden dimension of the model; importantly, this is also the input and output dimension.
        expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.nn.Linear).
        num_experts (int, optional): default=1, the total number of experts per layer.
        k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
        output_dropout_prob (float, optional): default=0.0, output dropout probability.
        capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
        eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
        min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
        noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
    """
    super(MoE, self).__init__()

    assert groups.is_initialized(), \
        'Please call deepspeed.utils.groups.initialize() before using MoE layers'
    assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
        'Unsupported noisy_gate_policy: ' + noisy_gate_policy

    num_local_experts = num_experts // groups.get_expert_parallel_world_size()
    log_dist(
        f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | expert_parallel_size: {groups.get_expert_parallel_world_size()}',
        [0])

    self.num_experts = num_experts
    experts = Experts(expert, num_local_experts)
    self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                           num_experts,
                                           k,
                                           capacity_factor,
                                           eval_capacity_factor,
                                           min_capacity,
                                           noisy_gate_policy),
                                  experts,
                                  num_local_experts,
                                  group=groups.get_expert_parallel_group())

    self.dropout = torch.nn.Dropout(output_dropout_prob)
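# --- Usage sketch (not part of the source): how this MoE layer might be
# constructed. Assumes deepspeed's expert-parallel groups are initialized
# first, as the assert above requires; `expert_mlp` and the sizes are
# hypothetical illustrations, and the import path reflects deepspeed's
# public MoE module.
import torch
from deepspeed.utils import groups
from deepspeed.moe.layer import MoE

hidden = 512
expert_mlp = torch.nn.Sequential(              # hypothetical expert module
    torch.nn.Linear(hidden, 4 * hidden),
    torch.nn.ReLU(),
    torch.nn.Linear(4 * hidden, hidden))

groups.initialize()                            # must precede MoE construction
moe_layer = MoE(hidden_size=hidden,
                expert=expert_mlp,
                num_experts=8,                 # total experts across all ranks
                k=1,                           # top-1 gating
                output_dropout_prob=0.1)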
def has_overflow(self, params, has_moe_params=None):
    if has_moe_params is None:
        has_moe_params = self.has_moe_params
    overflow = self.has_overflow_serial(params)
    # Since each model parallel GPU carries only part of the model,
    # make sure overflow flag is synced across all the model parallel GPUs
    overflow_gpu = torch.cuda.ByteTensor([overflow])
    # torch.distributed.all_reduce(overflow_gpu,
    #                              op=torch.distributed.ReduceOp.MAX,
    #                              group=mpu.get_model_parallel_group())
    if has_moe_params:
        # All reduce this across expert_parallel_group, so that if an expert
        # overflows, we detect it here
        dist.all_reduce(overflow_gpu,
                        op=dist.ReduceOp.MAX,
                        group=groups.get_expert_parallel_group())
    if self.zero_reduce_scatter:
        torch.distributed.all_reduce(overflow_gpu,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=torch.distributed.group.WORLD)
    elif self.mpu is not None:
        if self.deepspeed is not None:
            using_pipeline = hasattr(self.deepspeed,
                                     'pipeline_enable_backward_allreduce')
            if (using_pipeline
                    and self.deepspeed.pipeline_enable_backward_allreduce is False
                ) or (not using_pipeline
                      and self.deepspeed.enable_backward_allreduce is False):
                torch.distributed.all_reduce(
                    overflow_gpu,
                    op=torch.distributed.ReduceOp.MAX,
                    group=self.mpu.get_data_parallel_group())
        torch.distributed.all_reduce(overflow_gpu,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=self.mpu.get_model_parallel_group())
    elif self.deepspeed is not None and self.deepspeed.enable_backward_allreduce is False:
        torch.distributed.all_reduce(overflow_gpu,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=torch.distributed.group.WORLD)
    overflow = overflow_gpu[0].item()
    return bool(overflow)
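# --- Minimal sketch (an illustration, not from this file) of the sync
# pattern used above: each rank contributes a local overflow byte, and an
# all_reduce with ReduceOp.MAX leaves 1 on every rank if any rank saw an
# overflow. Assumes torch.distributed is already initialized with a CUDA
# backend.
import torch
import torch.distributed as dist

def sync_overflow_flag(local_overflow: bool, group=None) -> bool:
    flag = torch.cuda.ByteTensor([local_overflow])            # 1 if this rank overflowed
    dist.all_reduce(flag, op=dist.ReduceOp.MAX, group=group)  # max over all ranks
    return bool(flag[0].item())                               # True if any rank overflowed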
def check_using_norm(self, norm_group, reduce_overflow=True):
    # TODO: I don't think reduce_overflow is needed if mpu is None
    overflow = -1 in norm_group
    overflow_gpu = torch.cuda.FloatTensor([overflow])
    if self.has_moe_params:
        # In this case, we need to do an all_reduce across
        # the expert_parallel_group, so that if there was
        # an overflow due to expert weights, we detect it
        dist.all_reduce(overflow_gpu,
                        op=dist.ReduceOp.MAX,
                        group=groups.get_expert_parallel_group())
    if self.mpu is not None:
        torch.distributed.all_reduce(overflow_gpu,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=self.mpu.get_model_parallel_group())
    elif reduce_overflow:
        dist.all_reduce(overflow_gpu, op=torch.distributed.ReduceOp.MAX)
        dist.barrier()
    overflow = overflow_gpu[0].item()
    return bool(overflow)
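# --- Hedged sketch of how a `norm_group` entry can become the -1 sentinel
# that check_using_norm scans for. This is an assumption about the caller,
# not code from this file: a gradient norm is replaced with -1.0 when it is
# inf/nan, so overflow can later be detected by value alone.
import math

def grad_norm_or_sentinel(params) -> float:
    total = 0.0
    for p in params:
        if p.grad is not None:
            total += float(p.grad.float().norm()) ** 2
    norm = math.sqrt(total)
    # -1.0 is the overflow sentinel consumed by check_using_norm
    return -1.0 if (math.isinf(norm) or math.isnan(norm)) else norm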
def __init__(self,
             hidden_size,
             expert,
             num_experts=1,
             k=1,
             capacity_factor=1.,
             eval_capacity_factor=1.,
             min_capacity=4,
             noisy_gate_policy: typing.Optional[str] = None,
             drop_tokens: bool = True,
             use_rts=True,
             use_tutel: bool = False):
    """Initialize an MoE layer.

    Arguments:
        hidden_size (int): the hidden dimension of the model; importantly, this is also the input and output dimension.
        expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.nn.Linear).
        num_experts (int, optional): default=1, the total number of experts per layer.
        k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
        capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
        eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
        min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
        noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        drop_tokens (bool, optional): default=True, whether to drop tokens; setting this to False is equivalent to infinite capacity.
        use_rts (bool, optional): default=True, whether to use Random Token Selection.
        use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
    """
    super(MoE, self).__init__()

    assert groups.is_initialized(), \
        'Please call deepspeed.utils.groups.initialize() before using MoE layers'
    assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
        'Unsupported noisy_gate_policy: ' + noisy_gate_policy

    num_local_experts = num_experts // groups.get_expert_parallel_world_size()
    log_dist(
        f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | expert_parallel_size: {groups.get_expert_parallel_world_size()}',
        [0])

    self.num_experts = num_experts
    experts = Experts(expert, num_local_experts)
    self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                           num_experts,
                                           k,
                                           capacity_factor,
                                           eval_capacity_factor,
                                           min_capacity,
                                           noisy_gate_policy,
                                           drop_tokens,
                                           use_rts),
                                  experts,
                                  num_local_experts,
                                  group=groups.get_expert_parallel_group(),
                                  use_tutel=use_tutel)
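# --- Usage sketch for this newer signature (illustrative values only;
# `expert_mlp` is the hypothetical expert module from the earlier sketch).
# drop_tokens=False disables capacity-based token dropping, and
# use_tutel=True opts into Tutel kernels when that optional package is
# installed.
moe_layer = MoE(hidden_size=512,
                expert=expert_mlp,
                num_experts=8,
                k=2,                    # top-2 gating
                drop_tokens=False,      # effectively infinite capacity
                use_rts=True,           # Random Token Selection
                use_tutel=True)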