Example #1
    def append(self, raw_name, record_name, latency, msg_size):
        import deepspeed.comm as dist
        algbw, busbw = calc_bw_log(raw_name, msg_size, latency)
        if record_name in self.comms_dict.keys():
            # If this comm_op has already been logged with this message size, just add to existing record
            if msg_size in self.comms_dict[record_name].keys():
                self.comms_dict[record_name][msg_size][0] += 1
                self.comms_dict[record_name][msg_size][1].append(latency)
                self.comms_dict[record_name][msg_size][2].append(algbw)
                self.comms_dict[record_name][msg_size][3].append(busbw)
            # If this is a new message size for this comm_op, add new record under existing comm_op
            else:
                self.comms_dict[record_name][msg_size] = [
                    1, [latency], [algbw], [busbw]
                ]
        else:
            # Create entirely new record
            self.comms_dict[record_name] = {
                msg_size: [1, [latency], [algbw], [busbw]]
            }
        # If verbose, print every comm op
        # TODO: Add to tensorboard
        if self.verbose:
            n = dist.get_world_size()
            log_str = f"rank={dist.get_rank()} | comm op: " + record_name + " | time (ms): {:.2f}".format(
                latency)
            log_str += " | msg size: " + convert_size(msg_size)
            log_str += " | algbw (Gbps): {:.2f} ".format(algbw)
            log_str += " | busbw (Gbps): {:.2f} ".format(busbw)
            log_dist(log_str, [0])
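For reference, a minimal sketch (plain Python, with made-up latency and bandwidth numbers instead of real calc_bw_log output) of the nested record layout that append() builds:

# Sketch only, not DeepSpeed code: keys are record names, then message sizes;
# each value holds [call_count, [latencies_ms], [algbw_list], [busbw_list]].
comms_dict = {}

def record(comms_dict, record_name, msg_size, latency, algbw, busbw):
    entry = comms_dict.setdefault(record_name, {})
    if msg_size in entry:
        entry[msg_size][0] += 1
        entry[msg_size][1].append(latency)
        entry[msg_size][2].append(algbw)
        entry[msg_size][3].append(busbw)
    else:
        entry[msg_size] = [1, [latency], [algbw], [busbw]]

# Illustrative values only; real numbers come from calc_bw_log().
record(comms_dict, "all_reduce", 4 * 1024 * 1024, 1.8, 37.3, 65.2)
record(comms_dict, "all_reduce", 4 * 1024 * 1024, 1.7, 39.5, 69.1)
print(comms_dict)
# {'all_reduce': {4194304: [2, [1.8, 1.7], [37.3, 39.5], [65.2, 69.1]]}}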
Example #2
    def valid_step(self, batch_itr):
        if self.model.global_steps % self.fs_args.validate_interval_updates != 0:
            return
        with torch.no_grad():
            self.model.eval()
            for subset in batch_itr.valid_dataset():
                with metrics.aggregate(new_root=True) as agg:
                    for batch, is_dummy_batch in batch_itr.valid_batch():
                        _, sample_size, logging_output = self.task.valid_step(
                            batch, self.model.module.model, self.model.module.criterion
                        )
                        logging_outputs = [logging_output]
                        if is_dummy_batch:
                            if torch.is_tensor(sample_size):
                                sample_size.zero_()
                            else:
                                sample_size *= 0.0
                        logging_outputs, (sample_size,) = torch_reduce_sum(
                            self.model.device,
                            logging_outputs,
                            sample_size,
                            ignore=is_dummy_batch,
                        )
                        logging_output = self.reduce_log(logging_outputs, sample_size)
                log_dist(
                    "Valid on step: {}, dataset: {}. {}".format(
                        self.model.global_steps,
                        subset,
                        view_log(agg.get_smoothed_values()),
                    ),
                    ranks=[0],
                )
Example #3
    def __init__(self, theta=0.5, gamma=0.001):
        super().__init__()

        self.theta = theta
        self.gamma = gamma
        self.current_theta = 1.0
        log_dist(f'Enabled progressive layer dropping (theta = {self.theta})', ranks=[0])
Example #4
def _create_expert_data_and_model_parallel(expert_parallel_size_, mpu):
    """
        Create expert and data parallel groups based on MPU (model parallel) group.

        Note: Caller of this function is responsible to check if the groups already exist.

        Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4 # number of experts in same group
        mp_group = [0, 1], [2,3], [4,5] ...
        data_parallel_group =[0,2,4,6,8,10, 12,14],                 [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14]             [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14],    [1,9],[3,11],[5,13],[7,15]
    """
    assert torch.distributed.is_initialized(
    ), "torch distributed is not initialized"
    assert mpu.model_parallel_is_initialized(
    ), "model parallel group is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    log_dist(
        f"Creating deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, world size {world_size}, dp world size {dp_world_size}",
        [0])

    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    expert_parallel_size_ = min(expert_parallel_size_, dp_world_size)
    _ensure_divisibility(world_size, expert_parallel_size_)

    group_name = f"ep_size_{expert_parallel_size_}"

    # Only create groups if they don't already exist
    # Need to check conditions outside the group creation loop because of the way torch.dist group creation works
    if group_name not in _EXPERT_DATA_PARALLEL_GROUP and group_name not in _EXPERT_PARALLEL_GROUP:
        for j in range(model_parallel_size_):
            for i in range(expert_parallel_size_):
                ranks = range(i * model_parallel_size_ + j, world_size,
                              expert_parallel_size_ * model_parallel_size_)
                group = torch.distributed.new_group(ranks)
                if rank in list(ranks):
                    _EXPERT_DATA_PARALLEL_GROUP[group_name] = group

            for i in range(dp_world_size // expert_parallel_size_):
                ranks = range(i * expert_parallel_size_ * model_parallel_size_ + j,
                              (i + 1) * expert_parallel_size_ * model_parallel_size_,
                              model_parallel_size_)
                group = torch.distributed.new_group(ranks)
                if rank in list(ranks):
                    _EXPERT_PARALLEL_GROUP[group_name] = group
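To make the docstring layout above concrete, here is a small standalone sketch (plain Python ranges, no torch.distributed, illustrative only) that reproduces the rank groupings for world_size=16, model degree 2, expert degree 4:

# Sketch only: reproduce the E + M + D rank layout from the docstring above.
world_size, model_parallel_size, expert_parallel_size = 16, 2, 4
dp_world_size = world_size // model_parallel_size

expert_data_groups, expert_groups = [], []
for j in range(model_parallel_size):
    for i in range(expert_parallel_size):
        expert_data_groups.append(list(range(i * model_parallel_size + j, world_size,
                                             expert_parallel_size * model_parallel_size)))
    for i in range(dp_world_size // expert_parallel_size):
        expert_groups.append(list(range(i * expert_parallel_size * model_parallel_size + j,
                                        (i + 1) * expert_parallel_size * model_parallel_size,
                                        model_parallel_size)))

print(expert_groups)       # [[0, 2, 4, 6], [8, 10, 12, 14], [1, 3, 5, 7], [9, 11, 13, 15]]
print(expert_data_groups)  # [[0, 8], [2, 10], [4, 12], [6, 14], [1, 9], [3, 11], [5, 13], [7, 15]]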
Example #5
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 k=1,
                 output_dropout_prob=0.0,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.

            expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).

            num_experts (int, optional): default=1, the total number of experts per layer.

            k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.

            output_dropout_prob (float, optional): default=0.0, output dropout probability.

            capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.

            eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.

            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.

            noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
        """

        super(MoE, self).__init__()

        assert groups.is_initialized(), \
            'Please call deepspeed.utils.groups.initialize() before using MoE layers'
        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        num_local_experts = num_experts // groups.get_expert_parallel_world_size()

        log_dist(
            f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | expert_parallel_size: {groups.get_expert_parallel_world_size()}',
            [0])

        self.num_experts = num_experts
        experts = Experts(expert, num_local_experts)
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size,
                                               num_experts,
                                               k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity,
                                               noisy_gate_policy),
                                      experts,
                                      num_local_experts,
                                      group=groups.get_expert_parallel_group())

        self.dropout = torch.nn.Dropout(output_dropout_prob)
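A hedged usage sketch for the layer above. It assumes the groups-based API shown in these examples (deepspeed.utils.groups.initialize) and an already-initialized torch.distributed job; the import path and forward return values may differ across DeepSpeed versions:

# Hedged sketch, not a drop-in recipe: must run inside an initialized distributed job.
import torch
import deepspeed.utils.groups as groups
from deepspeed.moe.layer import MoE   # import path assumed for this DeepSpeed version

groups.initialize(ep_size=2)          # expert parallel size of 2, as in the initialize() examples below

hidden_size = 512
expert = torch.nn.Sequential(         # any torch.nn.Module can serve as the expert
    torch.nn.Linear(hidden_size, 4 * hidden_size),
    torch.nn.ReLU(),
    torch.nn.Linear(4 * hidden_size, hidden_size),
)
moe = MoE(hidden_size, expert, num_experts=8, k=1)
# The layer's forward returns the routed output plus auxiliary gating loss terms;
# the exact tuple shape depends on the DeepSpeed version.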
Example #6
def initialize(ep_size=1, mpu=None, num_ep_list=None):
    """
    Process groups initialization supporting expert (E), data (D), and model (M) parallelism. DeepSpeed considers
    the following scenarios w.r.t. process group creation.

    * S1: There is no expert parallelism or model parallelism, only data (D)::

        model = my_model(args)
        engine = deepspeed.initialize(model) # initialize groups without mpu

    * S2: There is expert parallelism but no model parallelism (E+D)::

        deepspeed.utils.groups.initialize(ep_size) # groups will be initialized here
        model = my_model(args)
        engine = deepspeed.initialize(model)

    * S3: There is model parallelism but no expert parallelism (M)::

        mpu.init() # client initializes its model parallel unit
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu) # init w. mpu but ep_size = dp_world_size

    * S4: There is model, data, and expert parallelism (E+D+M)::

        mpu.init() # client initializes its model parallel unit
        deepspeed.utils.groups.initialize(ep_size, mpu) # initialize expert groups wrt mpu
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu) # passing mpu is optional in this case

    Arguments:
        ep_size (int, optional): default=1, maximum expert parallel size; it must divide or be divided
            by each element in num_ep_list, and the world size must be divisible by it.
        mpu (module, optional): default=None, model parallel unit (e.g., from Megatron)
            that describes model/data parallel ranks.
        num_ep_list (list, optional): default=None, list of number of expert parallel sizes in each MoE layer.

    """

    if num_ep_list is None:
        num_ep_list = [ep_size]

    assert max(
        num_ep_list
    ) >= ep_size, f"ep_size={ep_size} is larger than the largest num_ep_list={max(num_ep_list)}, you should reduce expert parallel size"

    num_ep_list = list(set(num_ep_list))  # remove duplicates
    num_ep_list.sort()  # sort in ascending order
    for num_ep in num_ep_list:
        assert num_ep > 0, 'num_ep must be positive'
        assert num_ep % ep_size == 0 or ep_size % num_ep == 0, 'num_ep must be divisible/divided by ep_size'

    if mpu is not None:
        log_dist(message="initializing deepspeed groups using mpu", ranks=[0])
        initialize_model_and_expert_parallel(ep_size, mpu, num_ep_list)
    else:
        log_dist(message="initializing deepspeed groups", ranks=[0])
        initialize_model_parallel(1)
        initialize_expert_parallel(ep_size, num_ep_list)
Example #7
def initialize(
    args,
    model,
    optimizer=None,
    model_parameters=None,
    training_data=None,
    lr_scheduler=None,
    mpu=None,
    dist_init_required=None,
    collate_fn=None,
    config_params=None,
):
    log_dist(
        "DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
            __version__, __git_hash__, __git_branch__),
        ranks=[0],
    )

    if not isinstance(model, PipelineModule):
        engine = DSEngine(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=model_parameters,
            training_data=training_data,
            lr_scheduler=lr_scheduler,
            mpu=mpu,
            dist_init_required=dist_init_required,
            collate_fn=collate_fn,
            config_params=config_params,
        )
    else:
        assert mpu is None, "mpu must be None with pipeline parallelism"
        engine = PipelineEngine(
            args=args,
            model=model,
            optimizer=optimizer,
            model_parameters=model_parameters,
            training_data=training_data,
            lr_scheduler=lr_scheduler,
            mpu=model.mpu(),
            dist_init_required=dist_init_required,
            collate_fn=collate_fn,
            config_params=config_params,
        )

    return_items = [
        engine,
        engine.optimizer,
        engine.training_dataloader,
        engine.lr_scheduler,
    ]
    return tuple(return_items)
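A hedged sketch of how the returned tuple is typically consumed. MyModel, args, and data_loader are hypothetical placeholders supplied by the caller, and args is assumed to carry the DeepSpeed configuration (e.g. a deepspeed_config path):

# Hedged sketch: typical call pattern for the initialize() wrapper above.
model = MyModel()                                 # hypothetical torch.nn.Module
engine, optimizer, training_dataloader, lr_scheduler = initialize(
    args,
    model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
)

for batch in data_loader:                         # hypothetical data source
    loss = engine(batch)                          # forward through the wrapped model
    engine.backward(loss)                         # the engine handles loss scaling internally
    engine.step()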
Example #8
def tmp():
    fs_args, ds_config = gen_ds_fairseq_arg()
    set_seed(fs_args.seed)
    task = tasks.setup_task(fs_args)
    trainer = DsFairseqTrainer(fs_args, ds_config, task)
    batch_itr = BatchIterator(fs_args, task)
    for epoch in batch_itr.train_epoch():
        train(batch_itr, trainer)
        log_dist(
            f'Finish epoch {epoch}, \
            {view_log(metrics.get_smoothed_values("train"))}',
            [0],
        )
        metrics.reset_meters("train")
Example #9
def initialize_model_parallel(model_parallel_size_):
    """
    Initialize model data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize model.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model. The present function will
    create 4 model parallel groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    log_dist(
        'initializing deepspeed model parallel group with size {}'.format(
            model_parallel_size_), [0])
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size = torch.distributed.get_world_size()
    model_parallel_size = min(model_parallel_size_, world_size)
    ensure_divisibility(world_size, model_parallel_size)
    rank = torch.distributed.get_rank()

    # Build the data parallel groups.
    global _DATA_PARALLEL_GROUP
    assert _DATA_PARALLEL_GROUP is None, \
        'data parallel group is already initialized'
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group

    # Build the model parallel groups.
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, \
        'model parallel group is already initialized'
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group
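The same layout as the docstring (8 GPUs, model_parallel_size=2), computed here with plain Python ranges as a quick standalone check:

# Sketch only: the rank layout produced by the function above.
world_size, model_parallel_size = 8, 2

data_parallel_groups = [list(range(i, world_size, model_parallel_size))
                        for i in range(model_parallel_size)]
model_parallel_groups = [list(range(i * model_parallel_size, (i + 1) * model_parallel_size))
                         for i in range(world_size // model_parallel_size)]

print(data_parallel_groups)   # [[0, 2, 4, 6], [1, 3, 5, 7]]
print(model_parallel_groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]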
Example #10
def _create_model_parallel(model_parallel_size_):
    """
    Initialize model data parallel groups.

    Arguments:
        model_parallel_size: number of GPUs used to parallelize model.

    Returns:
        Tuple of data parallel group and model parallel group

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model. The present function will
    create 4 model parallel groups and 2 data parallel groups as:
        4 model parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 data parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    log_dist(f'Creating model parallel group with size {model_parallel_size_}',
             ranks=[0])
    # Get world size and rank. Ensure some consistencies.
    assert dist.is_initialized()
    world_size = dist.get_world_size()
    model_parallel_size = min(model_parallel_size_, world_size)
    _ensure_divisibility(world_size, model_parallel_size)
    rank = dist.get_rank()

    _DATA_PARALLEL_GROUP = None
    _MODEL_PARALLEL_GROUP = None
    # Build the data parallel groups.
    for i in range(model_parallel_size):
        ranks = range(i, world_size, model_parallel_size)
        group = dist.new_group(ranks)
        if i == (rank % model_parallel_size):
            _DATA_PARALLEL_GROUP = group

    # Build the model parallel groups.
    for i in range(world_size // model_parallel_size):
        ranks = range(i * model_parallel_size, (i + 1) * model_parallel_size)
        group = dist.new_group(ranks)
        if i == (rank // model_parallel_size):
            _MODEL_PARALLEL_GROUP = group

    return _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
Example #11
def initialize(ep_size=1, mpu=None):
    """
    Process groups initialization supporting expert (E), data (D), and model (M) parallelism. DeepSpeed considers
    the following scenarios w.r.t. process group creation.

    * S1: There is no expert parallelism or model parallelism, only data (D)::

        model = my_model(args)
        engine = deepspeed.initialize(model) # initialize groups without mpu

    * S2: There is expert parallelism but no model parallelism (E+D)::

        deepspeed.utils.groups.initialize(ep_size) # groups will be initialized here
        model = my_model(args)
        engine = deepspeed.initialize(model)

    * S3: There is model parallelism but no expert parallelism (M)::

        mpu.init() # client initializes its model parallel unit
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu) # init w. mpu but ep_size = dp_world_size

    * S4: There is model, data, and expert parallelism (E+D+M)::

        mpu.init() # client initializes its model parallel unit
        deepspeed.utils.groups.initialize(ep_size, mpu) # initialize expert groups wrt mpu
        model = my_model(args)
        engine = deepspeed.initialize(model, mpu=mpu) # passing mpu is optional in this case

    Arguments:
        ep_size (int, optional): default=1, expert parallel size
        mpu (module, optional): default=None, model parallel unit (e.g., from Megatron)
            that describes model/data parallel ranks.

    """
    if mpu is not None:
        log_dist(message="initializing deepspeed groups using mpu", ranks=[0])
        initialize_model_and_expert_parallel(ep_size, mpu)
    else:
        log_dist(message="initializing deepspeed groups", ranks=[0])
        initialize_model_parallel(1)
        initialize_expert_parallel(ep_size)
Example #12
    def __init__(self,
                 verbose=False,
                 max_iter=100,
                 tol=1e-2,
                 stability=0,
                 gas_boundary_resolution=1,
                 layer_name='',
                 layer_num=0):
        super().__init__()

        self.verbose = verbose
        self.max_iter = max_iter
        self.tol = tol
        self.stability = stability
        self.gas_boundary_resolution = gas_boundary_resolution
        self.layer_name = layer_name
        self.layer_num = layer_num

        assert len(self.layer_name) > 0 and layer_num > 0

        log_dist(
            f'enabled eigenvalue with verbose={verbose}, max_iter={max_iter}, tol={tol}, stability={stability}, gas_boundary_resolution={gas_boundary_resolution}, layer_name={layer_name}, layer_num={layer_num}',
            ranks=[0])
Example #13
    def train_step(self, sample, is_dummy_batch):
        self.model.train()
        self.model.zero_grad()

        loss, sample_size, logging_output = self.model(sample)

        if is_dummy_batch:
            if torch.is_tensor(sample_size):
                sample_size.zero_()
            else:
                sample_size *= 0.0
            loss *= 0.0
        if torch.is_tensor(sample_size):
            sample_size = sample_size.float()
        else:
            sample_size = float(sample_size)

        logging_outputs, (sample_size, ) = torch_reduce_sum(
            self.model.device, [logging_output],
            sample_size,
            ignore=is_dummy_batch)

        final_loss = loss * (dist.get_world_size() / sample_size)
        self.model.backward(final_loss)
        self.model.step()

        logging_output = self.reduce_log(logging_outputs, sample_size)

        if self.model.global_steps % self.model.steps_per_print() != 0:
            return

        log_dist(
            f'Step: {self.model.global_steps}, \
            {view_log(metrics.get_smoothed_values("train_inner"))}',
            [0],
        )
        metrics.reset_meters("train_inner")
Example #14
def _create_expert_and_data_parallel(ep_size):
    """
        Create expert and data parallel groups.

        Note: Caller of this function is responsible to check if the groups already exist.

        Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2 # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert torch.distributed.is_initialized()

    log_dist(f'Creating expert and data parallel groups with size {ep_size}',
             ranks=[0])
    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()

    expert_parallel_size_ = min(ep_size, world_size)
    _ensure_divisibility(world_size, expert_parallel_size_)

    group_name = f"ep_size_{expert_parallel_size_}"

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP

    # Only create group if it does not already exist
    if group_name not in _EXPERT_DATA_PARALLEL_GROUP:
        for i in range(expert_parallel_size_):
            ranks = range(i, world_size, expert_parallel_size_)
            group = torch.distributed.new_group(ranks)
            log_dist(
                f'Creating expert data parallel process group named {group_name} with ranks: {list(ranks)}',
                [0])
            if i == (rank % expert_parallel_size_):
                _EXPERT_DATA_PARALLEL_GROUP[group_name] = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP

    # Only create group if it does not already exist
    if group_name not in _EXPERT_PARALLEL_GROUP:
        for i in range(world_size // expert_parallel_size_):
            ranks = range(i * expert_parallel_size_,
                          (i + 1) * expert_parallel_size_)
            group = torch.distributed.new_group(ranks)
            log_dist(
                f'creating expert parallel process group named {group_name} with ranks: {list(ranks)}',
                [0])
            if i == (rank // expert_parallel_size_):
                _EXPERT_PARALLEL_GROUP[group_name] = group
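A standalone sketch of the E + D layout described in the docstring (world_size=16, expert_parallel_size=2), using plain Python ranges instead of torch.distributed groups:

# Sketch only: which ranks end up in which group for the docstring's setting.
world_size, ep_size = 16, 2

expert_data_parallel_groups = [list(range(i, world_size, ep_size)) for i in range(ep_size)]
expert_parallel_groups = [list(range(i * ep_size, (i + 1) * ep_size))
                          for i in range(world_size // ep_size)]

print(expert_data_parallel_groups)  # [[0, 2, 4, 6, 8, 10, 12, 14], [1, 3, 5, 7, 9, 11, 13, 15]]
print(expert_parallel_groups)       # [[0, 1], [2, 3], [4, 5], ..., [14, 15]]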
Example #15
def initialize_expert_parallel(expert_parallel_size_):
    """
        Initialize expert plus data parallel groups.

        Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2 # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert torch.distributed.is_initialized()

    log_dist(
        'initializing deepspeed expert parallel group with size {}'.format(
            expert_parallel_size_), [0])
    world_size = get_data_parallel_world_size()
    rank = get_data_parallel_rank()

    expert_parallel_size_ = min(expert_parallel_size_, world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    for i in range(expert_parallel_size_):
        ranks = range(i, world_size, expert_parallel_size_)
        group = torch.distributed.new_group(ranks)

        # TODO: remove
        log_dist(
            f'creating expert data parallel process group with ranks: {list(ranks)}',
            [0])
        if i == (rank % expert_parallel_size_):
            _EXPERT_DATA_PARALLEL_GROUP = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'
    for i in range(world_size // expert_parallel_size_):
        ranks = range(i * expert_parallel_size_,
                      (i + 1) * expert_parallel_size_)
        group = torch.distributed.new_group(ranks)

        # TODO: remove
        log_dist(
            f'creating expert parallel process group with ranks: {list(ranks)}',
            [0])
        if i == (rank // expert_parallel_size_):
            _EXPERT_PARALLEL_GROUP = group
Example #16
def flatten_dense_tensors_sub_partition_aligned(tensor_list,
                                                dp,
                                                max_elements_per_comm,
                                                pg):
    assert max_elements_per_comm >= dp, f"max_elements_per_comm {max_elements_per_comm} < dp {dp}"

    num_elements = sum(t.numel() for t in tensor_list)
    log_dist("Total number of elements in model: {}, max elements per com: {}".format(
        num_elements,
        max_elements_per_comm),
             ranks=[0])

    # Compute aligned partition size based on parameter count
    aligned_param_partition_size = math.ceil(num_elements / dp)

    # Compute aligned partition size based on communication size
    aligned_comm_partition_size = int(max_elements_per_comm // dp)

    if aligned_param_partition_size <= aligned_comm_partition_size:
        sub_partition_count = 1
        sub_partition_size = aligned_param_partition_size
    else:
        sub_partition_count = math.ceil(aligned_param_partition_size /
                                        aligned_comm_partition_size)
        sub_partition_size = aligned_comm_partition_size

    # Compute required padding for alignment to dp and max_elements_per_comm
    padding = (sub_partition_count * sub_partition_size * dp) - num_elements

    log_dist(
        f"sub_partition_count: {sub_partition_count}, sub_partition_size: {sub_partition_size}, padding: {padding}",
        ranks=[0])
    log_dist(
        f"number of elements with padding: {num_elements} + {padding} = {num_elements + padding}",
        ranks=[0])

    if padding == 0:
        aligned_tensor_list = tensor_list
    else:
        pad_tensor = torch.zeros(padding,
                                 device=tensor_list[0].device,
                                 dtype=tensor_list[0].dtype)
        aligned_tensor_list = tensor_list + [pad_tensor]

    return _flatten_dense_tensors(aligned_tensor_list)
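A worked example of the sizing logic above with illustrative numbers (1000 elements, dp=4, max_elements_per_comm=300); it mirrors the arithmetic only and is not DeepSpeed code:

# Sketch only: sub-partition sizing and padding with made-up numbers.
import math

num_elements, dp, max_elements_per_comm = 1000, 4, 300

aligned_param_partition_size = math.ceil(num_elements / dp)        # 250
aligned_comm_partition_size = int(max_elements_per_comm // dp)     # 75

if aligned_param_partition_size <= aligned_comm_partition_size:
    sub_partition_count, sub_partition_size = 1, aligned_param_partition_size
else:
    sub_partition_count = math.ceil(aligned_param_partition_size /
                                    aligned_comm_partition_size)   # ceil(250 / 75) = 4
    sub_partition_size = aligned_comm_partition_size               # 75

padding = sub_partition_count * sub_partition_size * dp - num_elements  # 4*75*4 - 1000 = 200
print(sub_partition_count, sub_partition_size, padding)  # 4 75 200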
Example #17
    def step(self, closure=None):
        """
        Not supporting closure.
        """

        if self.fused_adam_legacy:
            return self.step_fused_adam()

        COMPUTE_NORM = "compute_norm"
        OVERFLOW_CHECK = 'overflow_check'
        OVERFLOW_TIMERS = [COMPUTE_NORM, OVERFLOW_CHECK]
        UNSCALE_AND_CLIP = 'unscale_and_clip'
        BASIC_STEP = 'basic_step'
        UPDATE_FP16 = 'update_fp16'
        STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

        # First determine if there is overflow.
        self.start_timers([OVERFLOW_CHECK])
        fp16_params = []
        for i, group in enumerate(self.fp16_groups):
            fp16_params.extend([p for p in group if p.grad is not None])
        self.overflow = self.overflow_checker.has_overflow(fp16_params)
        self.stop_timers([OVERFLOW_CHECK])
        prev_scale = self.cur_scale
        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                log_dist(
                    "Overflow detected. Skipping step. Attempted loss "
                    f"scale: {prev_scale}, reducing to {self.cur_scale}",
                    ranks=[0])
            # Clear gradients
            for i, group in enumerate(self.fp16_groups):
                for p in group:
                    p.grad = None

            self.log_timers(OVERFLOW_TIMERS)
            return self.overflow

        grads_groups_flat = []
        for i, group in enumerate(self.fp16_groups):
            data_type = self.fp32_groups_flat[i].dtype

            grads_groups_flat.append(
                _flatten_dense_tensors([
                    torch.zeros(p.size(),
                                dtype=data_type,
                                device=p.device)
                    if p.grad is None else p.grad.to(data_type) for p in group
                ]))

            for p in group:
                p.grad = None

            self.fp32_groups_flat[i].grad = grads_groups_flat[i]

        self.start_timers([COMPUTE_NORM])
        all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
        self.stop_timers([COMPUTE_NORM])

        self.start_timers([UNSCALE_AND_CLIP])
        self.unscale_and_clip_grads(grads_groups_flat, [all_groups_norm])
        self.stop_timers([UNSCALE_AND_CLIP])

        self.start_timers([BASIC_STEP])
        self.optimizer.step()
        self.stop_timers([BASIC_STEP])

        #get rid of the fp32 gradients. Not needed anymore
        for group in self.fp32_groups_flat:
            group.grad = None

        self.start_timers([UPDATE_FP16])
        for i in range(len(self.fp16_groups)):
            updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data.copy_(q.data)
        self.stop_timers([UPDATE_FP16])

        self.log_timers(STEP_TIMERS)

        return self.overflow
Example #18
def initialize_model_and_expert_parallel(expert_parallel_size_,
                                         mpu,
                                         num_ep_list_=None):
    """
        Initialize Expert groups based on MPU groups.

        Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4 # number of experts in same group
        mp_group = [0, 1], [2,3], [4,5] ...
        data_parallel_group =[0,2,4,6,8,10, 12,14],                 [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14]             [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14],    [1,9],[3,11],[5,13],[7,15]
    """
    assert torch.distributed.is_initialized(
    ), "torch distributed is not initialized"
    assert mpu.model_parallel_is_initialized(
    ), "model parallel group is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    global _MAX_EP_SIZE
    global _MAX_EP_SIZE_NAME
    _MAX_EP_SIZE = expert_parallel_size_
    _MAX_EP_SIZE_NAME = f"ep_size_{expert_parallel_size_}"

    if num_ep_list_ is None:
        num_ep_list_ = [expert_parallel_size_]

    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    log_dist(
        f"Initializing deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, world size {world_size}, dp world size {dp_world_size}",
        [0])

    global _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    expert_parallel_size_ = min(expert_parallel_size_, dp_world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    # Build the expert parallel groups.
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'

    _EXPERT_DATA_PARALLEL_GROUP = {}
    _EXPERT_PARALLEL_GROUP = {}

    for num_ep in num_ep_list_:
        for j in range(model_parallel_size_):
            # For data parallel
            # Similar to initialize_expert_parallel, we need to handle two cases
            if num_ep >= expert_parallel_size_:
                #TODO: refactor this part of code to check condition in outer for-loop
                if True:  #f"ep_size_{expert_parallel_size_}" not in _EXPERT_DATA_PARALLEL_GROUP:
                    for i in range(expert_parallel_size_):
                        ranks = range(
                            i * model_parallel_size_ + j, world_size,
                            expert_parallel_size_ * model_parallel_size_)
                        group = torch.distributed.new_group(ranks)
                        if rank in list(ranks):
                            _EXPERT_DATA_PARALLEL_GROUP[
                                f"ep_size_{expert_parallel_size_}"] = group
            else:
                for i in range(num_ep):
                    ranks = range(i * model_parallel_size_ + j, world_size,
                                  num_ep * model_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if rank in list(ranks):
                        _EXPERT_DATA_PARALLEL_GROUP[
                            f"ep_size_{num_ep}"] = group

            # For expert parallel
            if num_ep >= expert_parallel_size_:
                #TODO: refactor this part of code to check condition in outer for-loop
                if True:  #f"ep_size_{expert_parallel_size_}" not in _EXPERT_PARALLEL_GROUP:
                    for i in range(dp_world_size // expert_parallel_size_):
                        ranks = range(
                            i * expert_parallel_size_ * model_parallel_size_ +
                            j, (i + 1) * expert_parallel_size_ *
                            model_parallel_size_, model_parallel_size_)
                        group = torch.distributed.new_group(ranks)
                        if rank in list(ranks):
                            _EXPERT_PARALLEL_GROUP[
                                f"ep_size_{expert_parallel_size_}"] = group
            else:
                for i in range(dp_world_size // num_ep):
                    ranks = range(i * num_ep * model_parallel_size_ + j,
                                  (i + 1) * num_ep * model_parallel_size_,
                                  model_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if rank in list(ranks):
                        _EXPERT_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group
Example #19
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 k=1,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 noisy_gate_policy: typing.Optional[str] = None,
                 drop_tokens: bool = True,
                 use_rts=True,
                 use_tutel: bool = False):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.

            expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).

            num_experts (int, optional): default=1, the total number of experts per layer.

            k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.

            capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.

            eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.

            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.

            noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.

            drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).

            use_rts (bool, optional): default=True, whether to use Random Token Selection.

            use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
        """

        super(MoE, self).__init__()

        assert groups.is_initialized(), \
            'Please call deepspeed.utils.groups.initialize() before using MoE layers'
        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        num_local_experts = num_experts // groups.get_expert_parallel_world_size(
        )

        log_dist(
            f'num_experts: {num_experts} | num_local_experts: {num_local_experts} | expert_parallel_size: {groups.get_expert_parallel_world_size()}',
            [0])

        self.num_experts = num_experts
        experts = Experts(expert, num_local_experts)
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity, noisy_gate_policy,
                                               drop_tokens, use_rts),
                                      experts,
                                      num_local_experts,
                                      group=groups.get_expert_parallel_group(),
                                      use_tutel=use_tutel)
Example #20
    def __init__(self,
                 hidden_size,
                 expert,
                 num_experts=1,
                 ep_size=1,
                 k=1,
                 capacity_factor=1.,
                 eval_capacity_factor=1.,
                 min_capacity=4,
                 use_residual=False,
                 noisy_gate_policy: typing.Optional[str] = None,
                 drop_tokens: bool = True,
                 use_rts=True,
                 use_tutel: bool = False):
        """Initialize an MoE layer.

        Arguments:
            hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
            expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
            num_experts (int, optional): default=1, the total number of experts per layer.
            ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
            k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
            capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
            eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
            min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
            use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
            noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
            drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
            use_rts (bool, optional): default=True, whether to use Random Token Selection.
            use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
        """

        super(MoE, self).__init__()

        self.use_residual = use_residual
        self.ep_size = min(
            ep_size, num_experts
        )  # the ep size should not exceed the number of experts
        self.expert_group_name = f"ep_size_{self.ep_size}"
        self.num_experts = num_experts
        self.num_local_experts = 1 if num_experts < ep_size else num_experts // ep_size

        log_dist(
            f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {ep_size}',
            [0])

        assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
            'Unsupported noisy_gate_policy: ' + noisy_gate_policy

        experts = Experts(expert, self.num_local_experts,
                          self.expert_group_name)
        self.deepspeed_moe = MOELayer(TopKGate(hidden_size, num_experts, k,
                                               capacity_factor,
                                               eval_capacity_factor,
                                               min_capacity, noisy_gate_policy,
                                               drop_tokens, use_rts),
                                      experts,
                                      self.expert_group_name,
                                      self.ep_size,
                                      self.num_local_experts,
                                      use_tutel=use_tutel)
        if self.use_residual:
            self.mlp = expert
            # coefficient is used for weighted sum of the output of expert and mlp
            self.coefficient = torch.nn.Linear(hidden_size, 2)
Example #21
    def create(self, tag):
        log_dist(f"[Nebula] Start Checkpoint for tag:{tag}", ranks=[0])
        # -2 means: the caller needs to explicitly tell Nebula that the
        # current checkpoint is complete via the commit method.
        self.checkpoint = torch_nebula.Checkpoint(tag, -2)
Example #22
    def step(self, closure=None):
        """
        Not supporting closure.
        """

        if self.fused_adam_legacy:
            return self.step_fused_adam()

        COMPUTE_NORM = "compute_norm"
        OVERFLOW_CHECK = 'overflow_check'
        OVERFLOW_TIMERS = [COMPUTE_NORM, OVERFLOW_CHECK]
        UNSCALE_AND_CLIP = 'unscale_and_clip'
        BASIC_STEP = 'basic_step'
        UPDATE_FP16 = 'update_fp16'
        STEP_TIMERS = OVERFLOW_TIMERS + [UNSCALE_AND_CLIP, BASIC_STEP, UPDATE_FP16]

        # First determine if there is overflow.
        self.start_timers([OVERFLOW_CHECK])
        fp16_params = []
        for i, group in enumerate(self.fp16_groups):
            fp16_params.extend([p for p in group if p.grad is not None])
        self.overflow = self.overflow_checker.has_overflow(fp16_params)
        self.stop_timers([OVERFLOW_CHECK])
        prev_scale = self.cur_scale
        self._update_scale(self.overflow)
        if self.overflow:
            if self.verbose:
                log_dist(
                    "Overflow detected. Skipping step. Attempted loss "
                    f"scale: {prev_scale}, reducing to {self.cur_scale}",
                    ranks=[0])
            # Clear gradients
            for i, group in enumerate(self.fp16_groups):
                for p in group:
                    p.grad = None

            self.log_timers(OVERFLOW_TIMERS)
            return self.overflow

        grads_groups_flat = []
        for i, group in enumerate(self.fp16_groups):
            data_type = self.fp32_groups_flat[i].dtype

            grads_groups_flat.append(
                _flatten_dense_tensors([
                    torch.zeros(p.size(),
                                dtype=data_type,
                                device=p.device)
                    if p.grad is None else p.grad.to(data_type) for p in group
                ]))

            for p in group:
                p.grad = None

            self.fp32_groups_flat[i].grad = grads_groups_flat[i]

        self.start_timers([COMPUTE_NORM])

        all_groups_norm = get_grad_norm(self.fp32_groups_flat, mpu=self.mpu)
        #all_groups_norm_old = all_groups_norm
        # Need to allreduce (avg) the norms across different ranks because moe params will not be synced during allreduce
        if self.using_pipeline:
            pg = self.deepspeed.mpu.get_data_parallel_group()
        else:
            pg = groups._get_data_parallel_group()
        scaled_norm = all_groups_norm * 1.0 / float(dist.get_world_size(group=pg))
        scaled_norm_tensor = torch.tensor(scaled_norm,
                                          device=self.fp32_groups_flat[i].device,
                                          dtype=torch.float)
        dist.all_reduce(scaled_norm_tensor, group=pg)
        all_groups_norm = scaled_norm_tensor.item()
        #print(f"old = {all_groups_norm_old} and new = {all_groups_norm} at rank: {torch.distributed.get_rank()}")

        self.stop_timers([COMPUTE_NORM])

        self._global_grad_norm = get_global_norm(norm_list=[all_groups_norm])

        self.start_timers([UNSCALE_AND_CLIP])
        self.unscale_and_clip_grads(grads_groups_flat, self._global_grad_norm)
        self.stop_timers([UNSCALE_AND_CLIP])

        self.start_timers([BASIC_STEP])
        self.optimizer.step()
        self.stop_timers([BASIC_STEP])

        #get rid of the fp32 gradients. Not needed anymore
        for group in self.fp32_groups_flat:
            group.grad = None

        self.start_timers([UPDATE_FP16])

        for i in range(len(self.fp16_groups)):
            updated_params = _unflatten_dense_tensors(self.fp32_groups_flat[i],
                                                      self.fp16_groups[i])
            for p, q in zip(self.fp16_groups[i], updated_params):
                p.data.copy_(q.data)

        self.stop_timers([UPDATE_FP16])

        self.log_timers(STEP_TIMERS)

        return self.overflow
Example #23
    def compute_eigenvalue(self, module, device=None, scale=1.0):
        block_eigenvalue = []
        param_keys = []
        layers = self.get_layers(module)

        for block in range(self.layer_num):
            model_block = layers[block]

            # We found this randn() has obvious accuracy impact in some cases, save/recover random state here.
            rng_state = torch.random.get_rng_state()
            if device is None:
                v = [
                    torch.randn(p.size()) for p in model_block.parameters()
                    if p.grad is not None and p.grad.grad_fn is not None
                ]
            else:
                v = [
                    torch.randn(p.size(),
                                device=device) for p in model_block.parameters()
                    if p.grad is not None and p.grad.grad_fn is not None
                ]
            torch.random.set_rng_state(rng_state)

            grads = [
                param.grad for param in model_block.parameters()
                if param.grad is not None and param.grad.grad_fn is not None
            ]
            params = [
                param for param in model_block.parameters()
                if param.grad is not None and param.grad.grad_fn is not None
            ]

            layer_keys = [id(p) for p in model_block.parameters()]
            param_keys.append(layer_keys)

            v = self.normalize(v)

            # Disable eigenvalue if the model doesn't support second order gradients computation,
            # e.g. when enabling DS transformer kernel.
            if len(grads) == 0 or len(params) == 0:
                log_dist(f'The model does NOT support eigenvalue computation.',
                         ranks=[0],
                         level=logging.WARNING)
                return []

            i = 0
            eigenvalue_current, eigenvalue_previous = 1., 0.

            while (i < self.max_iter) and abs(eigenvalue_current) > 0 and (abs(
                (eigenvalue_current - eigenvalue_previous) /
                    eigenvalue_current) >= self.tol):  # test convergence criteria
                eigenvalue_previous = eigenvalue_current

                Hv = torch.autograd.grad(grads,
                                         params,
                                         grad_outputs=v,
                                         only_inputs=True,
                                         retain_graph=True)
                #Hv = [hv.float() for hv in Hv]
                Hv = [self.nan_to_num(hv).float() for hv in Hv]

                eigenvalue_current = self.inner_product(Hv, v).item()

                v = self.normalize(Hv)
                v = [x / scale for x in v]
                i += 1

            eigenvalue_current *= scale
            block_eigenvalue.append(eigenvalue_current)

            if self.verbose:
                log_dist(
                    f'block: {block}, power iteration: {i}, eigenvalue: {eigenvalue_current}',
                    ranks=[0])

        block_eigenvalue = self.post_process(block_eigenvalue)

        if self.verbose:
            log_dist(f'post processed block_eigenvalue: {block_eigenvalue}', ranks=[0])

        # {param_id: (eigenvalue, layer_id)}
        ev_dict = {}
        for i, (layer_keys, value) in enumerate(zip(param_keys, block_eigenvalue)):
            ev_dict.update(dict.fromkeys(layer_keys, (value, i)))

        return ev_dict
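A minimal sketch of the power-iteration loop on an explicit 2x2 symmetric matrix, using the same relative-change stopping rule; the matrix product stands in for the Hessian-vector product that compute_eigenvalue obtains via torch.autograd.grad:

# Sketch only, not DeepSpeed code: power iteration with the same convergence test.
import torch

A = torch.tensor([[2.0, 1.0], [1.0, 3.0]])    # dominant eigenvalue ~3.618
v = torch.randn(2)
v = v / v.norm()

max_iter, tol = 100, 1e-2
eigenvalue_current, eigenvalue_previous = 1.0, 0.0
i = 0
while (i < max_iter and abs(eigenvalue_current) > 0
       and abs((eigenvalue_current - eigenvalue_previous) / eigenvalue_current) >= tol):
    eigenvalue_previous = eigenvalue_current
    Hv = A @ v                                # stand-in for the Hessian-vector product
    eigenvalue_current = torch.dot(Hv, v).item()
    v = Hv / Hv.norm()
    i += 1

print(i, eigenvalue_current)                  # converges to ~3.618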
Example #24
def initialize_model_and_expert_parallel(expert_parallel_size_, mpu):
    """
        Initialize Expert groups based on MPU groups.

        Example - E + M + D parallel
        world_size = 16
        model_degree = 2
        expert_degree = 4 # number of experts in same group
        mp_group = [0, 1], [2,3], [4,5] ...
        data_parallel_group =[0,2,4,6,8,10, 12,14],                 [1,3,5,7,9,11,13,15]
        expert_parallel_group = [0,2,4,6], [8,10,12,14]             [1,3,5,7], [9,11,13,15]
        expert_data_parallel_group = [0,8],[2,10],[4,12],[6,14],    [1,9],[3,11],[5,13],[7,15]
    """
    assert torch.distributed.is_initialized(
    ), "torch distributed is not initialized"
    assert mpu.model_parallel_is_initialized(
    ), "model parallel group is not initialized"
    model_parallel_size_ = mpu.get_model_parallel_world_size()

    world_size = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    dp_world_size = mpu.get_data_parallel_world_size()
    dp_rank = mpu.get_data_parallel_rank()

    log_dist(
        f"Initializing deepspeed groups with model parallel size {model_parallel_size_}, expert parallel size {expert_parallel_size_}, and data parallel size {world_size}",
        [0])

    global _DATA_PARALLEL_GROUP, _MODEL_PARALLEL_GROUP
    global _EXPERT_PARALLEL_GROUP, _EXPERT_DATA_PARALLEL_GROUP

    # Get world size and rank. Ensure some consistencies.
    _DATA_PARALLEL_GROUP = mpu.get_data_parallel_group()
    _MODEL_PARALLEL_GROUP = mpu.get_model_parallel_group()

    expert_parallel_size_ = min(expert_parallel_size_, dp_world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'
    # Build the expert parallel groups.
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'

    for j in range(model_parallel_size_):
        for i in range(expert_parallel_size_):
            ranks = range(i * model_parallel_size_ + j, world_size,
                          expert_parallel_size_ * model_parallel_size_)
            group = torch.distributed.new_group(ranks)

            # TODO: remove
            log_dist(
                f'creating expert data parallel process group with ranks: {list(ranks)}',
                [0])
            if rank in list(ranks):
                _EXPERT_DATA_PARALLEL_GROUP = group

        for i in range(dp_world_size // expert_parallel_size_):
            ranks = range(i * expert_parallel_size_ * model_parallel_size_ + j,
                          (i + 1) * expert_parallel_size_ *
                          model_parallel_size_, model_parallel_size_)
            group = torch.distributed.new_group(ranks)

            # TODO: remove
            log_dist(
                f'creating expert parallel process group with ranks: {list(ranks)}',
                [0])
            if rank in list(ranks):
                _EXPERT_PARALLEL_GROUP = group
Example #25
    def create(self, tag):
        log_dist(f"[Torch] Checkpoint {tag} is begin to save!", ranks=[0])
Example #26
def initialize_expert_parallel(expert_parallel_size_, num_ep_list_=None):
    """
        Initialize expert plus data parallel groups.

        Example - E + D parallel
        world_size = 16
        expert_parallel_size = 2 # number of experts in same group
        expert_data_parallel_group = [0,2,4,6,8,10,12,14], [1,3,5,7,9,11,13,15] - all reduce is only on MoE params
        expert_parallel_group = [0, 1], [2,3], [4,5], [6,7], [8,9] - no all reduce, but all to all
        data_parallel_group = [0,1,...,15] - all reduce is only on non-MoE
    """
    assert torch.distributed.is_initialized()

    global _MAX_EP_SIZE
    global _MAX_EP_SIZE_NAME
    _MAX_EP_SIZE = expert_parallel_size_
    _MAX_EP_SIZE_NAME = f"ep_size_{expert_parallel_size_}"

    if num_ep_list_ is None:
        num_ep_list_ = [expert_parallel_size_]

    log_dist(
        'initializing deepspeed expert parallel group with max size {} for number expert list {}'
        .format(expert_parallel_size_, num_ep_list_), [0])
    world_size = get_data_parallel_world_size()
    rank = get_data_parallel_rank()

    expert_parallel_size_ = min(expert_parallel_size_, world_size)
    ensure_divisibility(world_size, expert_parallel_size_)

    # Build the expert data parallel groups.
    global _EXPERT_DATA_PARALLEL_GROUP
    assert _EXPERT_DATA_PARALLEL_GROUP is None, \
        'expert data parallel group is already initialized'

    _EXPERT_DATA_PARALLEL_GROUP = {}

    for num_ep in num_ep_list_:
        # Build the data parallel groups for each num_ep
        # We will have two cases
        # 1. num_ep >= expert_parallel_size_: reuse the group keyed f"ep_size_{expert_parallel_size_}"
        # 2. num_ep < expert_parallel_size_: create a new, smaller group keyed f"ep_size_{num_ep}"
        if num_ep >= expert_parallel_size_:
            if f"ep_size_{expert_parallel_size_}" not in _EXPERT_DATA_PARALLEL_GROUP:
                for i in range(expert_parallel_size_):
                    # generate all groups
                    ranks = range(i, world_size, expert_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if i == (rank % expert_parallel_size_):
                        # get the correct group
                        _EXPERT_DATA_PARALLEL_GROUP[
                            f"ep_size_{expert_parallel_size_}"] = group
        else:
            for i in range(num_ep):
                ranks = range(i, world_size, num_ep)
                group = torch.distributed.new_group(ranks)
                if i == (rank % num_ep):
                    _EXPERT_DATA_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group

    # Build the expert parallel groups.
    global _EXPERT_PARALLEL_GROUP
    assert _EXPERT_PARALLEL_GROUP is None, \
        'expert parallel group is already initialized'

    _EXPERT_PARALLEL_GROUP = {}

    for num_ep in num_ep_list_:
        # Similar to the above, handle the same two cases
        if num_ep >= expert_parallel_size_:
            if f"ep_size_{expert_parallel_size_}" not in _EXPERT_PARALLEL_GROUP:
                for i in range(world_size // expert_parallel_size_):
                    ranks = range(i * expert_parallel_size_,
                                  (i + 1) * expert_parallel_size_)
                    group = torch.distributed.new_group(ranks)
                    if i == (rank // expert_parallel_size_):
                        _EXPERT_PARALLEL_GROUP[
                            f"ep_size_{expert_parallel_size_}"] = group
        else:
            for i in range(world_size // num_ep):
                ranks = range(i * num_ep, (i + 1) * num_ep)
                group = torch.distributed.new_group(ranks)
                if i == (rank // num_ep):
                    _EXPERT_PARALLEL_GROUP[f"ep_size_{num_ep}"] = group
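A small standalone sketch of the two-case bookkeeping above: for a hypothetical setting with expert_parallel_size_=4 and num_ep_list_=[2, 4], these are the group names that end up as keys in the dictionaries (actual group membership would come from torch.distributed.new_group):

# Sketch only: which group-name keys the two cases produce.
ep_size, num_ep_list = 4, [2, 4]              # hypothetical values

group_names = set()
for num_ep in num_ep_list:
    if num_ep >= ep_size:
        group_names.add(f"ep_size_{ep_size}")  # layers with >= ep_size experts share this group
    else:
        group_names.add(f"ep_size_{num_ep}")   # smaller layers get their own, smaller group

print(sorted(group_names))  # ['ep_size_2', 'ep_size_4']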