def test_get_group_with_rank(
    rank: int,
    groups: list[list[int]],
    result: list[int] | None,
    error: bool,
) -> None:
    """Test get_group_with_rank."""
    # Guard clause: the happy path asserts the lookup result directly.
    if not error:
        assert get_group_with_rank(rank, groups) == result
        return
    # Error path: a rank missing from every group must raise ValueError.
    with pytest.raises(ValueError):
        get_group_with_rank(rank, groups)
def test_gpt_neox_assignment_load_balancing(
    work: dict[str, dict[str, float]],
    ranks: list[int],
    expected: dict[str, dict[str, float]],
) -> None:
    """Test GPTNeoXAssignment load balancing."""
    # Single pipe stage, one model-parallel rank per process, no data
    # parallel replication.
    topology = PipeModelDataParallelTopology(1, len(ranks), 1)

    for current_rank in ranks:
        assignment = GPTNeoXAssignment(
            work,
            local_rank=current_rank,
            topology=topology,
            data_parallel_group=None,
            model_parallel_group=None,
        )

        for layer_name, layer_factors in expected.items():
            for factor_name, expected_worker in layer_factors.items():
                # The load balancer must place each factor on the
                # expected inverse worker.
                worker = assignment.inv_worker(layer_name, factor_name)
                assert worker == expected_worker

                peers = get_group_with_rank(
                    current_rank,
                    topology.get_axis_comm_lists('model'),
                )
                # A rank is a gradient worker iff it shares a model
                # parallel group with the inverse worker.
                should_be_grad_worker = (
                    current_rank in peers and worker in peers
                )
                assert (
                    assignment.is_grad_worker(layer_name)
                    == should_be_grad_worker
                )
def factor_worker(self, layer: str, factor: str) -> int:
    """Worker that gathers the factor from model parallel group peers.

    Also referred to as the primary worker in the layer code.
    """
    # Every factor of a layer is assigned to exactly one inverse worker.
    assigned = set(self._inv_assignments[layer].values())
    assert len(assigned) == 1
    inverse_rank = assigned.pop()

    # The factor worker is the unique rank that is both a data parallel
    # peer of the inverse worker and a model parallel peer of this rank.
    dp_ranks = get_group_with_rank(
        inverse_rank,
        self.data_parallel_groups,
    )
    candidates = set(dp_ranks).intersection(self.model_parallel_peers)
    assert len(candidates) == 1
    return candidates.pop()
def src_grad_worker(self, layer: str) -> int:
    """Return rank that will share preconditioned gradient.

    If process is a gradient worker, this method should return the
    process rank. Otherwise, if the process is a gradient receiver,
    this method returns the rank that is responsible for sending the
    preconditioned gradient to this process.

    With model parallelism, the src rank is the rank that received the
    partial preconditioned gradient from the inv_worker.
    """
    assigned_ranks = list(self._inv_assignments[layer].values())
    # All factors of the layer must share a single inverse worker.
    assert all(r == assigned_ranks[0] for r in assigned_ranks)

    # This is just the src rank that computes the preconditioned gradient
    # and then scatters it to the other ranks in its model parallel group.
    src_rank = assigned_ranks[0]
    src_model_peers = get_group_with_rank(
        src_rank,
        self.model_parallel_groups,
    )
    # Exactly one of our data parallel peers sits in the src rank's
    # model parallel group — that rank sends us the gradient.
    candidates = set(src_model_peers) & set(self.data_parallel_peers)
    assert len(candidates) == 1
    return candidates.pop()
def __init__(
    self,
    work: dict[str, dict[str, float]],
    *,
    local_rank: int,
    topology: PipeModelDataParallelTopology,
    data_parallel_group: dist.ProcessGroup | None,
    model_parallel_group: dist.ProcessGroup | None,
) -> None:
    """Init GPTNeoXAssignment.

    Validates the topology, caches the data/model/pipe parallel group
    structure for this rank, then greedily load-balances each layer's
    factor work across the ranks in this rank's pipe stage.

    Args:
        work (dict[str, dict[str, float]]): dictionary mapping unique
            layer names to sub-dictionaries where the keys are the str
            names for each factor associated with the layer and the
            values are the cost of each factor computation for load
            balancing. Note: that this should only be the work
            performed by the data parallel group.
        local_rank (int): local rank of this process.
        topology (PipeModelDataParallelTopology): topology created by
            DeepSpeed.
        data_parallel_group (ProcessGroup): DeepSpeed data parallel
            process group.
        model_parallel_group (ProcessGroup): DeepSpeed model parallel
            process group.

    Raises:
        TypeError: if topology is not a PipeModelDataParallelTopology.
        Exception: re-raises the deferred DeepSpeed import error when
            DeepSpeed is unavailable (presumably an ImportError —
            the error object is created elsewhere in this module).
    """
    # DeepSpeed import is deferred module-wide; fail loudly here if it
    # was unavailable rather than erroring later mid-training.
    if deepspeed_import_error is not None:  # pragma: no cover
        raise deepspeed_import_error
    if not isinstance(topology, PipeModelDataParallelTopology):
        raise TypeError(
            'Expected topology to be of type '
            f'{PipeModelDataParallelTopology.__name__} but got '
            f'{type(topology)} instead.',
        )
    self.local_rank = local_rank
    self.data_parallel_group = data_parallel_group
    self.model_parallel_group = model_parallel_group

    # global information
    self.data_parallel_groups = topology.get_axis_comm_lists('data')
    self.model_parallel_groups = topology.get_axis_comm_lists('model')
    self.pipe_parallel_groups = topology.get_axis_comm_lists('pipe')
    # Ranks sharing this rank's data / model parallel group.
    self.data_parallel_peers = get_group_with_rank(
        self.local_rank,
        self.data_parallel_groups,
    )
    self.model_parallel_peers = get_group_with_rank(
        self.local_rank,
        self.model_parallel_groups,
    )
    self.pipe_parallel_rank = topology.get_coord(self.local_rank).pipe
    # List of ranks with same pipe rank as us. These are the ranks that
    # have the same layers as us so they are all we care about for the
    # purpose of assigning work
    self.pipe_parallel_peers = [
        r
        for r in range(topology.world_size())
        if topology.get_coord(r).pipe == self.pipe_parallel_rank
    ]
    # Reuse existing groups if possible
    if set(self.pipe_parallel_peers) == set(self.model_parallel_peers):
        self.pipe_parallel_peer_group = self.model_parallel_group
    elif set(self.pipe_parallel_peers) == set(self.data_parallel_peers):
        self.pipe_parallel_peer_group = self.data_parallel_group
    else:
        # NOTE(review): dist.new_group is collective — presumably every
        # rank constructs this assignment so all ranks reach this call;
        # verify against callers.
        self.pipe_parallel_peer_group = dist.new_group(
            self.pipe_parallel_peers,
        )

    # Greedy longest-processing-time load balancing: assign each layer
    # (all of its factors together) to the currently least-loaded peer.
    worker_loads = [0.0 for _ in self.pipe_parallel_peers]
    # -1 marks "unassigned"; every factor is overwritten in the loop
    # below since sorted_work covers every layer in `work`.
    self._inv_assignments = {
        layer: {factor: -1 for factor in factors}
        for layer, factors in work.items()
    }
    # Total cost per layer; sorted descending by cost with layer name
    # as a deterministic tie-break so all ranks compute the same
    # assignment independently.
    summed_work = [
        (layer, sum(factors.values())) for layer, factors in work.items()
    ]
    sorted_work = sorted(
        summed_work,
        key=lambda item: (item[1], item[0]),
        reverse=True,
    )
    for layer, cost in sorted_work:
        min_worker_index = worker_loads.index(min(worker_loads))
        min_worker = self.pipe_parallel_peers[min_worker_index]
        # All factors of a layer go to the same worker (MEM-OPT style).
        for factor in self._inv_assignments[layer]:
            self._inv_assignments[layer][factor] = min_worker
        worker_loads[min_worker_index] += cost
def test_gpt_neox_assignment(
    work: dict[str, dict[str, float]],
    ranks: list[int],
) -> None:
    """Test GPTNeoXAssignment."""
    # A non-topology object must be rejected with TypeError.
    with pytest.raises(TypeError):
        GPTNeoXAssignment(
            work,
            local_rank=99999,
            topology=object(),
            data_parallel_group=None,
            model_parallel_group=None,
        )

    # Build one assignment per rank over a shared topology (single pipe
    # stage, model-parallel axis of len(ranks), no data replication).
    assignments = []
    topology = PipeModelDataParallelTopology(1, len(ranks), 1)
    for rank in ranks:
        assignment = GPTNeoXAssignment(
            work,
            local_rank=rank,
            topology=topology,
            data_parallel_group=None,
            model_parallel_group=None,
        )
        assignments.append((rank, assignment))

    for rank, assignment in assignments:
        # GPTNeoXAssignment uses MEM-OPT so we should always broadcast
        # gradients and never inverses.
        assert assignment.broadcast_gradients()
        assert not assignment.broadcast_inverses()

        # The assignment must cover exactly the layers/factors in work.
        assert set(assignment.get_layers()) == set(work.keys())
        for layer, factors in work.items():
            assert set(assignment.get_factors(layer)) == set(factors.keys())

        for layer, factors in work.items():
            inv_workers = [
                assignment.inv_worker(layer, factor) for factor in factors
            ]
            # Check every factor is assigned to same inv worker
            assert inv_workers.count(inv_workers[0]) == len(inv_workers)
            assert inv_workers[0] in ranks

            model_parallel_peers = get_group_with_rank(
                rank,
                topology.get_axis_comm_lists('model'),
            )
            # Gradient workers are exactly the ranks sharing a model
            # parallel group with the inverse worker.
            assert assignment.is_grad_worker(layer) == (
                rank in model_parallel_peers
                and inv_workers[0] in model_parallel_peers
            )

        # grad_worker_group is unsupported for this assignment strategy.
        for layer in work:
            with pytest.raises(NotImplementedError):
                assignment.grad_worker_group(layer)

        for layer in work:
            # NOTE(review): with the (1, len(ranks), 1) topology each
            # rank appears to be its own src grad worker, so the first
            # entry should occur exactly once — confirm this intent.
            src_grad_workers = [
                assignment.src_grad_worker(layer)
                for _, assignment in assignments
            ]
            assert src_grad_workers.count(src_grad_workers[0]) == 1

            # All factors of a layer must resolve to a single factor
            # worker.
            factor_workers = set()
            for factor in work[layer]:
                factor_workers.add(assignment.factor_worker(layer, factor))
            assert len(factor_workers) == 1

            # Every rank must agree on the factor group and the grad
            # receiver group for a layer.
            groups = [
                assignment.factor_group(layer, 'A')
                for _, assignment in assignments
            ]
            groups += [
                assignment.grad_receiver_group(layer)
                for _, assignment in assignments
            ]
            assert groups.count(groups[0]) == len(groups)