Code Example #1
def test_get_group_with_rank(
    rank: int,
    groups: list[list[int]],
    result: list[int] | None,
    error: bool,
) -> None:
    """Test get_group_with_rank."""
    if error:
        with pytest.raises(ValueError):
            get_group_with_rank(rank, groups)
    else:
        assert get_group_with_rank(rank, groups) == result
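
The behavior this test exercises can be summarized with a minimal sketch of get_group_with_rank (a hedged reconstruction from the assertions above; the actual implementation in kfac_pytorch may differ in details):

def get_group_with_rank(rank: int, groups: list[list[int]]) -> list[int]:
    """Return the first group in groups that contains rank.

    Raises:
        ValueError: if rank does not appear in any group.
    """
    for group in groups:
        if rank in group:
            return group
    raise ValueError(f'rank {rank} is not in any of the provided groups')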
Code Example #2
def test_gpt_neox_assignment_load_balancing(
    work: dict[str, dict[str, float]],
    ranks: list[int],
    expected: dict[str, dict[str, float]],
) -> None:
    """Test GPTNeoXAssignment load balancing."""
    topology = PipeModelDataParallelTopology(1, len(ranks), 1)
    for rank in ranks:
        assignment = GPTNeoXAssignment(
            work,
            local_rank=rank,
            topology=topology,
            data_parallel_group=None,
            model_parallel_group=None,
        )

        for layer, factors in expected.items():
            for factor in factors:
                inv_worker = assignment.inv_worker(layer, factor)
                assert inv_worker == factors[factor]

            model_parallel_peers = get_group_with_rank(
                rank,
                topology.get_axis_comm_lists('model'),
            )
            assert assignment.is_grad_worker(layer) == (
                rank in model_parallel_peers
                and inv_worker in model_parallel_peers)
Code Example #3
File: assignment.py  Project: gpauloski/kfac_pytorch
    def factor_worker(self, layer: str, factor: str) -> int:
        """Worker that gathers the factor from model parallel group peers.

        Also referred to as the primary worker in the layer code.
        """
        inv_ranks = set(self._inv_assignments[layer].values())
        assert len(inv_ranks) == 1
        inv_rank = inv_ranks.pop()

        data_parallel_ranks = get_group_with_rank(
            inv_rank,
            self.data_parallel_groups,
        )
        factor_workers = set(data_parallel_ranks) & set(
            self.model_parallel_peers, )
        assert len(factor_workers) == 1
        return factor_workers.pop()
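
To make the intersection concrete, here is a small hedged walk-through with an invented four-rank layout (two data-parallel and two model-parallel groups), assuming get_group_with_rank behaves as in the sketch under Code Example #1:

# Hypothetical layout, not taken from a real DeepSpeed topology.
data_parallel_groups = [[0, 2], [1, 3]]
model_parallel_peers = [0, 1]  # this process is rank 0

# Suppose the inverses for this layer were assigned to rank 3.
inv_rank = 3

data_parallel_ranks = get_group_with_rank(inv_rank, data_parallel_groups)  # [1, 3]
factor_worker = (set(data_parallel_ranks) & set(model_parallel_peers)).pop()  # rank 1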
Code Example #4
File: assignment.py  Project: gpauloski/kfac_pytorch
    def src_grad_worker(self, layer: str) -> int:
        """Return rank that will share preconditioned gradient.

        If process is a gradient worker, this method should return the
        process rank. Otherwise, if the process is a gradient receiver, this
        method returns the rank that is responsible for sending the
        preconditioned gradient to this process.

        With model parallelism, the src rank is the rank that received the
        partial preconditioned gradient from the inv_worker.
        """
        ranks = list(self._inv_assignments[layer].values())
        assert ranks.count(ranks[0]) == len(ranks)
        # This is just the src rank that computes the preconditioned gradient
        # and then scatters it to the other ranks in its model parallel group
        src_rank = ranks[0]

        model_parallel_ranks = get_group_with_rank(
            src_rank,
            self.model_parallel_groups,
        )
        src = set(self.data_parallel_peers) & set(model_parallel_ranks)
        assert len(src) == 1
        return src.pop()
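
The same intersection appears here with the axes swapped: the source is the member of this process's data-parallel group that sits in the inv worker's model-parallel group. A hedged toy example with invented groups:

data_parallel_peers = [0, 2]  # this process is rank 0
model_parallel_groups = [[0, 1], [2, 3]]

# Suppose the preconditioned gradient for this layer is computed on rank 3.
src_rank = 3

model_parallel_ranks = get_group_with_rank(src_rank, model_parallel_groups)  # [2, 3]
src = (set(data_parallel_peers) & set(model_parallel_ranks)).pop()  # rank 2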
Code Example #5
File: assignment.py  Project: gpauloski/kfac_pytorch
    def __init__(
        self,
        work: dict[str, dict[str, float]],
        *,
        local_rank: int,
        topology: PipeModelDataParallelTopology,
        data_parallel_group: dist.ProcessGroup | None,
        model_parallel_group: dist.ProcessGroup | None,
    ) -> None:
        """Init GPTNeoxAssignment.

        Args:
            work (dict[str, dict[str, float]]): dictionary mapping unique
                layer names to sub-dictionaries where the keys are the str
                names for each factor associated with the layer and the values
                are the cost of each factor computation for load balancing.
                Note that this should only be the work performed by the data
                parallel group.
            local_rank (int): local rank of this process.
            topology (PipeModelDataParallelTopology): topology created
                by DeepSpeed.
            data_parallel_group (ProcessGroup): DeepSpeed data parallel
                process group.
            model_parallel_group (ProcessGroup): DeepSpeed model parallel
                process group.
        """
        if deepspeed_import_error is not None:  # pragma: no cover
            raise deepspeed_import_error
        if not isinstance(topology, PipeModelDataParallelTopology):
            raise TypeError(
                'Expected topology to be of type '
                f'{PipeModelDataParallelTopology.__name__} but got '
                f'{type(topology)} instead.', )

        self.local_rank = local_rank
        self.data_parallel_group = data_parallel_group
        self.model_parallel_group = model_parallel_group

        # global information
        self.data_parallel_groups = topology.get_axis_comm_lists('data')
        self.model_parallel_groups = topology.get_axis_comm_lists('model')
        self.pipe_parallel_groups = topology.get_axis_comm_lists('pipe')

        self.data_parallel_peers = get_group_with_rank(
            self.local_rank,
            self.data_parallel_groups,
        )
        self.model_parallel_peers = get_group_with_rank(
            self.local_rank,
            self.model_parallel_groups,
        )
        self.pipe_parallel_rank = topology.get_coord(self.local_rank).pipe
        # List of ranks with the same pipe rank as us. These are the ranks
        # that have the same layers as us, so they are all we care about for
        # the purpose of assigning work.
        self.pipe_parallel_peers = [
            r for r in range(topology.world_size())
            if topology.get_coord(r).pipe == self.pipe_parallel_rank
        ]

        # Reuse existing groups if possible
        if set(self.pipe_parallel_peers) == set(self.model_parallel_peers):
            self.pipe_parallel_peer_group = self.model_parallel_group
        elif set(self.pipe_parallel_peers) == set(self.data_parallel_peers):
            self.pipe_parallel_peer_group = self.data_parallel_group
        else:
            self.pipe_parallel_peer_group = dist.new_group(
                self.pipe_parallel_peers, )

        worker_loads = [0.0 for _ in self.pipe_parallel_peers]
        self._inv_assignments = {
            layer: {factor: -1
                    for factor in factors}
            for layer, factors in work.items()
        }
        summed_work = [(layer, sum(factors.values()))
                       for layer, factors in work.items()]
        sorted_work = sorted(
            summed_work,
            key=lambda item: (item[1], item[0]),
            reverse=True,
        )

        for layer, cost in sorted_work:
            min_worker_index = worker_loads.index(min(worker_loads))
            min_worker = self.pipe_parallel_peers[min_worker_index]
            for factor in self._inv_assignments[layer]:
                self._inv_assignments[layer][factor] = min_worker
            worker_loads[min_worker_index] += cost
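
A short hedged trace of the greedy load balancing above, with a made-up work dictionary and two pipe-parallel peers (costs are arbitrary and chosen only to illustrate the descending sort and min-load selection):

work = {
    'layer1': {'A': 1.0, 'G': 1.0},  # total cost 2.0
    'layer2': {'A': 2.0, 'G': 2.0},  # total cost 4.0
    'layer3': {'A': 3.0, 'G': 3.0},  # total cost 6.0
}
pipe_parallel_peers = [0, 1]

# Layers sorted by descending cost: layer3 (6.0), layer2 (4.0), layer1 (2.0).
#   layer3 -> rank 0, worker loads become [6.0, 0.0]
#   layer2 -> rank 1, worker loads become [6.0, 4.0]
#   layer1 -> rank 1, worker loads become [6.0, 6.0]
# Every factor of a layer lands on the same rank, which is what the
# "same inv worker for every factor" assertions in the tests rely on.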
Code Example #6
def test_gpt_neox_assignment(
    work: dict[str, dict[str, float]],
    ranks: list[int],
) -> None:
    """Test GPTNeoXAssignment."""
    with pytest.raises(TypeError):
        GPTNeoXAssignment(
            work,
            local_rank=99999,
            topology=object(),
            data_parallel_group=None,
            model_parallel_group=None,
        )

    assignments = []
    topology = PipeModelDataParallelTopology(1, len(ranks), 1)
    for rank in ranks:
        assignment = GPTNeoXAssignment(
            work,
            local_rank=rank,
            topology=topology,
            data_parallel_group=None,
            model_parallel_group=None,
        )
        assignments.append((rank, assignment))

    for rank, assignment in assignments:
        # GPTNeoXAssignment uses MEM-OPT so we should always broadcast
        # gradients and never inverses.
        assert assignment.broadcast_gradients()
        assert not assignment.broadcast_inverses()

        assert set(assignment.get_layers()) == set(work.keys())
        for layer, factors in work.items():
            assert set(assignment.get_factors(layer)) == set(factors.keys())

        for layer, factors in work.items():
            inv_workers = [
                assignment.inv_worker(layer, factor) for factor in factors
            ]
            # Check every factor is assigned to same inv worker
            assert inv_workers.count(inv_workers[0]) == len(inv_workers)
            assert inv_workers[0] in ranks

            model_parallel_peers = get_group_with_rank(
                rank,
                topology.get_axis_comm_lists('model'),
            )
            assert assignment.is_grad_worker(layer) == (
                rank in model_parallel_peers
                and inv_workers[0] in model_parallel_peers)

        for layer in work:
            with pytest.raises(NotImplementedError):
                assignment.grad_worker_group(layer)

    for layer in work:
        src_grad_workers = [
            assignment.src_grad_worker(layer) for _, assignment in assignments
        ]

        assert src_grad_workers.count(src_grad_workers[0]) == 1

        factor_workers = set()
        for factor in work[layer]:
            factor_workers.add(assignment.factor_worker(layer, factor))
        assert len(factor_workers) == 1

        groups = [
            assignment.factor_group(layer, 'A')
            for _, assignment in assignments
        ]
        groups += [
            assignment.grad_receiver_group(layer)
            for _, assignment in assignments
        ]
        assert groups.count(groups[0]) == len(groups)