Example 1
def test_reuse_comm_groups() -> None:
    """Test that we reuse exisiting comm groups when possible."""
    with mock.patch('torch.distributed.new_group', return_value=-1):
        topology = PipeModelDataParallelTopology(2, 1, 2)
        assignment = GPTNeoXAssignment(
            {},
            local_rank=0,
            topology=topology,
            data_parallel_group=-2,  # type: ignore
            model_parallel_group=-3,  # type: ignore
        )
        assert (assignment.pipe_parallel_peer_group ==
                assignment.data_parallel_group)

        topology = PipeModelDataParallelTopology(2, 2, 2)
        assignment = GPTNeoXAssignment(
            {},
            local_rank=0,
            topology=topology,
            data_parallel_group=-2,  # type: ignore
            model_parallel_group=-3,  # type: ignore
        )
        assert (assignment.pipe_parallel_peer_group !=
                assignment.data_parallel_group !=
                assignment.model_parallel_group)
Example 2
def test_gpt_neox_assignment_load_balancing(
    work: dict[str, dict[str, float]],
    ranks: list[int],
    expected: dict[str, dict[str, float]],
) -> None:
    """Test GPTNeoXAssignment load balancing."""
    topology = PipeModelDataParallelTopology(1, len(ranks), 1)
    for rank in ranks:
        assignment = GPTNeoXAssignment(
            work,
            local_rank=rank,
            topology=topology,
            data_parallel_group=None,
            model_parallel_group=None,
        )

        for layer, factors in expected.items():
            for factor in factors:
                inv_worker = assignment.inv_worker(layer, factor)
                assert inv_worker == factors[factor]

            # Every factor of a layer is assigned to the same worker, so the
            # last inv_worker from the loop above represents the whole layer.
            model_parallel_peers = get_group_with_rank(
                rank,
                topology.get_axis_comm_lists('model'),
            )
            assert assignment.is_grad_worker(layer) == (
                rank in model_parallel_peers
                and inv_worker in model_parallel_peers)
Example 3
    def get_topology(self, mp, pp, world_size):
        """Build the 3D parallel topology for the given world size."""
        assert world_size % (pp * mp) == 0
        dp = world_size // (pp * mp)

        from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
        topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

        return topo
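
A minimal standalone sketch of the same computation, assuming DeepSpeed is installed; the world size of 8 with mp=2 and pp=2 is chosen only for illustration:

from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

# Assumed example values: 8 ranks, 2-way model parallel, 2-way pipeline parallel.
mp, pp, world_size = 2, 2, 8
assert world_size % (pp * mp) == 0
dp = world_size // (pp * mp)  # -> 2

topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)
print(topo.world_size())                 # 8
print(topo.get_axis_comm_lists('data'))  # rank lists along the data axis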
Example 4
def get_pipeline_module(*args: Any, **kwargs: Any) -> PipelineModule:
    """Create pipeline module with correct topology type."""
    with mock.patch.object(PipelineModule, 'to', mock.MagicMock()):
        m = PipelineModule(*args, **kwargs)
    m._topo = PipeModelDataParallelTopology(
        num_pp=m.num_stages,
        num_dp=dist.get_world_size(m.world_group) // m.num_stages,
        num_mp=1,
    )
    return m
Example 5
def _initialize_distributed():
    """Initialize torch.distributed and mpu."""
    args = get_args()

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():

        if args.rank == 0:
            print('torch distributed is already initialized, '
                  'skipping initialization ...', flush=True)
        args.rank = torch.distributed.get_rank()
        args.world_size = torch.distributed.get_world_size()

    else:

        if args.rank == 0:
            print('> initializing torch distributed ...', flush=True)
        # Manually set the device ids.
        if device_count > 0:
            device = args.rank % device_count
            if args.local_rank is not None:
                assert args.local_rank == device, \
                    'expected local-rank to be the same as rank % device-count.'
            else:
                args.local_rank = device
            torch.cuda.set_device(device)

        distributed.init_distributed(
            dist_backend=args.distributed_backend,
            auto_mpi_discovery=True,
            distributed_port=os.getenv('MASTER_PORT', '6000'),
            verbose=True,
        )

    # Setup 3D topology.
    if args.pipe_parallel_size > 0:
        pp = args.pipe_parallel_size
        mp = args.model_parallel_size
        assert args.world_size % (pp * mp) == 0
        dp = args.world_size // (pp * mp)

        from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology
        # Pipeline parallelism is the outermost axis, then data, then model.
        # PipeModelDataParallelTopology is a thin wrapper over ProcessTopology
        # that predefines this axis order.
        topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

        # Offset base seeds for the interior pipeline stages.
        # TODO: adjust last stage too once IO is improved.
        stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
        if 0 < stage_id < topo.get_dim('pipe') - 1:
            offset = args.seed + 1138
            args.seed = offset + (stage_id * mp)
    else:
        topo = None

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print('model parallel is already initialized')
        else:
            mpu.initialize_model_parallel(args.model_parallel_size, topology=topo)

    # Optional DeepSpeed activation checkpointing features.
    if args.deepspeed and args.deepspeed_activation_checkpointing:
        setup_deepspeed_random_and_activation_checkpointing(args)
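
To make the axis-ordering comment above concrete, here is a small standalone sketch, assuming DeepSpeed is available; the 2 x 2 x 2 dimensions are made up purely for illustration:

from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

# Sketch only: shows how global ranks map onto (pipe, data, model) coordinates.
topo = PipeModelDataParallelTopology(num_pp=2, num_mp=2, num_dp=2)
for rank in range(topo.world_size()):
    coord = topo.get_coord(rank)
    print(rank, coord.pipe, coord.data, coord.model)

# Because pipe is the outermost axis, ranks 0-3 form pipeline stage 0 and
# ranks 4-7 form pipeline stage 1; the model axis varies fastest within a stage.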
Example 6
    def __init__(
        self,
        work: dict[str, dict[str, float]],
        *,
        local_rank: int,
        topology: PipeModelDataParallelTopology,
        data_parallel_group: dist.ProcessGroup | None,
        model_parallel_group: dist.ProcessGroup | None,
    ) -> None:
        """Init GPTNeoxAssignment.

        Args:
            work (dict[str, dict[str, float]]): dictionary mapping unique
                layer names to sub-dictionaries where the keys are the str
                names of each factor associated with the layer and the values
                are the cost of each factor computation, used for load
                balancing. Note that this should only include the work
                performed by the data parallel group.
            local_rank (int): local rank of this process.
            topology (PipeModelDataParallelTopology): topology created
                by DeepSpeed.
            data_parallel_group (ProcessGroup): DeepSpeed data parallel
                process group.
            model_parallel_group (ProcessGroup): DeepSpeed model parallel
                process group.
        """
        if deepspeed_import_error is not None:  # pragma: no cover
            raise deepspeed_import_error
        if not isinstance(topology, PipeModelDataParallelTopology):
            raise TypeError(
                'Expected topology to be of type '
                f'{PipeModelDataParallelTopology.__name__} but got '
                f'{type(topology)} instead.',
            )

        self.local_rank = local_rank
        self.data_parallel_group = data_parallel_group
        self.model_parallel_group = model_parallel_group

        # global information
        self.data_parallel_groups = topology.get_axis_comm_lists('data')
        self.model_parallel_groups = topology.get_axis_comm_lists('model')
        self.pipe_parallel_groups = topology.get_axis_comm_lists('pipe')

        self.data_parallel_peers = get_group_with_rank(
            self.local_rank,
            self.data_parallel_groups,
        )
        self.model_parallel_peers = get_group_with_rank(
            self.local_rank,
            self.model_parallel_groups,
        )
        self.pipe_parallel_rank = topology.get_coord(self.local_rank).pipe
        # Ranks with the same pipe-parallel rank as this process. They hold
        # the same layers, so they are the only ranks that matter when
        # assigning work.
        self.pipe_parallel_peers = [
            r for r in range(topology.world_size())
            if topology.get_coord(r).pipe == self.pipe_parallel_rank
        ]

        # Reuse existing groups if possible
        if set(self.pipe_parallel_peers) == set(self.model_parallel_peers):
            self.pipe_parallel_peer_group = self.model_parallel_group
        elif set(self.pipe_parallel_peers) == set(self.data_parallel_peers):
            self.pipe_parallel_peer_group = self.data_parallel_group
        else:
            self.pipe_parallel_peer_group = dist.new_group(
                self.pipe_parallel_peers,
            )

        worker_loads = [0.0 for _ in self.pipe_parallel_peers]
        self._inv_assignments = {
            layer: {factor: -1 for factor in factors}
            for layer, factors in work.items()
        }
        summed_work = [
            (layer, sum(factors.values())) for layer, factors in work.items()
        ]
        sorted_work = sorted(
            summed_work,
            key=lambda item: (item[1], item[0]),
            reverse=True,
        )

        # Greedily assign each layer, heaviest total cost first, to the
        # currently least-loaded pipe-parallel peer.
        for layer, cost in sorted_work:
            min_worker_index = worker_loads.index(min(worker_loads))
            min_worker = self.pipe_parallel_peers[min_worker_index]
            for factor in self._inv_assignments[layer]:
                self._inv_assignments[layer][factor] = min_worker
            worker_loads[min_worker_index] += cost
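
A toy, standalone illustration of this greedy (longest-processing-time-style) balancing, using invented layer costs and two hypothetical peers instead of a real topology:

# Toy sketch only: mirrors the loop above with made-up numbers.
work = {
    'layer1': {'A': 3.0, 'G': 3.0},  # total cost 6.0
    'layer2': {'A': 2.0, 'G': 2.0},  # total cost 4.0
    'layer3': {'A': 1.0, 'G': 1.0},  # total cost 2.0
}
peers = [0, 1]  # hypothetical pipe-parallel peers
loads = [0.0 for _ in peers]
layer_to_worker = {}

for layer, cost in sorted(
    ((name, sum(factors.values())) for name, factors in work.items()),
    key=lambda item: (item[1], item[0]),
    reverse=True,
):
    idx = loads.index(min(loads))  # least-loaded peer so far
    layer_to_worker[layer] = peers[idx]
    loads[idx] += cost

print(layer_to_worker)  # {'layer1': 0, 'layer2': 1, 'layer3': 1}
print(loads)            # [6.0, 6.0]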
Example 7
def _initialize_distributed(neox_args):
    """Initialize torch.distributed and mpu."""

    device_count = torch.cuda.device_count()
    if torch.distributed.is_initialized():

        if neox_args.rank == 0:
            print(
                "torch distributed is already initialized, "
                "skipping initialization ...",
                flush=True,
            )
        neox_args.rank = torch.distributed.get_rank()
        neox_args.world_size = torch.distributed.get_world_size()

    else:

        if neox_args.rank == 0:
            print("> initializing torch distributed ...", flush=True)
        # Manually set the device ids.
        if device_count > 0:
            device = neox_args.rank % device_count
            if neox_args.local_rank is not None:
                assert (
                    neox_args.local_rank == device
                ), "expected local-rank to be the same as rank % device-count."
            else:
                neox_args.local_rank = device
            torch.cuda.set_device(device)

        distributed.init_distributed(
            dist_backend=neox_args.distributed_backend,
            auto_mpi_discovery=True,
            distributed_port=os.getenv("MASTER_PORT", "6000"),
            verbose=True,
        )

    # Setup 3D topology.
    pp = neox_args.pipe_parallel_size if neox_args.pipe_parallel_size >= 1 else 1
    mp = neox_args.model_parallel_size if neox_args.model_parallel_size >= 1 else 1
    assert (
        neox_args.world_size % (pp * mp) == 0
    ), f"world_size={neox_args.world_size}, pp={pp}, mp={mp}"
    dp = neox_args.world_size // (pp * mp)

    from deepspeed.runtime.pipe.topology import PipeModelDataParallelTopology

    # Pipeline parallelism is the outermost axis, then data, then model.
    # PipeModelDataParallelTopology is a thin wrapper over ProcessTopology
    # that predefines this axis order.
    topo = PipeModelDataParallelTopology(num_pp=pp, num_mp=mp, num_dp=dp)

    # Offset base seeds for the interior pipeline stages.
    # TODO: adjust last stage too once IO is improved.
    stage_id = topo.get_coord(rank=torch.distributed.get_rank()).pipe
    if 0 < stage_id < topo.get_dim("pipe") - 1:
        offset = neox_args.seed + 1138
        neox_args.seed = offset + (stage_id * mp)

    # Set the model-parallel / data-parallel communicators.
    if device_count > 0:
        if mpu.model_parallel_is_initialized():
            print(
                "_initialize_distributed() model parallel is already initialized",
                flush=True,
            )
        else:
            mpu.initialize_model_parallel(
                neox_args.model_parallel_size,
                topology=topo,
                fp32_allreduce=neox_args.fp32_allreduce,
            )

    # Init DeepSpeed Activation Checkpointing Features
    setup_deepspeed_random_and_activation_checkpointing(neox_args=neox_args)
Example 8
def test_gpt_neox_assignment(
    work: dict[str, dict[str, float]],
    ranks: list[int],
) -> None:
    """Test GPTNeoXAssignment."""
    with pytest.raises(TypeError):
        GPTNeoXAssignment(
            work,
            local_rank=99999,
            topology=object(),
            data_parallel_group=None,
            model_parallel_group=None,
        )

    assignments = []
    topology = PipeModelDataParallelTopology(1, len(ranks), 1)
    for rank in ranks:
        assignment = GPTNeoXAssignment(
            work,
            local_rank=rank,
            topology=topology,
            data_parallel_group=None,
            model_parallel_group=None,
        )
        assignments.append((rank, assignment))

    for rank, assignment in assignments:
        # GPTNeoXAssignment uses MEM-OPT so we should always broadcast
        # gradients and never inverses.
        assert assignment.broadcast_gradients()
        assert not assignment.broadcast_inverses()

        assert set(assignment.get_layers()) == set(work.keys())
        for layer, factors in work.items():
            assert set(assignment.get_factors(layer)) == set(factors.keys())

        for layer, factors in work.items():
            inv_workers = [
                assignment.inv_worker(layer, factor) for factor in factors
            ]
            # Check that every factor is assigned to the same inverse worker.
            assert inv_workers.count(inv_workers[0]) == len(inv_workers)
            assert inv_workers[0] in ranks

            model_parallel_peers = get_group_with_rank(
                rank,
                topology.get_axis_comm_lists('model'),
            )
            assert assignment.is_grad_worker(layer) == (
                rank in model_parallel_peers
                and inv_workers[0] in model_parallel_peers)

        for layer in work:
            with pytest.raises(NotImplementedError):
                assignment.grad_worker_group(layer)

    for layer in work:
        src_grad_workers = [
            assignment.src_grad_worker(layer) for _, assignment in assignments
        ]

        assert src_grad_workers.count(src_grad_workers[0]) == 1

        factor_workers = set()
        for factor in work[layer]:
            factor_workers.add(assignment.factor_worker(layer, factor))
        assert len(factor_workers) == 1

        groups = [
            assignment.factor_group(layer, 'A')
            for _, assignment in assignments
        ]
        groups += [
            assignment.grad_receiver_group(layer)
            for _, assignment in assignments
        ]
        assert groups.count(groups[0]) == len(groups)