def _sync_buffers(self):
    """Broadcast every buffer (e.g. batch norm statistics) from the global root rank."""
    for buffer in self._layer.buffers(include_sublayers=True):
        dist.broadcast(
            buffer, self._global_root_rank, self._group, use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)

def _sync_params_and_buffers(self):
    """
    Sync all model states for all ranks.
    """
    for p in self._layer.parameters():
        dist.broadcast(
            p,
            src=self._global_root_rank,
            group=self._group,
            use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=p, group=self._group, use_calc_stream=True)

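# For context, a minimal standalone sketch of the same idea using only the public
# paddle.distributed API. The helper name `sync_from_root` and the nn.Linear model
# are illustrative assumptions, not part of the code above.
import paddle
import paddle.distributed as dist
from paddle import nn


def sync_from_root(layer, root_rank=0, group=None):
    # Broadcast every parameter and buffer from root_rank so that all ranks
    # start from an identical copy of the model state.
    for tensor in list(layer.parameters()) + list(layer.buffers()):
        dist.broadcast(tensor, src=root_rank, group=group)


if __name__ == "__main__":
    dist.init_parallel_env()
    model = nn.Linear(8, 8)
    sync_from_root(model)
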
def __sync_buffers(self):
    """
    Sync all the param buffers from all ranks (e.g. batch norm statistics).
    """
    for buffer in self._layer.buffers(include_sublayers=True):
        dist.broadcast(
            buffer, self._global_root_rank, self._group, use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)

def _get_size(self):
    # self.random_interval defaults to 10, so self._input_size is re-sampled
    # every `random_interval` iterations.
    image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
    if self._step % self.random_interval == 0:
        size_factor = random.randint(*self.size_range)
        size = [
            self.size_stride * size_factor,
            self.size_stride * int(size_factor * image_ratio)
        ]
        size = paddle.to_tensor(size)
        if dist.get_world_size() > 1 and paddle_distributed_is_initialized():
            dist.barrier()
            dist.broadcast(size, 0)
        self._input_size = size
    self._step += 1

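# A minimal sketch of the "rank 0 decides, the rest follow" pattern used in
# _get_size above: a randomly chosen size is broadcast from rank 0 so that every
# rank trains on the same input resolution. The helper name and default values
# below are illustrative assumptions, not part of the original code.
import random

import paddle
import paddle.distributed as dist


def pick_shared_size(size_range=(10, 20), stride=32):
    factor = random.randint(*size_range)
    size = paddle.to_tensor([stride * factor, stride * factor])
    if dist.get_world_size() > 1:
        dist.barrier()            # make sure every rank has reached this point
        dist.broadcast(size, 0)   # rank 0's random choice overwrites the rest
    return size
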
def _broadcast_params(self):
    """Broadcast the parameters of the current rank to each of the other ranks."""
    assert self._default_device == "gpu", "Only the GPU device is supported"

    # Exchange all the shards with the other ranks
    for dtype_per_rank in self.param_storages.values():
        for dst_rank, internal_storage in dtype_per_rank.items():
            dist.broadcast(
                tensor=internal_storage.buffer,
                src=dst_rank,
                group=self.group,
                use_calc_stream=True)

            # Multi stream operation will be supported later
            dist.wait(
                tensor=internal_storage.buffer,
                group=self.group,
                use_calc_stream=True)

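# Sketch of the shard-exchange idea behind _broadcast_params: each rank owns one
# flat storage buffer and broadcasts it with src set to its owner, so after the
# loop every rank holds every shard. The dict layout below is an assumption for
# illustration, not the real InternalStorage structure.
import paddle.distributed as dist


def exchange_shards(shard_buffers, group=None):
    # shard_buffers: dict mapping owner rank -> flat paddle.Tensor owned by it
    for owner_rank, buf in shard_buffers.items():
        dist.broadcast(buf, src=owner_rank, group=group)
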
def _sync_params_buffers(self):
    """Broadcast buffers and parameters from rank 0 so every rank starts identically."""
    for buffer in self._layers.buffers():
        dist.broadcast(buffer, src=0)
    for param in self._layers.parameters():
        dist.broadcast(param, src=0)

def test_create_process_group_nccl(self):
    with _test_eager_guard():
        paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)

        pg = init_process_group()
        print("rank:", pg.rank(), "size:", pg.size(), "name:", pg.name())
        print("test new group api ok")

        # test allreduce sum
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)

        sum_result = tensor_x + tensor_y
        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x)
            assert np.array_equal(tensor_x, sum_result)
        else:
            task = dist.all_reduce(tensor_y)
            assert np.array_equal(tensor_y, sum_result)

        print("test allreduce sum api ok")

        # test allreduce max
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        max_result = paddle.maximum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x,
                                   dist.ReduceOp.MAX,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, max_result)
        else:
            task = dist.all_reduce(tensor_y,
                                   dist.ReduceOp.MAX,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, max_result)

        print("test allreduce max api ok")

        # test allreduce min
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        min_result = paddle.minimum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x,
                                   dist.ReduceOp.MIN,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, min_result)
        else:
            task = dist.all_reduce(tensor_y,
                                   dist.ReduceOp.MIN,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, min_result)

        print("test allreduce min api ok")

        # test allreduce prod
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        prod_result = np.multiply(x, y)

        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x,
                                   dist.ReduceOp.PROD,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, prod_result)
        else:
            task = dist.all_reduce(tensor_y,
                                   dist.ReduceOp.PROD,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, prod_result)

        print("test allreduce prod api ok")

        # test broadcast
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)

        broadcast_result = paddle.assign(tensor_x)
        if pg.rank() == 0:
            task = dist.broadcast(tensor_x, 0, use_calc_stream=False)
            task.synchronize()
            paddle.device.cuda.synchronize()
            assert task.is_completed()
            assert np.array_equal(broadcast_result, tensor_x)
        else:
            task = dist.broadcast(tensor_y, 0)
            paddle.device.cuda.synchronize()
            assert np.array_equal(broadcast_result, tensor_y)

        print("test broadcast api ok")

        # test barrier
        # rank 0
        if pg.rank() == 0:
            dist.barrier()
        # rank 1
        else:
            task = pg.barrier()
            task.wait()

        print("test barrier api ok\n")

        # test allgather
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        out_shape = list(self.shape)
        out_shape[0] *= 2
        out = np.random.random(out_shape).astype(self.dtype)
        tensor_out = paddle.to_tensor(out)
        if pg.rank() == 0:
            task = pg.all_gather(tensor_x, tensor_out)
            task.wait()
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            tensor_out_list = [
                paddle.empty_like(tensor_x),
                paddle.empty_like(tensor_x)
            ]
            task = dist.all_gather(tensor_out_list,
                                   tensor_y,
                                   use_calc_stream=False)
            paddle.device.cuda.synchronize()
            tensor_out = paddle.concat(tensor_out_list)
        out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
        out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
                             [out_shape[0]])
        assert np.array_equal(tensor_x, out_1)
        assert np.array_equal(tensor_y, out_2)
        print("test allgather api ok\n")

        if pg.rank() == 0:
            task = pg.all_gather(tensor_x, tensor_out)
            task.wait()
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            tensor_out_list = []
            task = dist.all_gather(tensor_out_list,
                                   tensor_y,
                                   use_calc_stream=False)
            paddle.device.cuda.synchronize()
            tensor_out = paddle.concat(tensor_out_list)
        out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
        out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
                             [out_shape[0]])
        assert np.array_equal(tensor_x, out_1)
        assert np.array_equal(tensor_y, out_2)
        print("test allgather api2 ok\n")

        # test alltoall
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        out1 = np.random.random(self.shape).astype(self.dtype)
        out2 = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        tensor_out1 = paddle.to_tensor(out1)
        tensor_out2 = paddle.to_tensor(out2)
        raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
                                      [self.shape[0]])
        raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
                                      [self.shape[0] // 2])
        if pg.rank() == 0:
            task = pg.alltoall(tensor_x, tensor_out1)
            task.wait()
        # rank 1
        else:
            in_1, in_2 = paddle.split(tensor_y, 2)
            out_1, out_2 = paddle.split(tensor_out2, 2)
            out_tensor_list = [out_1, out_2]
            task = dist.alltoall([in_1, in_2], out_tensor_list)
            paddle.device.cuda.synchronize()
            tensor_out2 = paddle.concat(out_tensor_list)
        out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
                              [self.shape[0]])
        out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
        if pg.rank() == 0:
            assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
        else:
            assert np.array_equal(out2_1, raw_tensor_x_2)
        print("test alltoall api ok\n")

        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        out1 = np.random.random(self.shape).astype(self.dtype)
        out2 = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        tensor_out1 = paddle.to_tensor(out1)
        tensor_out2 = paddle.to_tensor(out2)
        raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
                                      [self.shape[0]])
        raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
                                      [self.shape[0] // 2])
        if pg.rank() == 0:
            task = pg.alltoall(tensor_x, tensor_out1)
            task.wait()
        # rank 1
        else:
            in_1, in_2 = paddle.split(tensor_y, 2)
            out_1, out_2 = paddle.split(tensor_out2, 2)
            out_tensor_list = []
            task = dist.alltoall([in_1, in_2], out_tensor_list)
            paddle.device.cuda.synchronize()
            tensor_out2 = paddle.concat(out_tensor_list)
        out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
                              [self.shape[0]])
        out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
        if pg.rank() == 0:
            assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
        else:
            assert np.array_equal(out2_1, raw_tensor_x_2)
        print("test alltoall api2 ok\n")

        # test Reduce
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        sum_result = tensor_x + tensor_y
        if pg.rank() == 0:
            task = dist.reduce(tensor_x, 0, use_calc_stream=True)
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            task = dist.reduce(tensor_y, 0, use_calc_stream=False)
            task.wait()
            paddle.device.cuda.synchronize()
        if pg.rank() == 0:
            assert np.array_equal(tensor_x, sum_result)
        print("test reduce sum api ok\n")

        # test reduce max
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        max_result = paddle.maximum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.reduce(tensor_x,
                               0,
                               dist.ReduceOp.MAX,
                               use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, max_result)
        else:
            task = dist.reduce(tensor_y,
                               0,
                               dist.ReduceOp.MAX,
                               use_calc_stream=False)
            task.wait()

        print("test reduce max api ok")

        # test reduce min
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        min_result = paddle.minimum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.reduce(tensor_x,
                               0,
                               dist.ReduceOp.MIN,
                               use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, min_result)
        else:
            task = dist.reduce(tensor_y,
                               0,
                               dist.ReduceOp.MIN,
                               use_calc_stream=False)
            task.wait()

        print("test reduce min api ok")

        # test reduce product
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        prod_result = np.multiply(x, y)

        if pg.rank() == 0:
            task = dist.reduce(tensor_x,
                               0,
                               dist.ReduceOp.PROD,
                               use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, prod_result)
        else:
            task = dist.reduce(tensor_y,
                               0,
                               dist.ReduceOp.PROD,
                               use_calc_stream=False)
            task.wait()

        print("test reduce prod api ok")

        # test Scatter
        # rank 0
        in_shape = list(self.shape)
        in_shape[0] *= 2
        x = np.random.random(in_shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        if pg.rank() == 0:
            in_1, in_2 = paddle.split(tensor_x, 2)
            task = dist.scatter(tensor_y, [in_1, in_2],
                                0,
                                use_calc_stream=True)
            # task.wait()
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            task = dist.scatter(tensor_y, [], 0, use_calc_stream=False)
            task.wait()
            paddle.device.cuda.synchronize()
        out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
        out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
                            [self.shape[0] * 2])
        if pg.rank() == 0:
            assert np.array_equal(tensor_y, out1)
        else:
            assert np.array_equal(tensor_y, out2)
        print("test scatter api ok\n")

        # test send / recv
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        if pg.rank() == 0:
            task = dist.send(tensor_x, 1, use_calc_stream=False)
            task.wait()
        else:
            task = dist.recv(tensor_y, 0, use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, tensor_x)

        print("test send api ok")

        # test send / recv on the calculation stream
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        if pg.rank() == 0:
            task = dist.send(tensor_x, 1, use_calc_stream=True)
        else:
            task = dist.recv(tensor_y, 0, use_calc_stream=True)
            assert np.array_equal(tensor_y, tensor_x)

        print("test send api ok")