def _sync_buffers(self):
    """Broadcast every buffer (e.g. batch norm statistics) from the global root rank."""
    for buffer in self._layer.buffers(include_sublayers=True):
        dist.broadcast(
            buffer, self._global_root_rank, self._group, use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)

def _sync_params_and_buffers(self):
    """
    Sync all model states for all ranks.
    """
    for p in self._layer.parameters():
        dist.broadcast(
            p,
            src=self._global_root_rank,
            group=self._group,
            use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=p, group=self._group, use_calc_stream=True)

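# For context, a minimal standalone sketch of the same idea using only the public
# paddle.distributed API. The helper name `sync_from_root` and the nn.Linear model
# are illustrative assumptions, not part of the code above.
import paddle
import paddle.distributed as dist
from paddle import nn


def sync_from_root(layer, root_rank=0, group=None):
    # Broadcast every parameter and buffer from root_rank so that all ranks
    # start from an identical copy of the model state.
    for tensor in list(layer.parameters()) + list(layer.buffers()):
        dist.broadcast(tensor, src=root_rank, group=group)


if __name__ == "__main__":
    dist.init_parallel_env()
    model = nn.Linear(8, 8)
    sync_from_root(model)
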
def __sync_buffers(self):
    """
    Sync all the param buffers from all ranks (e.g. batch norm statistics).
    """
    for buffer in self._layer.buffers(include_sublayers=True):
        dist.broadcast(
            buffer, self._global_root_rank, self._group, use_calc_stream=True)
        # Multi stream operation will be supported later
        dist.wait(tensor=buffer, group=self._group, use_calc_stream=True)

def _get_size(self):
    # self.random_interval defaults to 10, so self._input_size is re-sampled
    # every `random_interval` iterations.
    image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
    if self._step % self.random_interval == 0:
        size_factor = random.randint(*self.size_range)
        size = [
            self.size_stride * size_factor,
            self.size_stride * int(size_factor * image_ratio)
        ]
        size = paddle.to_tensor(size)
        if dist.get_world_size() > 1 and paddle_distributed_is_initialized():
            dist.barrier()
            dist.broadcast(size, 0)
        self._input_size = size
    self._step += 1

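# A minimal sketch of the "rank 0 decides, the rest follow" pattern used in
# _get_size above: a randomly chosen size is broadcast from rank 0 so that every
# rank trains on the same input resolution. The helper name and default values
# below are illustrative assumptions, not part of the original code.
import random

import paddle
import paddle.distributed as dist


def pick_shared_size(size_range=(10, 20), stride=32):
    factor = random.randint(*size_range)
    size = paddle.to_tensor([stride * factor, stride * factor])
    if dist.get_world_size() > 1:
        dist.barrier()            # make sure every rank has reached this point
        dist.broadcast(size, 0)   # rank 0's random choice overwrites the rest
    return size
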
def _broadcast_params(self):
    """Broadcast the parameters of the current rank to each of the other ranks."""
    assert self._default_device == "gpu", "Only the GPU device is supported"

    # Exchange all the shards with the other ranks
    for dtype_per_rank in self.param_storages.values():
        for dst_rank, internal_storage in dtype_per_rank.items():
            dist.broadcast(
                tensor=internal_storage.buffer,
                src=dst_rank,
                group=self.group,
                use_calc_stream=True)

            # Multi stream operation will be supported later
            dist.wait(
                tensor=internal_storage.buffer,
                group=self.group,
                use_calc_stream=True)

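# Sketch of the shard-exchange idea behind _broadcast_params: each rank owns one
# flat storage buffer and broadcasts it with src set to its owner, so after the
# loop every rank holds every shard. The dict layout below is an assumption for
# illustration, not the real InternalStorage structure.
import paddle.distributed as dist


def exchange_shards(shard_buffers, group=None):
    # shard_buffers: dict mapping owner rank -> flat paddle.Tensor owned by it
    for owner_rank, buf in shard_buffers.items():
        dist.broadcast(buf, src=owner_rank, group=group)
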
def _sync_params_buffers(self):
    """Broadcast buffers and parameters from rank 0 so every rank starts identically."""
    for buffer in self._layers.buffers():
        dist.broadcast(buffer, src=0)
    for param in self._layers.parameters():
        dist.broadcast(param, src=0)

def test_create_process_group_nccl(self):
    with _test_eager_guard():
        paddle.set_device('gpu:%d' % paddle.distributed.ParallelEnv().dev_id)

        pg = init_process_group()
        print("rank:", pg.rank(), "size:", pg.size(), "name:", pg.name())
        print("test new group api ok")

        # test allreduce sum
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)

        sum_result = tensor_x + tensor_y
        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x)
            assert np.array_equal(tensor_x, sum_result)
        else:
            task = dist.all_reduce(tensor_y)
            assert np.array_equal(tensor_y, sum_result)

        print("test allreduce sum api ok")

        # test allreduce max
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        max_result = paddle.maximum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x,
                                   dist.ReduceOp.MAX,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, max_result)
        else:
            task = dist.all_reduce(tensor_y,
                                   dist.ReduceOp.MAX,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, max_result)

        print("test allreduce max api ok")

        # test allreduce min
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        min_result = paddle.minimum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x,
                                   dist.ReduceOp.MIN,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, min_result)
        else:
            task = dist.all_reduce(tensor_y,
                                   dist.ReduceOp.MIN,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, min_result)

        print("test allreduce min api ok")

        # test allreduce prod
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        prod_result = np.multiply(x, y)

        if pg.rank() == 0:
            task = dist.all_reduce(tensor_x,
                                   dist.ReduceOp.PROD,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, prod_result)
        else:
            task = dist.all_reduce(tensor_y,
                                   dist.ReduceOp.PROD,
                                   use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, prod_result)

        print("test allreduce prod api ok")

        # test broadcast
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)

        broadcast_result = paddle.assign(tensor_x)
        if pg.rank() == 0:
            task = dist.broadcast(tensor_x, 0, use_calc_stream=False)
            task.synchronize()
            paddle.device.cuda.synchronize()
            assert task.is_completed()
            assert np.array_equal(broadcast_result, tensor_x)
        else:
            task = dist.broadcast(tensor_y, 0)
            paddle.device.cuda.synchronize()
            assert np.array_equal(broadcast_result, tensor_y)

        print("test broadcast api ok")

        # test barrier
        # rank 0
        if pg.rank() == 0:
            dist.barrier()
        # rank 1
        else:
            task = pg.barrier()
            task.wait()

        print("test barrier api ok\n")

        # test allgather
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        out_shape = list(self.shape)
        out_shape[0] *= 2
        out = np.random.random(out_shape).astype(self.dtype)
        tensor_out = paddle.to_tensor(out)
        if pg.rank() == 0:
            task = pg.all_gather(tensor_x, tensor_out)
            task.wait()
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            tensor_out_list = [
                paddle.empty_like(tensor_x),
                paddle.empty_like(tensor_x)
            ]
            task = dist.all_gather(tensor_out_list,
                                   tensor_y,
                                   use_calc_stream=False)
            paddle.device.cuda.synchronize()
            tensor_out = paddle.concat(tensor_out_list)
        out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
        out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
                             [out_shape[0]])
        assert np.array_equal(tensor_x, out_1)
        assert np.array_equal(tensor_y, out_2)
        print("test allgather api ok\n")

        if pg.rank() == 0:
            task = pg.all_gather(tensor_x, tensor_out)
            task.wait()
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            tensor_out_list = []
            task = dist.all_gather(tensor_out_list,
                                   tensor_y,
                                   use_calc_stream=False)
            paddle.device.cuda.synchronize()
            tensor_out = paddle.concat(tensor_out_list)
        out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2])
        out_2 = paddle.slice(tensor_out, [0], [out_shape[0] // 2],
                             [out_shape[0]])
        assert np.array_equal(tensor_x, out_1)
        assert np.array_equal(tensor_y, out_2)
        print("test allgather api2 ok\n")

        # test alltoall
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        out1 = np.random.random(self.shape).astype(self.dtype)
        out2 = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        tensor_out1 = paddle.to_tensor(out1)
        tensor_out2 = paddle.to_tensor(out2)
        raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
                                      [self.shape[0]])
        raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
                                      [self.shape[0] // 2])
        if pg.rank() == 0:
            task = pg.alltoall(tensor_x, tensor_out1)
            task.wait()
        # rank 1
        else:
            in_1, in_2 = paddle.split(tensor_y, 2)
            out_1, out_2 = paddle.split(tensor_out2, 2)
            out_tensor_list = [out_1, out_2]
            task = dist.alltoall([in_1, in_2], out_tensor_list)
            paddle.device.cuda.synchronize()
            tensor_out2 = paddle.concat(out_tensor_list)
        out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
                              [self.shape[0]])
        out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
        if pg.rank() == 0:
            assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
        else:
            assert np.array_equal(out2_1, raw_tensor_x_2)
        print("test alltoall api ok\n")

        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        out1 = np.random.random(self.shape).astype(self.dtype)
        out2 = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        tensor_out1 = paddle.to_tensor(out1)
        tensor_out2 = paddle.to_tensor(out2)
        raw_tensor_x_2 = paddle.slice(tensor_x, [0], [self.shape[0] // 2],
                                      [self.shape[0]])
        raw_tensor_y_1 = paddle.slice(tensor_y, [0], [0],
                                      [self.shape[0] // 2])
        if pg.rank() == 0:
            task = pg.alltoall(tensor_x, tensor_out1)
            task.wait()
        # rank 1
        else:
            in_1, in_2 = paddle.split(tensor_y, 2)
            out_1, out_2 = paddle.split(tensor_out2, 2)
            out_tensor_list = []
            task = dist.alltoall([in_1, in_2], out_tensor_list)
            paddle.device.cuda.synchronize()
            tensor_out2 = paddle.concat(out_tensor_list)
        out1_2 = paddle.slice(tensor_out1, [0], [self.shape[0] // 2],
                              [self.shape[0]])
        out2_1 = paddle.slice(tensor_out2, [0], [0], [self.shape[0] // 2])
        if pg.rank() == 0:
            assert np.array_equal(out1_2.numpy(), raw_tensor_y_1.numpy())
        else:
            assert np.array_equal(out2_1, raw_tensor_x_2)
        print("test alltoall api2 ok\n")

        # test Reduce
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        sum_result = tensor_x + tensor_y
        if pg.rank() == 0:
            task = dist.reduce(tensor_x, 0, use_calc_stream=True)
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            task = dist.reduce(tensor_y, 0, use_calc_stream=False)
            task.wait()
            paddle.device.cuda.synchronize()
        if pg.rank() == 0:
            assert np.array_equal(tensor_x, sum_result)
        print("test reduce sum api ok\n")

        # test reduce max
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        max_result = paddle.maximum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.reduce(tensor_x,
                               0,
                               dist.ReduceOp.MAX,
                               use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, max_result)
        else:
            task = dist.reduce(tensor_y,
                               0,
                               dist.ReduceOp.MAX,
                               use_calc_stream=False)
            task.wait()

        print("test reduce max api ok")

        # test reduce min
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        min_result = paddle.minimum(tensor_x, tensor_y)

        if pg.rank() == 0:
            task = dist.reduce(tensor_x,
                               0,
                               dist.ReduceOp.MIN,
                               use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, min_result)
        else:
            task = dist.reduce(tensor_y,
                               0,
                               dist.ReduceOp.MIN,
                               use_calc_stream=False)
            task.wait()

        print("test reduce min api ok")

        # test reduce product
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        prod_result = np.multiply(x, y)

        if pg.rank() == 0:
            task = dist.reduce(tensor_x,
                               0,
                               dist.ReduceOp.PROD,
                               use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_x, prod_result)
        else:
            task = dist.reduce(tensor_y,
                               0,
                               dist.ReduceOp.PROD,
                               use_calc_stream=False)
            task.wait()

        print("test reduce prod api ok")

        # test Scatter
        # rank 0
        in_shape = list(self.shape)
        in_shape[0] *= 2
        x = np.random.random(in_shape).astype(self.dtype)
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        tensor_y = paddle.to_tensor(y)
        if pg.rank() == 0:
            in_1, in_2 = paddle.split(tensor_x, 2)
            task = dist.scatter(tensor_y, [in_1, in_2],
                                0,
                                use_calc_stream=True)
            # task.wait()
            paddle.device.cuda.synchronize()
        # rank 1
        else:
            task = dist.scatter(tensor_y, [], 0, use_calc_stream=False)
            task.wait()
            paddle.device.cuda.synchronize()
        out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]])
        out2 = paddle.slice(tensor_x, [0], [self.shape[0]],
                            [self.shape[0] * 2])
        if pg.rank() == 0:
            assert np.array_equal(tensor_y, out1)
        else:
            assert np.array_equal(tensor_y, out2)
        print("test scatter api ok\n")

        # test send / recv
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        if pg.rank() == 0:
            task = dist.send(tensor_x, 1, use_calc_stream=False)
            task.wait()
        else:
            task = dist.recv(tensor_y, 0, use_calc_stream=False)
            task.wait()
            assert np.array_equal(tensor_y, tensor_x)

        print("test send api ok")

        # test send / recv on the calculation stream
        # rank 0
        x = np.random.random(self.shape).astype(self.dtype)
        tensor_x = paddle.to_tensor(x)
        # rank 1
        y = np.random.random(self.shape).astype(self.dtype)
        tensor_y = paddle.to_tensor(y)
        if pg.rank() == 0:
            task = dist.send(tensor_x, 1, use_calc_stream=True)
        else:
            task = dist.recv(tensor_y, 0, use_calc_stream=True)
            assert np.array_equal(tensor_y, tensor_x)

        print("test send api ok")