def check_finite_and_unscale(x, scale, name=None, float_status=None):
    """
    Check if input X contains all finite data, if yes, scale it by input Scale.

    $$Out = X / scale$$

    If any tensor in X contains Inf or Nan, Out will generate an indicator.
    FoundInfinite will be 1 (True), and Out will not be scaled. In this case, the data of
    Out should not be used, and its data may not be deterministic.
    Otherwise, FoundInfinite will be 0 (False).

    Args:
        x(list|tuple): The input tensors of check_finite_and_unscale operator.
        scale: The scale of check_finite_and_unscale operator.
        float_status(Tensor): (Only used on NPU) The float status to check overflow.
    """
    check_type(x, 'x', (tuple, list), 'check_finite_and_unscale')
    for e in x:
        check_variable_and_dtype(e, "x", ['float16', 'float32', 'float64'],
                                 'check_finite_and_unscale')

    helper = LayerHelper("check_finite_and_unscale", **locals())
    found_inf = helper.create_variable_for_type_inference(dtype='bool')

    inputs = {'X': x, 'Scale': scale}
    if core.is_compiled_with_npu():
        check_variable_and_dtype(float_status, "float_status",
                                 ['float16', 'float32'],
                                 'check_finite_and_unscale')
        inputs['FloatStatus'] = float_status
    outputs = {'Out': x, 'FoundInfinite': found_inf}
    helper.append_op(
        type='check_finite_and_unscale', inputs=inputs, outputs=outputs)

    return x, found_inf
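# A minimal sketch (static-graph mode; the variable names 'g0' and the loss
# scale value are illustrative only) of how the wrapper above is typically
# used inside AMP: the scaled gradients are divided by the loss scale in
# place, and found_inf flags any Inf/NaN.
import paddle

paddle.enable_static()
main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog):
    grads = [paddle.static.data('g0', [8, 8], 'float32')]
    loss_scaling = paddle.static.create_global_var(
        shape=[1], value=1024.0, dtype='float32', persistable=True)
    # grads is unscaled in place; found_inf is a bool tensor
    grads, found_inf = check_finite_and_unscale(grads, loss_scaling)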
def __init__(self):
    self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

    # imperative mode only supports one device (gpu/xpu/npu/mlu) per process
    if core.is_compiled_with_cuda():
        selected_gpus = os.getenv("FLAGS_selected_gpus", "0").split(",")
        self._device_id = int(selected_gpus[0])
    elif core.is_compiled_with_xpu():
        selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
        self._device_id = int(selected_xpus[0])
    elif core.is_compiled_with_npu():
        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
        self._device_id = int(selected_npus[0])
    elif core.is_compiled_with_mlu():
        selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
        self._device_id = int(selected_mlus[0])

    self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                        "").split(",")
    self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
    self._nrings = int(os.getenv("FLAGS_nccl_nrings", "1"))
    assert self._nrings > 0, \
        "nccl_nrings must be an integer greater than 0."
    assert self._nrings < 9, \
        "nccl_nrings should be less than 9, which is enough in most scenarios."
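# A minimal sketch showing how the environment parsed above is typically
# queried through the public paddle.distributed.ParallelEnv wrapper
# (assuming a launcher has already exported PADDLE_TRAINER_ID and friends).
import paddle.distributed as dist

env = dist.ParallelEnv()
print(env.rank)         # trainer id of the current process
print(env.world_size)   # total number of trainers
print(env.device_id)    # local accelerator index for this process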
def _is_cpuonly(backend):
    check_backend(backend)
    if backend in [
            'auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'
    ] and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
           or core.is_compiled_with_npu() or core.is_compiled_with_mlu()):
        # the backend ('auto' included) maps to a device library that was
        # compiled in, so the default device logic applies: not CPU-only
        return False
    else:
        return True
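# A minimal sketch (not part of the module above) of how the backend string
# is typically chosen before init_parallel_env() runs. The environment
# variable PADDLE_DISTRI_BACKEND matches the one read in init_parallel_env()
# below; everything else here is illustrative only.
import os
import paddle

# Force CPU-only collective training with the gloo backend.
os.environ['PADDLE_DISTRI_BACKEND'] = 'gloo'

# With 'auto', a device backend is picked when the corresponding device
# support was compiled in; otherwise gloo is used.
if paddle.device.is_compiled_with_npu():
    backend = 'hccl'
elif paddle.is_compiled_with_cuda():
    backend = 'nccl'
else:
    backend = 'gloo'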
def is_compiled_with_npu():
    """
    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.

    Returns (bool): `True` if NPU is supported, otherwise `False`.

    Examples:
        .. code-block:: python

            import paddle
            support_npu = paddle.device.is_compiled_with_npu()
    """
    return core.is_compiled_with_npu()
def test_case(self):
    import paddle

    if core.is_compiled_with_npu():
        place = core.NPUPlace(0)
    else:
        place = core.CPUPlace()
    with fluid.dygraph.guard(place):
        input_data = np.random.random((2, 3, 6, 6)).astype("float32")
        scale_np = np.array([2, 2]).astype("int64")
        input_x = paddle.to_tensor(input_data)
        scale = paddle.to_tensor(scale_np)
        expect_res = nearest_neighbor_interp_np(
            input_data, out_h=12, out_w=12, align_corners=False)
        out = interpolate(
            x=input_x,
            scale_factor=scale,
            mode="nearest",
            align_corners=False)
        self.assertTrue(np.allclose(out.numpy(), expect_res))
def get_places(self):
    places = [core.CPUPlace()]
    if core.is_compiled_with_npu():
        places.append(core.NPUPlace(0))
    return places
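# A minimal sketch of how the place list above is typically consumed in a
# unit test: the same check runs once per available device. The helper name
# run_check_on_place is hypothetical, standing in for whatever per-place
# assertion the real test class defines.
def test_all_places(self):
    for place in self.get_places():
        self.run_check_on_place(place)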
def __init__(self,
             startup_program,
             main_program,
             num_mp=1,
             num_pp=1,
             micro_batch_size=1,
             beam_size=1,
             init_comm=True,
             role_maker=None):

    assert isinstance(startup_program, Program)
    assert isinstance(main_program, Program)

    self._device = None
    if core.is_compiled_with_npu():
        self._device = "npu"
    elif core.is_compiled_with_cuda():
        self._device = "gpu"
    assert self._device, "Only gpu and npu are supported."

    assert not _non_static_mode(), "Only static mode is supported."

    op_maker = core.op_proto_and_checker_maker
    self._op_role = op_maker.OpRole
    self._op_role_key = op_maker.kOpRoleAttrName()
    self._op_device_key = op_maker.kOpDeviceAttrName()

    self._param_device_map = dict()
    self._pipeline_pair = []
    self._pipeline_pair_in_while = []
    self._pp_ring_map = dict()
    self.ring_id = 20  # Just a magic number

    self.micro_batch_size = micro_batch_size
    self.beam_size = beam_size
    self.init_comm = init_comm

    self._output_var_to_op = None
    self._input_var_to_op = None
    self._main_program = main_program
    self._startup_program = startup_program

    if role_maker is None:
        self.role_maker = fleet.base.role_maker.PaddleCloudRoleMaker(
            is_collective=True)
    else:
        if isinstance(role_maker, fleet.base.role_maker.RoleMakerBase):
            assert role_maker._is_collective == True
            self.role_maker = role_maker

    # communication_group info
    self.mp_ring_id = 0
    self.global_ring_id = 1
    self.endpoints = self.role_maker._get_trainer_endpoints()
    self.current_endpoint = self.endpoints[self.role_maker._worker_index()]
    self.rank = self.role_maker._worker_index()
    self.nranks = self.role_maker._worker_num()
    assert num_mp * num_pp == self.nranks
    self.num_pp = num_pp
    self.num_mp = num_mp

    # global ring info
    self.global_endpoints = self.endpoints
    self.global_rank = self.rank
    self.global_nranks = self.nranks

    arr = np.arange(0, self.num_pp * self.num_mp).reshape(
        [self.num_pp, self.num_mp])
    ipp, imp = np.where(arr == self.rank)
    ipp = ipp[0]
    imp = imp[0]
    self.mp_group = arr[ipp, :]
    self.pp_group = arr[:, imp]

    self._stage = ipp
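# A small illustration (plain numpy, independent of the class above) of how
# the rank -> (pipeline stage, model-parallel index) mapping computed at the
# end of __init__ behaves. With num_pp=2 and num_mp=4, ranks are laid out
# row by row, so rank 6 sits in pipeline stage 1, model-parallel slot 2.
import numpy as np

num_pp, num_mp, rank = 2, 4, 6
arr = np.arange(0, num_pp * num_mp).reshape([num_pp, num_mp])
ipp, imp = np.where(arr == rank)
ipp, imp = ipp[0], imp[0]
print(ipp, imp)      # 1 2
print(arr[ipp, :])   # model-parallel group: [4 5 6 7]
print(arr[:, imp])   # pipeline group: [2 6]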
def _convert_to_place(device):
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError("The device should not be 'gpu', "
                             "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    elif lower_device == 'xpu':
        if not core.is_compiled_with_xpu():
            raise ValueError("The device should not be 'xpu', "
                             "since PaddlePaddle is not compiled with XPU")
        selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
        device_id = int(selected_xpus[0])
        place = core.XPUPlace(device_id)
    elif lower_device == 'npu':
        if not core.is_compiled_with_npu():
            raise ValueError("The device should not be 'npu', "
                             "since PaddlePaddle is not compiled with NPU")
        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
        device_id = int(selected_npus[0])
        place = core.NPUPlace(device_id)
    elif lower_device == 'ipu':
        if not core.is_compiled_with_ipu():
            raise ValueError("The device should not be 'ipu', "
                             "since PaddlePaddle is not compiled with IPU")
        place = core.IPUPlace()
    elif lower_device == 'mlu':
        if not core.is_compiled_with_mlu():
            raise ValueError("The device should not be 'mlu', "
                             "since PaddlePaddle is not compiled with MLU")
        selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
        device_id = int(selected_mlus[0])
        place = core.MLUPlace(device_id)
    elif device in core.get_all_custom_device_type():
        place = core.CustomPlace(device, 0)
    else:
        available_gpu_device = re.match(r'gpu:\d+', lower_device)
        available_xpu_device = re.match(r'xpu:\d+', lower_device)
        available_npu_device = re.match(r'npu:\d+', lower_device)
        available_mlu_device = re.match(r'mlu:\d+', lower_device)
        if not available_gpu_device and not available_xpu_device \
                and not available_npu_device and not available_mlu_device:
            device_info_list = device.split(':', 1)
            device_type = device_info_list[0]
            if device_type in core.get_all_custom_device_type():
                device_id = device_info_list[1]
                device_id = int(device_id)
                place = core.CustomPlace(device_type, device_id)
            else:
                raise ValueError(
                    "The device must be a string which is like 'cpu', {}".
                    format(', '.join("'{}', '{}:x'".format(x, x)
                                     for x in ['gpu', 'xpu', 'npu', 'mlu'] +
                                     core.get_all_custom_device_type())))
        if available_gpu_device:
            if not core.is_compiled_with_cuda():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with CUDA".format(available_gpu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.CUDAPlace(device_id)
        if available_xpu_device:
            if not core.is_compiled_with_xpu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with XPU".format(available_xpu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.XPUPlace(device_id)
        if available_npu_device:
            if not core.is_compiled_with_npu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with NPU".format(available_npu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.NPUPlace(device_id)
        if available_mlu_device:
            if not core.is_compiled_with_mlu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with MLU".format(available_mlu_device))
            device_info_list = device.split(':', 1)
            device_id = device_info_list[1]
            device_id = int(device_id)
            place = core.MLUPlace(device_id)

    return place
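# A minimal sketch (via the public wrapper paddle.set_device(), which routes
# its argument through _convert_to_place above) showing the device strings
# the parser accepts.
import paddle

paddle.set_device('cpu')        # -> core.CPUPlace()
# Device-indexed forms are valid only when matching support is compiled in:
# paddle.set_device('gpu:0')    # -> core.CUDAPlace(0)
# paddle.set_device('npu:0')    # -> core.NPUPlace(0)
# paddle.set_device('xpu:0')    # -> core.XPUPlace(0)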
def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.

    .. note::
        Now initialize both `NCCL` and `GLOO` contexts for communication.

    Args:
        None. The communication backend is selected via the environment
        variable ``PADDLE_DISTRI_BACKEND`` and should be one of 'gloo' (for cpu),
        'nccl' (for cuda), 'bkcl' (for xpu), 'hccl' (for npu), 'cncl' (for mlu)
        or 'auto' (auto detect). Auto detection prefers a device backend over 'gloo'.

    Returns:
        None

    Examples:
        .. code-block:: python
            # required: gpu
            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss.backward()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """

    # 0. get env & check world size
    global _global_parallel_env
    # when init_parallel_env is called, `_global_parallel_env` needs to be updated
    _global_parallel_env = ParallelEnv()
    parallel_env = _global_parallel_env
    # if not parallel, `init_parallel_env` does nothing
    if parallel_env.world_size < 2:
        warnings.warn(
            "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
        )
        return
    # NOTE(xiongkun): support cpu gloo only, add this environment variable to
    #                 enable CPU-only gloo parallel training
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. device check: must be gpu/xpu/npu/mlu unless the backend is CPU-only
    if not (is_cpu_only or core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()):
        raise NotImplementedError(
            "If you want to use CPU-only version, please use 'gloo' as backend")

    if not is_cpu_only and core.is_compiled_with_cuda():
        _check_var_exists("FLAGS_selected_gpus")
        backend = "nccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_xpu():
        _check_var_exists('FLAGS_selected_xpus')
        backend = "bkcl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_npu():
        _check_var_exists('FLAGS_selected_npus')
        backend = "hccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_mlu():
        _check_var_exists('FLAGS_selected_mlus')
        backend = "cncl" if backend == "auto" else backend

    _check_var_exists("PADDLE_TRAINER_ID")
    _check_var_exists("PADDLE_CURRENT_ENDPOINT")
    _check_var_exists("PADDLE_TRAINERS_NUM")
    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")

    # NOTE(chenweihang): [ why config global place here? ]
    # the dygraph mode will be set to default mode,
    # users will not call `dygraph.guard` or `enable_dygraph`
    # directly; if they want to switch the default place,
    # they need to call a function to change the default place,
    # so here just set the correct place for users
    if is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():
        place = core.CUDAPlace(parallel_env.device_id)
    elif core.is_compiled_with_xpu():
        place = core.XPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_npu():
        place = core.NPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_mlu():
        place = core.MLUPlace(parallel_env.device_id)

    _set_expected_place(place)

    group = None
    if backend in _valid_backend_list and in_dygraph_mode():
        if _default_group_name in _get_group_map_by_name():
            return _get_group_map_by_name()[_default_group_name]
        _set_default_backend(backend)
        rank = int(os.getenv("PADDLE_TRAINER_ID"))
        world_size = int(os.getenv("PADDLE_TRAINERS_NUM"))
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group.")
        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = ":".join([master_addr, master_port
                              ]) if master_addr and master_port else None
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:
            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0]
        assert endpoints, (
            "The environment variables 'MASTER_ADDR' and 'MASTER_PORT' "
            "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
            "and 'export MASTER_PORT=54612'. Or you can start your training "
            "with the paddle.distributed.run module.")
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(master_addr,
                                      master_port,
                                      is_master,
                                      world_size,
                                      stop_check_timeout=stop_check_timeout)
        _set_default_store(default_store)
        pg = _new_process_group_impl(backend,
                                     default_store,
                                     rank,
                                     world_size,
                                     _default_group_name,
                                     pg_options=None)
        ranks = list(range(world_size))
        group = Group(rank,
                      world_size,
                      id=0,
                      ranks=ranks,
                      pg=pg,
                      name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)
        _set_group_map(0, group)
        parallel_helper._set_parallel_ctx(True)
        paddle.distributed.barrier(group=group)
        return group

    node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
    # 3: init gloo context (step 1: http server start)
    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
    if is_cpu_only or init_gloo or backend == "heter":
        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
        manager = Manager()
        # global dict to store status
        http_server_d = manager.dict()
        http_server_d["running"] = False
        if parallel_env.rank == 0:
            # The scope for worker used by http server is '_worker'
            size = {'_worker': parallel_env.world_size}
            if backend == "heter":
                size = {'_worker': len(node_num)}
            http_server = Process(
                target=_start_kv_server,
                args=(int(ep_rank_0[1]), http_server_d, size))
            http_server.daemon = True
            http_server_d["running"] = True
            http_server.start()

    # 4. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = parallel_env.world_size
    strategy.local_rank = parallel_env.rank
    strategy.trainer_endpoints = parallel_env.trainer_endpoints
    strategy.current_endpoint = parallel_env.current_endpoint
    strategy.nrings = parallel_env.nrings

    # init nccl or hccl or bkcl or heter context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place))
    elif (backend == "heter"):
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id))
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place))
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place))
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place))
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place))

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
        other_endpoints.remove(strategy.current_endpoint)
        if not is_cpu_only and strategy.local_rank == 0:
            wait_server_ready(other_endpoints)

    parallel_helper._init_parallel_ctx()

    # 5: init gloo context (step 2: gloo init)
    # init_gloo is divided into two parts because nccl and gloo
    # separately look for free ports, which sometimes
    # leads to port conflicts.
    if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
        # compared to init_gloo, we don't need to
        # init gloo, because we do this in _init_parallel_ctx;
        http_server_d["running"] = False
        http_server.join()
    elif init_gloo:
        wait_server_ready([parallel_env.trainer_endpoints[0]])
        gloo_strategy = core.GlooParallelStrategy()
        gloo_strategy.rank = parallel_env.rank
        gloo_strategy.rank_num = parallel_env.world_size
        gloo_strategy.ip_address = ep_rank_0[0]
        gloo_strategy.ip_port = int(ep_rank_0[1])
        default_init_timeout_seconds = 3600
        default_run_timeout_seconds = 9999999
        gloo_strategy.init_seconds = default_init_timeout_seconds
        gloo_strategy.run_seconds = default_run_timeout_seconds
        gloo = core.GlooParallelContext(gloo_strategy)
        gloo.init()
        if parallel_env.rank == 0:
            http_server_d["running"] = False
            http_server.join()
    return group
def default_pinned():
    if core.is_compiled_with_cuda():
        return PlaceType.CUDA_PINNED
    elif core.is_compiled_with_npu():
        return PlaceType.NPU_PINNED
    return PlaceType.CPU
def default_device():
    if core.is_compiled_with_cuda():
        return PlaceType.CUDA
    elif core.is_compiled_with_npu():
        return PlaceType.NPU
    return PlaceType.CPU
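# A minimal sketch of how the two helpers above are typically used together,
# e.g. when staging a batch: sample data lands in pinned host memory while
# the model runs on the default compute device. The PlaceType values are the
# enum members referenced above; the usage itself is illustrative.
compute_place = default_device()   # CUDA / NPU / CPU depending on the build
staging_place = default_pinned()   # CUDA_PINNED / NPU_PINNED / CPU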
def set_place(self):
    return paddle.NPUPlace(0) if core.is_compiled_with_npu() \
        else fluid.CPUPlace()
def test_adam_api(self):
    # NOTE(zhiqiu): cpu and gpu have different seeds, so results should be compared separately.
    self._test_with_place(paddle.CPUPlace())
    if core.is_compiled_with_npu():
        self._test_with_place(paddle.NPUPlace(0))
def _unscale(self, optimizer):
    """
    Unscale the gradients of parameters, i.e. multiply the gradients by 1/(loss scaling ratio).
    If this instance of :class:`GradScaler` is not enabled, outputs are returned unmodified.

    Args:
        optimizer(Optimizer): The optimizer used to update parameters.

    Returns:
        The unscaled parameters or original parameters.
    """
    if not self._enable:
        return

    optimizer_state = self._optimizer_states[id(optimizer)]

    if optimizer_state["state"] is OptimizerState.UNSCALED:
        raise RuntimeError(
            "unscale_() has already been called on this optimizer since the last update()."
        )
    elif optimizer_state["state"] is OptimizerState.STEPPED:
        raise RuntimeError("unscale_() is being called after step().")

    if getattr(optimizer, '_param_groups', None) and isinstance(
            optimizer._param_groups[0], dict):
        param_grads = []
        param_grads_fp16 = []
        param_grads_fp32 = []
        for group in optimizer._param_groups:
            for param in group['params']:
                if param._grad_ivar() is not None:
                    param_grads.append(param._grad_ivar())
                    if param._grad_ivar().dtype == core.VarDesc.VarType.FP16:
                        param_grads_fp16.append(param._grad_ivar())
                    else:
                        param_grads_fp32.append(param._grad_ivar())
    else:
        param_grads = [
            param._grad_ivar() for param in optimizer._parameter_list
            if param._grad_ivar() is not None
        ]
        param_grads_fp16 = [
            param._grad_ivar() for param in optimizer._parameter_list
            if (param._grad_ivar() is not None) and
            (param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
        ]
        param_grads_fp32 = [
            param._grad_ivar() for param in optimizer._parameter_list
            if (param._grad_ivar() is not None) and
            (param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
        ]
    if core.is_compiled_with_npu():
        float_status = _C_ops.alloc_float_status()
        _C_ops.clear_float_status(float_status, float_status)

        if len(param_grads_fp16):
            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                            float_status, param_grads_fp16,
                                            self._temp_found_inf_fp16)
        if len(param_grads_fp32):
            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                            float_status, param_grads_fp32,
                                            self._temp_found_inf_fp32)
    else:
        if len(param_grads_fp16):
            _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                            param_grads_fp16,
                                            self._temp_found_inf_fp16)
        if len(param_grads_fp32):
            _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                            param_grads_fp32,
                                            self._temp_found_inf_fp32)
    if len(param_grads_fp16) and len(param_grads_fp32):
        self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32
    elif len(param_grads_fp16):
        self._found_inf = self._temp_found_inf_fp16
    else:
        self._found_inf = self._temp_found_inf_fp32

    optimizer_state["state"] = OptimizerState.UNSCALED
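# A minimal sketch of the public AMP flow that ultimately calls _unscale()
# above (assuming paddle.amp.GradScaler exposes scale()/unscale_()/step()/
# update(), as in recent Paddle releases). Explicit unscaling is usually done
# so gradient clipping can happen between unscale_() and step().
import paddle

model = paddle.nn.Linear(10, 10)
opt = paddle.optimizer.SGD(parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

with paddle.amp.auto_cast():
    loss = model(paddle.randn([4, 10])).mean()
scaler.scale(loss).backward()
scaler.unscale_(opt)   # gradients are now in real scale
scaler.step(opt)       # skipped automatically if inf/nan was found
scaler.update()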
class BuildExt(build_ext):

    def build_extensions(self):
        if '-Wstrict-prototypes' in self.compiler.compiler_so:
            self.compiler.compiler_so.remove('-Wstrict-prototypes')
        super(BuildExt, self).build_extensions()


# cc flags
paddle_extra_compile_args = [
    '-std=c++14',
    '-shared',
    '-fPIC',
    '-Wno-parentheses',
    '-DPADDLE_WITH_CUSTOM_KERNEL',
]
if core.is_compiled_with_npu():
    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']

# include path
site_packages_path = site.getsitepackages()
paddle_custom_kernel_include = list(
    map(lambda path: os.path.join(path, 'paddle', 'include'),
        site_packages_path))

# include path third_party
compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'],
                                        'build/third_party')
paddle_custom_kernel_include += [
    os.path.join(compile_third_party_path, 'boost/src/extern_boost'),  # boost
    os.path.join(compile_third_party_path, 'install/gflags/include'),  # gflags
    os.path.join(compile_third_party_path, 'install/glog/include'),  # glog
]
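# A minimal sketch of how a setup.py typically wires the pieces above
# together. The extension name and source file are illustrative only; the
# include dirs, compile args and cmdclass come from the snippet above.
from setuptools import setup, Extension

custom_kernel_module = Extension(
    'custom_kernel_dot',                  # hypothetical module name
    sources=['custom_kernel_dot.cc'],     # hypothetical source file
    include_dirs=paddle_custom_kernel_include,
    extra_compile_args=paddle_extra_compile_args)

setup(
    name='custom_kernel_dot',
    ext_modules=[custom_kernel_module],
    cmdclass={'build_ext': BuildExt})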
def _init_communicator(self,
                       program,
                       current_endpoint,
                       endpoints,
                       rank,
                       ring_id,
                       wait_port,
                       global_ring_id=None,
                       sync=True):
    # if current_endpoint is None, it means just for sync,
    # no group is created.
    if current_endpoint:
        nranks = len(endpoints)
        other_endpoints = endpoints[:]
        other_endpoints.remove(current_endpoint)
        if rank == 0 and wait_port:
            wait_server_ready(other_endpoints)

    def _add_sync_by_allreduce(block):
        sync_var = block.create_var(
            name=unique_name.generate('sync_var'),
            dtype=core.VarDesc.VarType.INT32,
            persistable=False,
            stop_gradient=True)
        block.append_op(
            type='fill_constant',
            inputs={},
            outputs={'Out': [sync_var]},
            attrs={
                'shape': [1],
                'dtype': sync_var.dtype,
                'value': 1,
                'force_cpu': False,
                OP_ROLE_KEY: OpRole.Forward
            })
        block.append_op(
            type='c_allreduce_sum',
            inputs={'X': [sync_var]},
            outputs={'Out': [sync_var]},
            attrs={
                'ring_id': global_ring_id,
                'use_calc_stream': True,
                OP_ROLE_KEY: OpRole.Forward
            })
        block.append_op(
            type='c_sync_calc_stream',
            inputs={'X': sync_var},
            outputs={'Out': sync_var},
            attrs={OP_ROLE_KEY: OpRole.Forward})

    block = program.global_block()
    if current_endpoint is None:
        assert endpoints is None
        assert sync
        _add_sync_by_allreduce(block)
        return

    comm_id_var = block.create_var(
        name=unique_name.generate('comm_id'),
        persistable=True,
        type=core.VarDesc.VarType.RAW)
    if core.is_compiled_with_cuda():
        block.append_op(
            type='c_gen_nccl_id',
            inputs={},
            outputs={'Out': comm_id_var},
            attrs={
                'rank': rank,
                'endpoint': current_endpoint,
                'other_endpoints': other_endpoints,
                'ring_id': ring_id,
                OP_ROLE_KEY: OpRole.Forward
            })
        block.append_op(
            type='c_comm_init',
            inputs={'X': comm_id_var},
            outputs={},
            attrs={
                'nranks': nranks,
                'rank': rank,
                'ring_id': ring_id,
                OP_ROLE_KEY: OpRole.Forward
            })
    elif core.is_compiled_with_xpu():
        block.append_op(
            type='c_gen_bkcl_id',
            inputs={},
            outputs={'Out': comm_id_var},
            attrs={
                'rank': rank,
                'endpoint': current_endpoint,
                'other_endpoints': other_endpoints,
                'ring_id': ring_id,
                OP_ROLE_KEY: OpRole.Forward
            })
        block.append_op(
            type='c_comm_init',
            inputs={'X': comm_id_var},
            outputs={},
            attrs={
                'nranks': nranks,
                'rank': rank,
                'ring_id': ring_id,
                OP_ROLE_KEY: OpRole.Forward
            })
    elif core.is_compiled_with_npu():
        block.append_op(
            type='c_gen_hccl_id',
            inputs={},
            outputs={'Out': comm_id_var},
            attrs={
                'rank': rank,
                'endpoint': current_endpoint,
                'other_endpoints': other_endpoints,
                'ring_id': ring_id,
                OP_ROLE_KEY: OpRole.Forward
            })
        block.append_op(
            type='c_comm_init_hccl',
            inputs={'X': comm_id_var},
            outputs={},
            attrs={
                'rank': rank,
                'ring_id': ring_id,
                'device_id': int(os.getenv("FLAGS_selected_npus")),
                'rank_ids': nranks,
                OP_ROLE_KEY: OpRole.Forward
            })
    else:
        raise ValueError(
            "comm_id must be generated in paddlepaddle-gpu, "
            "paddlepaddle-xpu or paddlepaddle-npu.")
    if sync:
        _add_sync_by_allreduce(block)