def test_cases(self):
    places = [core.CPUPlace()]
    if core.is_compiled_with_mlu():
        places.append(core.MLUPlace(0))
    for place in places:
        self.run_dygraph(place)
        self.run_static(place)
def test_check_output(self):
    places = []
    if core.is_compiled_with_mlu():
        places.append(core.MLUPlace(0))
    for place in places:
        for data_format in ["NCHW", "NHWC"]:
            self.check_with_place(place, data_format, self.dtype,
                                  [2, 3, 4, 5])
            self.check_with_place(place, data_format, self.dtype, [2, 3])
def MLUPlace(dev_id):
    """
    Return a Cambricon MLU Place.

    Parameters:
        dev_id(int): MLU device id.

    Examples:
        .. code-block:: python

            # required: mlu
            import paddle
            place = paddle.device.MLUPlace(0)
    """
    return core.MLUPlace(dev_id)
def _convert_to_place(device):
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError("The device should not be 'gpu', "
                             "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    elif lower_device == 'xpu':
        if not core.is_compiled_with_xpu():
            raise ValueError("The device should not be 'xpu', "
                             "since PaddlePaddle is not compiled with XPU")
        selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
        device_id = int(selected_xpus[0])
        place = core.XPUPlace(device_id)
    elif lower_device == 'npu':
        if not core.is_compiled_with_npu():
            raise ValueError("The device should not be 'npu', "
                             "since PaddlePaddle is not compiled with NPU")
        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
        device_id = int(selected_npus[0])
        place = core.NPUPlace(device_id)
    elif lower_device == 'ipu':
        if not core.is_compiled_with_ipu():
            raise ValueError("The device should not be 'ipu', "
                             "since PaddlePaddle is not compiled with IPU")
        place = core.IPUPlace()
    elif lower_device == 'mlu':
        if not core.is_compiled_with_mlu():
            raise ValueError("The device should not be 'mlu', "
                             "since PaddlePaddle is not compiled with MLU")
        selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
        device_id = int(selected_mlus[0])
        place = core.MLUPlace(device_id)
    elif device in core.get_all_custom_device_type():
        place = core.CustomPlace(device, 0)
    else:
        available_gpu_device = re.match(r'gpu:\d+', lower_device)
        available_xpu_device = re.match(r'xpu:\d+', lower_device)
        available_npu_device = re.match(r'npu:\d+', lower_device)
        available_mlu_device = re.match(r'mlu:\d+', lower_device)
        if not available_gpu_device and not available_xpu_device \
                and not available_npu_device and not available_mlu_device:
            device_info_list = device.split(':', 1)
            device_type = device_info_list[0]
            if device_type in core.get_all_custom_device_type():
                device_id = int(device_info_list[1])
                place = core.CustomPlace(device_type, device_id)
            else:
                raise ValueError(
                    "The device must be a string which is like 'cpu', {}".
                    format(', '.join("'{}', '{}:x'".format(x, x)
                                     for x in ['gpu', 'xpu', 'npu', 'mlu'] +
                                     core.get_all_custom_device_type())))
        if available_gpu_device:
            if not core.is_compiled_with_cuda():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with CUDA".format(available_gpu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.CUDAPlace(device_id)
        if available_xpu_device:
            if not core.is_compiled_with_xpu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with XPU".format(available_xpu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.XPUPlace(device_id)
        if available_npu_device:
            if not core.is_compiled_with_npu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with NPU".format(available_npu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.NPUPlace(device_id)
        if available_mlu_device:
            if not core.is_compiled_with_mlu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with MLU".format(available_mlu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.MLUPlace(device_id)

    return place
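# Hedged usage sketch (hypothetical helper, not part of the original source):
# shows the device strings `_convert_to_place` accepts, following directly
# from the branches above. Plain names select a default device; 'name:id'
# forms select a specific one and raise ValueError if that backend is not
# compiled in.
def _demo_convert_to_place():
    print(_convert_to_place('cpu'))        # always available -> CPUPlace
    if core.is_compiled_with_mlu():
        print(_convert_to_place('mlu'))    # first selected MLU device
        print(_convert_to_place('mlu:0'))  # explicit device id -> MLUPlace(0)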
def test_scale_selected_rows_inplace(self):
    place = core.MLUPlace(0)
    self.check_with_place(place, 'in', 'in')
def test_scale_selected_rows_inplace(self):
    places = [core.CPUPlace()]
    if core.is_compiled_with_mlu():
        places.append(core.MLUPlace(0))
    for place in places:
        self.check_with_place(place, 'in', 'in')
def test_check_output(self):
    place = core.MLUPlace(0)
    self.check_output_with_place(place, atol=1e-3)
def init_parallel_env():
    """
    Initialize the parallel training environment in dynamic graph mode.

    .. note::
        Now initialize both ``NCCL`` and ``GLOO`` contexts for communication.

        The backend used by DataParallel is read from the
        ``PADDLE_DISTRI_BACKEND`` environment variable (the function itself
        takes no arguments) and should be one of 'gloo' (for cpu),
        'nccl' (for cuda), 'bkcl' (for xpu), or 'auto' (auto detect).
        Auto detection prefers 'nccl' and 'bkcl' over 'gloo'.

    Returns:
        None

    Examples:
        .. code-block:: python

            # required: gpu
            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)

                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)

                loss.backward()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """

    # 0. get env & check world size
    global _global_parallel_env
    # when calling init_parallel_env, `_global_parallel_env` must be updated
    _global_parallel_env = ParallelEnv()
    parallel_env = _global_parallel_env
    # if not parallel, `init_parallel_env` does nothing
    if parallel_env.world_size < 2:
        warnings.warn(
            "Currently not a parallel execution environment, "
            "`paddle.distributed.init_parallel_env` will not do anything.")
        return

    # NOTE(xiongkun): supports cpu gloo only; set this environment variable
    # to enable cpu-only gloo parallel training
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. device check: the run must be cpu-only (gloo) or on a compiled
    # device backend (CUDA/XPU/NPU/MLU)
    if not (is_cpu_only or core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()):
        raise NotImplementedError(
            "If you want to use the CPU-only version, please use 'gloo' as backend"
        )

    if not is_cpu_only and core.is_compiled_with_cuda():
        _check_var_exists("FLAGS_selected_gpus")
        backend = "nccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_xpu():
        _check_var_exists('FLAGS_selected_xpus')
        backend = "bkcl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_npu():
        _check_var_exists('FLAGS_selected_npus')
        backend = "hccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_mlu():
        _check_var_exists('FLAGS_selected_mlus')
        backend = "cncl" if backend == "auto" else backend

    _check_var_exists("PADDLE_TRAINER_ID")
    _check_var_exists("PADDLE_CURRENT_ENDPOINT")
    _check_var_exists("PADDLE_TRAINERS_NUM")
    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")

    # NOTE(chenweihang): [ why config global place here? ]
    # dygraph mode will be set to the default mode, and users will not call
    # `dygraph.guard` or `enable_dygraph` directly. If they want to switch
    # the default place, they need to call a function to change it; here we
    # just set the correct place for users.
    if is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():
        place = core.CUDAPlace(parallel_env.device_id)
    elif core.is_compiled_with_xpu():
        place = core.XPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_npu():
        place = core.NPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_mlu():
        place = core.MLUPlace(parallel_env.device_id)

    _set_expected_place(place)

    group = None
    if backend in _valid_backend_list and in_dygraph_mode():
        if _default_group_name in _get_group_map_by_name():
            return _get_group_map_by_name()[_default_group_name]
        _set_default_backend(backend)
        rank = int(os.getenv("PADDLE_TRAINER_ID"))
        world_size = int(os.getenv("PADDLE_TRAINERS_NUM"))
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group.")
        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = ":".join([master_addr, master_port
                              ]) if master_addr and master_port else None
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:
            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0]
        assert endpoints, (
            "The environment variables 'MASTER_ADDR' and 'MASTER_PORT' "
            "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
            "and 'export MASTER_PORT=54612'. Or you can start your training "
            "with the paddle.distributed.run module.")
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(master_addr,
                                      master_port,
                                      is_master,
                                      world_size,
                                      stop_check_timeout=stop_check_timeout)
        _set_default_store(default_store)
        pg = _new_process_group_impl(backend,
                                     default_store,
                                     rank,
                                     world_size,
                                     _default_group_name,
                                     pg_options=None)
        ranks = list(range(world_size))
        group = Group(rank,
                      world_size,
                      id=0,
                      ranks=ranks,
                      pg=pg,
                      name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)
        _set_group_map(0, group)
        parallel_helper._set_parallel_ctx(True)
        paddle.distributed.barrier(group=group)
        return group

    node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
    # 3: init gloo context (step 1: http server start)
    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
    if is_cpu_only or init_gloo or backend == "heter":
        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
        manager = Manager()
        # global dict to store status
        http_server_d = manager.dict()
        http_server_d["running"] = False
        if parallel_env.rank == 0:
            # The scope for the worker used by the http server is '_worker'
            size = {'_worker': parallel_env.world_size}
            if backend == "heter":
                size = {'_worker': len(node_num)}
            http_server = Process(target=_start_kv_server,
                                  args=(int(ep_rank_0[1]), http_server_d,
                                        size))
            http_server.daemon = True
            http_server_d["running"] = True
            http_server.start()

    # 4. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = parallel_env.world_size
    strategy.local_rank = parallel_env.rank
    strategy.trainer_endpoints = parallel_env.trainer_endpoints
    strategy.current_endpoint = parallel_env.current_endpoint
    strategy.nrings = parallel_env.nrings

    # init nccl or hccl or bkcl or heter context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place))
    elif (backend == "heter"):
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id))
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place))
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place))
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place))
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place))

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
        other_endpoints.remove(strategy.current_endpoint)
        if not is_cpu_only and strategy.local_rank == 0:
            wait_server_ready(other_endpoints)

    parallel_helper._init_parallel_ctx()

    # 5: init gloo context (step 2: gloo init)
    # gloo initialization is divided into two parts because nccl and gloo
    # separately look for free ports, which sometimes leads to port conflicts.
    if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
        # Compared to init_gloo, we don't need to init gloo here,
        # because that is done in _init_parallel_ctx.
        http_server_d["running"] = False
        http_server.join()
    elif init_gloo:
        wait_server_ready([parallel_env.trainer_endpoints[0]])
        gloo_strategy = core.GlooParallelStrategy()
        gloo_strategy.rank = parallel_env.rank
        gloo_strategy.rank_num = parallel_env.world_size
        gloo_strategy.ip_address = ep_rank_0[0]
        gloo_strategy.ip_port = int(ep_rank_0[1])
        default_init_timeout_seconds = 3600
        default_run_timeout_seconds = 9999999
        gloo_strategy.init_seconds = default_init_timeout_seconds
        gloo_strategy.run_seconds = default_run_timeout_seconds
        gloo = core.GlooParallelContext(gloo_strategy)
        gloo.init()
        if parallel_env.rank == 0:
            http_server_d["running"] = False
            http_server.join()
    return group
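# Hedged usage sketch (hypothetical helper, not part of the original source):
# since the backend is read from the PADDLE_DISTRI_BACKEND environment
# variable (see above), a CPU-only gloo run can be requested before spawning
# workers. `train` is assumed to be a function like the one in the docstring
# example, which calls init_parallel_env() in each worker.
def _demo_cpu_only_launch():
    import os
    import paddle.distributed as dist

    os.environ['PADDLE_DISTRI_BACKEND'] = 'gloo'  # force the CPU-only backend
    dist.spawn(train, nprocs=2)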
def test_forward_backward(self):

    def test_with_place(place, data_layout, shape):
        # attr
        epsilon = self.epsilon
        momentum = self.momentum
        if data_layout == "NCHW":
            n, c, h, w = shape[0], shape[1], shape[2], shape[3]
        else:
            n, h, w, c = shape[0], shape[1], shape[2], shape[3]
        scale_shape = [c]

        np.random.seed(123)
        x = np.random.random_sample(shape).astype(np.float32)
        scale = np.random.random_sample(scale_shape).astype(np.float32)
        bias = np.random.random_sample(scale_shape).astype(np.float32)
        mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
        y_grad = np.random.random_sample(shape).astype(np.float32)
        momentum_var = np.array([momentum]).astype(np.float32)

        y, mean_out, variance_out, saved_mean, saved_variance, x_grad, \
            scale_grad, bias_grad = self.ref_forward_backward(
                x, y_grad, scale, bias, mean, variance, epsilon, momentum,
                shape, data_layout)

        var_dict = locals()
        var_dict['y@GRAD'] = y_grad
        var_dict['x@GRAD'] = x_grad
        var_dict['scale@GRAD'] = scale_grad
        var_dict['bias@GRAD'] = bias_grad

        var_names = [
            'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
            'saved_variance', 'momentum_var'
        ]
        ground_truth = {name: var_dict[name] for name in var_names}

        program = fluid.Program()
        with fluid.program_guard(program):
            block = program.global_block()
            for name in ground_truth:
                block.create_var(name=name,
                                 dtype='float32',
                                 shape=ground_truth[name].shape)
            inputs = {
                "X": block.var('x'),
                "Scale": block.var('scale'),
                "Bias": block.var('bias'),
                "Mean": block.var('mean'),
                "Variance": block.var('variance')
            }
            attrs = {
                "epsilon": epsilon,
                "is_test": False,
                "data_layout": data_layout,
                "use_mkldnn": False,
                "fuse_with_relu": self.fuse_with_relu,
                "use_global_stats": self.use_global_stats
            }
            if self.use_momentum_variable:
                inputs['MomentumTensor'] = block.var('momentum_var')
            else:
                attrs['momentum'] = momentum

            outputs = {
                "Y": block.var('y'),
                "MeanOut": block.var('mean'),  # share memory
                "VarianceOut": block.var('variance'),  # share memory
                "SavedMean": block.var('saved_mean'),
                "SavedVariance": block.var('saved_variance')
            }
            block.create_var(name="reserve_space", dtype='float32')
            outputs["ReserveSpace"] = block.var('reserve_space')
            bn_op = block.append_op(type="batch_norm",
                                    inputs=inputs,
                                    outputs=outputs,
                                    attrs=attrs)
            block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)

            # generate backward op_desc
            grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
                bn_op.desc, self.no_grad_set, [])
            grad_op_desc = grad_op_desc_list[0]
            new_op_desc = block.desc.append_op()
            new_op_desc.copy_from(grad_op_desc)
            for var_name in grad_op_desc.output_arg_names():
                block.desc.var(var_name.encode("ascii"))
            grad_op_desc.infer_var_type(block.desc)
            grad_op_desc.infer_shape(block.desc)
            for arg in grad_op_desc.output_arg_names():
                grad_var = block.desc.find_var(arg.encode("ascii"))
                grad_var.set_dtype(core.VarDesc.VarType.FP32)

            program._sync_with_cpp()

            exe = fluid.Executor(place)
            out = exe.run(program,
                          feed={
                              name: var_dict[name]
                              for name in [
                                  'x', 'scale', 'bias', 'mean', 'variance',
                                  'y@GRAD', 'momentum_var'
                              ]
                          },
                          fetch_list=self.fetch_list)

        for id, name in enumerate(self.fetch_list):
            if name == 'variance':
                self.__assert_close(var_dict[name],
                                    out[id],
                                    name,
                                    atol=1e-3)
                continue
            self.__assert_close(var_dict[name], out[id], name)
        print("op test forward passed: ", str(place), data_layout)

    places = [core.CPUPlace()]
    if core.is_compiled_with_mlu():
        places.append(core.MLUPlace(0))

    for place in places:
        for data_format in self.data_formats:
            test_with_place(place, data_format, [2, 3, 4, 5])
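# Hedged sketch (not part of the original source): the training-mode forward
# math that a reference like `ref_forward_backward` presumably checks, written
# in NumPy for the NHWC layout. The momentum-update convention for the running
# statistics is an assumption, not confirmed by this file.
import numpy as np


def batch_norm_forward_nhwc(x, scale, bias, mean, variance, epsilon, momentum):
    # per-channel batch statistics over the N, H, W axes
    batch_mean = x.mean(axis=(0, 1, 2))
    batch_var = x.var(axis=(0, 1, 2))
    # normalize, then apply the per-channel affine transform
    y = scale * (x - batch_mean) / np.sqrt(batch_var + epsilon) + bias
    # running statistics blended with momentum (assumed convention)
    mean_out = mean * momentum + batch_mean * (1. - momentum)
    variance_out = variance * momentum + batch_var * (1. - momentum)
    return y, mean_out, variance_out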
def get_places(self):
    places = [core.CPUPlace()]
    if core.is_compiled_with_mlu():
        places.append(core.MLUPlace(0))
    return places
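# Hedged usage sketch (hypothetical test, not part of the original source):
# how a case might iterate over the places returned by `get_places`, matching
# the CPU-plus-MLU pattern used by the other tests in this section.
def test_check_output_over_places(self):
    for place in self.get_places():
        self.check_output_with_place(place, atol=1e-3)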