def _get_size_helper(group, backend):
    """
    Helper for getting the rank size.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If the backend is invalid.

    Returns:
        Integer. The rank size of the specified group.
    """
    size = None
    if _is_role_pserver() or _is_role_sched():
        size = 1
        return size
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            size = hccl.get_rank_size()
        else:
            size = hccl.get_rank_size(group)
    elif backend == Backend.NCCL:
        size = mpi.get_rank_size(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return size
def step_end(self, run_context):
    """
    Save the checkpoint at the end of a step.

    Args:
        run_context (RunContext): Context of the training run.
    """
    if _is_role_pserver():
        self._prefix = "PServer_" + str(_get_ps_mode_rank()) + "_" + self._prefix
    cb_params = run_context.original_args()
    _make_directory(self._directory)
    # save graph (only once)
    if not self._graph_saved:
        graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
        if os.path.isfile(graph_file_name) and context.get_context("mode") == context.GRAPH_MODE:
            os.remove(graph_file_name)
        _save_graph(cb_params.train_network, graph_file_name)
        self._graph_saved = True
    # wait for any in-flight asynchronous checkpoint save to finish
    thread_list = threading.enumerate()
    for thread in thread_list:
        if thread.getName() == "asyn_save_ckpt":
            thread.join()
    self._save_ckpt(cb_params)
def wrapper(*args, **kargs):
    if _is_role_pserver() or _is_role_sched():
        return func(*args, **kargs)
    if not GlobalComm.INITED:
        raise RuntimeError("Distributed Communication has not been inited")
    group = None
    backend = None  # stays None when the caller does not pass a backend
    if "group" in kargs.keys():
        group = kargs.get("group")
        if group is not None and not isinstance(group, str):
            raise TypeError("Group should be str or None, "
                            "but got group {}".format(type(group)))
    if "backend" in kargs.keys():
        backend = kargs.get("backend")
        if backend is Backend.HCCL and not is_hccl_available():
            raise RuntimeError("Distributed Communication doesn't have HCCL built in")
        if backend is Backend.NCCL and not is_nccl_available():
            raise RuntimeError("Distributed Communication doesn't have NCCL built in")
    if group is None:
        # without the `backend = None` initialization above, `backend` would be
        # unbound here whenever the caller omits it
        if backend is Backend.HCCL:
            group = HCCL_WORLD_COMM_GROUP
        elif backend is Backend.NCCL:
            group = NCCL_WORLD_COMM_GROUP
    return func(*args, **kargs)
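# --- Illustrative sketch (not from the source above) ---
# The `wrapper` above is the inner function of a decorator that validates
# `group`/`backend` arguments before dispatching to a communication API. A
# minimal, runnable sketch of that pattern follows; the decorator name
# `check_parameter_available` and the trivial `get_rank` body are assumptions
# for illustration only.
import functools

def check_parameter_available(func):
    """Sketch: validate the ``group`` keyword argument before calling ``func``."""
    @functools.wraps(func)
    def wrapper(*args, **kargs):
        group = kargs.get("group")
        if group is not None and not isinstance(group, str):
            raise TypeError("Group should be str or None, "
                            "but got group {}".format(type(group)))
        return func(*args, **kargs)
    return wrapper

@check_parameter_available
def get_rank(group=None, backend=None):
    return 0  # placeholder body; a real API would query the backend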
def _get_rank_helper(group, backend):
    """
    Helper for getting the rank id.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If the backend is invalid.

    Returns:
        Integer. The local rank id of the calling process.
    """
    rank_id = None
    if _is_role_pserver() or _is_role_sched():
        rank_id = 0
        return rank_id
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            rank_id = hccl.get_rank_id()
        else:
            rank_id = hccl.get_rank_id(group)
    elif backend == Backend.NCCL:
        rank_id = mpi.get_rank_id(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return rank_id
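# --- Illustrative usage (assumption, not from the source above) ---
# These private helpers back the public get_rank()/get_group_size() APIs.
# After a successful init() on Ascend, direct calls would look roughly like:
#
#     rank = _get_rank_helper(group=HCCL_WORLD_COMM_GROUP, backend=Backend.HCCL)
#     size = _get_size_helper(group=HCCL_WORLD_COMM_GROUP, backend=Backend.HCCL)
#
# On a parameter server or scheduler role both short-circuit (rank 0, size 1),
# so PS-mode processes behave as a standalone single rank.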
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. HCCL/NCCL; it is required before
    the communication service can be used.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.
        This method should be used after set_context.

    Args:
        backend_name (str): Backend, using HCCL/NCCL. If not set, it is
            inferred from device_target. Default: None.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If the device target is invalid, the backend is invalid,
            distributed initialization fails, or the environment variables
            RANK_ID/MINDSPORE_HCCL_CONFIG_PATH have not been exported when the
            backend is HCCL.
        ValueError: If the environment variable RANK_ID has not been exported
            as a number.

    Examples:
        >>> from mindspore.context import set_context
        >>> set_context(device_target="Ascend")
        >>> init()
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError("Device target {} is not supported in parallel initialization, "
                               "please use Ascend or GPU.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError("Device target should be 'Ascend' to init hccl, "
                               "but got {}".format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError("Backend name {} is not supported.".format(backend_name))
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. HCCL/NCCL; it is required before
    the communication service can be used.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): Backend. If not set, it is inferred from
            device_target.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If the device target is invalid, the backend is invalid,
            or distributed initialization fails.
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError("Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError("Device target should be 'Ascend' to init hccl, "
                               "but got {}".format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError("Backend name {} is not supported.".format(backend_name))
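# --- Hedged usage sketch for the NCCL branch (GPU) ---
# The earlier variant's docstring shows the Ascend/HCCL example; the GPU path
# would look roughly as follows. This assumes the script is launched under
# mpirun/OpenMPI so NCCL initialization can discover the other processes.
from mindspore import context
from mindspore.communication.management import init, get_rank, get_group_size

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init("nccl")  # or init(), which infers "nccl" from device_target == "GPU"
print("rank {} of {}".format(get_rank(), get_group_size()))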
def _save_ckpt(self, cb_params, force_to_save=False):
    """Save checkpoint files."""
    if cb_params.cur_step_num == self._last_triggered_step:
        return

    save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
    step_num_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1

    if save_ckpt:
        cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
            + str(step_num_in_epoch) + ".ckpt"
        if _is_role_pserver():
            cur_ckpoint_file = "PServer_" + str(_get_ps_mode_rank()) + "_" + cur_ckpoint_file
        # update the checkpoint file list.
        self._manager.update_ckpoint_filelist(self._directory, self._prefix)
        # keep the number of checkpoint files within the configured maximum.
        if self._config.keep_checkpoint_max and 0 < self._config.keep_checkpoint_max <= self._manager.ckpoint_num:
            self._manager.remove_oldest_ckpoint_file()
        elif self._config.keep_checkpoint_per_n_minutes and self._config.keep_checkpoint_per_n_minutes > 0:
            self._cur_time_for_keep = time.time()
            if (self._cur_time_for_keep - self._last_time_for_keep) \
                    < self._config.keep_checkpoint_per_n_minutes * 60:
                self._manager.keep_one_ckpoint_per_minutes(self._config.keep_checkpoint_per_n_minutes,
                                                           self._cur_time_for_keep)

        # generate the new checkpoint file and rename it.
        global _save_dir
        _save_dir = self._directory
        cur_file = os.path.join(self._directory, cur_ckpoint_file)
        self._last_time_for_keep = time.time()
        self._last_triggered_step = cb_params.cur_step_num

        if context.get_context("enable_ge"):
            set_cur_net(cb_params.train_network)
            cb_params.train_network.exec_checkpoint_graph()

        save_checkpoint(cb_params.train_network, cur_file, self._config.integrated_save,
                        self._config.async_save)

        self._latest_ckpt_file_name = cur_file
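# --- Standalone sketch of the retention policy above (not MindSpore API) ---
# `keep_checkpoint_max` bounds the number of checkpoint files by evicting the
# oldest one; this self-contained toy mirrors that behavior.
import collections

class _CkptRetentionSketch:
    def __init__(self, keep_max):
        self.keep_max = keep_max
        self.files = collections.deque()  # oldest file sits at the left end

    def add(self, name):
        self.files.append(name)
        while len(self.files) > self.keep_max:
            print("removing oldest checkpoint:", self.files.popleft())

mgr = _CkptRetentionSketch(keep_max=2)
for step in (1, 2, 3):
    mgr.add("lenet-1_{}.ckpt".format(step))  # third add evicts the first file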
def step_end(self, run_context):
    """
    Save the checkpoint at the end of a step.

    Args:
        run_context (RunContext): Context of the training run.
    """
    if _is_role_pserver():
        self._prefix = "PServer_" + str(_get_ps_mode_rank()) + "_" + self._prefix
    cb_params = run_context.original_args()
    # save graph (only once)
    if not self._graph_saved:
        graph_file_name = os.path.join(self._directory, self._prefix + '-graph.meta')
        _save_graph(cb_params.train_network, graph_file_name)
        self._graph_saved = True
    self._save_ckpt(cb_params)
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. hccl/nccl; it is required before
    the communication service can be used.

    Note:
        The full name of hccl is Huawei Collective Communication Library.
        The full name of nccl is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): Backend.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If the device target is invalid.
        RuntimeError: If the backend is invalid or distributed initialization fails.
    """
    if _is_role_pserver() or _is_role_sched():
        return
    if backend_name is None:
        device_target = context.get_context("device_target")
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError("Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(type(backend_name)))

    if backend_name == "hccl":
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
    else:
        raise RuntimeError("Backend name {} is not supported.".format(backend_name))
def do_sparse_embedding(ps=False):
    epoch = 10
    net = LeNet5(10)
    if ps:
        net.embedding.embedding_table.set_param_ps()

    optimizer = Adam(filter(lambda x: x.requires_grad, net.get_parameters()))
    optimizer.target = 'CPU'
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_with_criterion = WithLossCell(net, criterion)
    train_network = TrainOneStepCell(net_with_criterion, optimizer)
    train_network.set_train()

    losses = []
    for _ in range(epoch):
        data = Tensor(np.random.randint(0, 15, (32, 3), np.int32))
        label = Tensor(np.random.randint(0, 9, (32), np.int32))
        if _is_role_pserver():
            train_network(data, label)
            sys.exit()
        else:
            loss = train_network(data, label).asnumpy()
            losses.append(loss)
    print(losses)
    return losses
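# --- Hedged sketch of how a test might drive do_sparse_embedding ---
# A typical check runs the function once without the parameter server and once
# with it, then compares the loss curves. How PS mode is toggled (environment
# variables such as MS_ROLE, or context.set_ps_context) depends on the
# MindSpore version; the bare ps=True/False toggle below is an assumption.
np.random.seed(0)
no_ps_losses = do_sparse_embedding()
np.random.seed(0)
ps_losses = do_sparse_embedding(ps=True)
assert np.allclose(no_ps_losses, ps_losses, rtol=1.0e-6, atol=1.0e-6)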
def get_full_batch(self):
    """Get whether the full batch is loaded on each device."""
    self.check_context_handle()
    if _is_role_pserver():
        return False
    return self._context_handle.get_full_batch()
def get_parallel_mode(self):
    """Get parallel mode."""
    self.check_context_handle()
    if _is_role_pserver():
        return context.ParallelMode.STAND_ALONE
    return self._context_handle.get_parallel_mode()
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x


if __name__ == "__main__":
    epoch = 5
    np.random.seed(0)
    network = LeNet5(10)
    network.set_param_ps()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    # device_target is assumed to be defined earlier in the original script
    # (e.g. read from the command line); only the GPU branch needs it here.
    if device_target == "GPU":
        context.set_auto_parallel_context(parallel_mode="data_parallel",
                                          gradients_mean=True,
                                          device_num=get_group_size())
    net_with_criterion = WithLossCell(network, criterion)
    train_network = TrainOneStepCell(net_with_criterion, net_opt)
    train_network.set_train()

    losses = []
    for _ in range(epoch):
        data = Tensor(np.random.rand(32, 3, 32, 32).astype(np.float32))
        label = Tensor(np.random.randint(0, 9, (32)).astype(np.int32))
        if _is_role_pserver():
            train_network(data, label)
            sys.exit()
        else:
            loss = train_network(data, label).asnumpy()
            losses.append(loss)
    print(losses)