Example 1
def _get_size_helper(group, backend):
    """
    Helper for get_rank_size.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If backend is invalid.

    Returns:
        Integer. The rank size of the specified group.
    """
    size = None
    if _is_role_pserver() or _is_role_sched():
        size = 1
        return size
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            size = hccl.get_rank_size()
        else:
            size = hccl.get_rank_size(group)
    elif backend == Backend.NCCL:
        size = mpi.get_rank_size(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return size
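
For context, a minimal sketch of how a public wrapper might delegate to this helper. The wiring below is an assumption; `get_group_size` and `GlobalComm` are named after the public MindSpore API visible in the other examples:

def get_group_size(group=GlobalComm.WORLD_COMM_GROUP):
    # Hypothetical wrapper: resolve the configured backend, then delegate
    # to the private helper above.
    return _get_size_helper(group=group, backend=GlobalComm.BACKEND)
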
Example 2
    def step_end(self, run_context):
        """
        Save the checkpoint at the end of the step.

        Args:
            run_context (RunContext): Context of the training run.
        """
        if _is_role_pserver():
            self._prefix = "PServer_" + str(
                _get_ps_mode_rank()) + "_" + self._prefix
        cb_params = run_context.original_args()
        _make_directory(self._directory)
        # save graph (only once)
        if not self._graph_saved:
            graph_file_name = os.path.join(self._directory,
                                           self._prefix + '-graph.meta')
            if os.path.isfile(graph_file_name) and context.get_context(
                    "mode") == context.GRAPH_MODE:
                os.remove(graph_file_name)
            _save_graph(cb_params.train_network, graph_file_name)
            self._graph_saved = True
        thread_list = threading.enumerate()
        for thread in thread_list:
            if thread.getName() == "asyn_save_ckpt":
                thread.join()
        self._save_ckpt(cb_params)
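
As a usage sketch, this callback is normally created through `ModelCheckpoint` and handed to `Model.train`; `model`, `epoch`, and `dataset` are placeholders assumed to be defined elsewhere:

from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

# Save every 100 steps and keep at most 5 checkpoint files.
config = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=5)
ckpt_cb = ModelCheckpoint(prefix="lenet", directory="./ckpt", config=config)
model.train(epoch, dataset, callbacks=[ckpt_cb])
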
Example 3
    def wrapper(*args, **kargs):
        if _is_role_pserver() or _is_role_sched():
            return func(*args, **kargs)
        if not GlobalComm.INITED:
            raise RuntimeError("Distributed Communication has not been inited")
        group = None
        backend = None  # guard: "backend" may be absent from kargs below
        if "group" in kargs.keys():
            group = kargs.get("group")
            if group is not None and not isinstance(group, str):
                raise TypeError("Group should be str or None, "
                                "but got group {}".format(type(group)))

        if "backend" in kargs.keys():
            backend = kargs.get("backend")
            if backend is Backend.HCCL and not is_hccl_available():
                raise RuntimeError(
                    "Distributed Communication doesn't have HCCL built in")
            if backend is Backend.NCCL and not is_nccl_available():
                raise RuntimeError(
                    "Distributed Communication doesn't have NCCL built in")

        if group is None:
            if backend is Backend.HCCL:
                group = HCCL_WORLD_COMM_GROUP
            elif backend is Backend.NCCL:
                group = NCCL_WORLD_COMM_GROUP
        return func(*args, **kargs)
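
This `wrapper` is the inner function of a validation decorator; a sketch of the likely enclosing definition (the name `check_parameter_available` is an assumption based on how such decorators are usually laid out):

def check_parameter_available(func):
    # Hypothetical enclosing decorator: validate the group/backend keyword
    # arguments, then invoke the wrapped communication API.
    def wrapper(*args, **kargs):
        ...  # body as shown above
    return wrapper
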
Example 4
def _get_rank_helper(group, backend):
    """
    Helper for get_rank_id.

    Args:
        group (str): The communication group.
        backend (str): The backend, like "hccl".

    Raises:
        ValueError: If backend is invalid.

    Returns:
        Integer. The local rank id of the calling process.
    """
    rank_id = None
    if _is_role_pserver() or _is_role_sched():
        rank_id = 0
        return rank_id
    if backend == Backend.HCCL:
        if group == HCCL_WORLD_COMM_GROUP:
            rank_id = hccl.get_rank_id()
        else:
            rank_id = hccl.get_rank_id(group)
    elif backend == Backend.NCCL:
        rank_id = mpi.get_rank_id(group)
    else:
        raise ValueError("Invalid backend: '{}'".format(backend))
    return rank_id
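
A hedged sketch of how the two private helpers combine to shard a workload across processes; `all_items` is a placeholder list assumed to be identical on every process:

rank = _get_rank_helper(HCCL_WORLD_COMM_GROUP, GlobalComm.BACKEND)
size = _get_size_helper(HCCL_WORLD_COMM_GROUP, GlobalComm.BACKEND)
# Hypothetical round-robin sharding: process `rank` handles every
# `size`-th element of the shared work list.
my_items = all_items[rank::size]
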
Example 5
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. HCCL/NCCL; it is required before the communication service can be used.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.
        This method should be used after set_context.

    Args:
        backend_name (str): The backend to use, "hccl" or "nccl". If not set, it is inferred from the device target. Default: None.

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails,
                      or the environment variables RANK_ID/MINDSPORE_HCCL_CONFIG_PATH
                      have not been exported when backend is HCCL.
        ValueError: If the environment variable RANK_ID has not been exported as a number.

    Examples:
        >>> from mindspore.context import set_context
        >>> set_context(device_target="Ascend")
        >>> init()
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported in parallel initialization, "
                "please use Ascend or GPU.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError(
                "Device target should be 'Ascend' to init hccl, but got {}".
                format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))
Example 6
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. HCCL/NCCL; it is required before the communication service can be used.

    Note:
        The full name of HCCL is Huawei Collective Communication Library.
        The full name of NCCL is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): The backend to use, "hccl" or "nccl".

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If device target is invalid, or backend is invalid, or distributed initialization fails.
    """
    if _is_role_pserver() or _is_role_sched():
        return
    device_target = context.get_context("device_target")
    if backend_name is None:
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        if device_target != "Ascend":
            raise RuntimeError(
                "Device target should be 'Ascend' to init hccl, but got {}".
                format(device_target))
        _check_parallel_envs()
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
        GlobalComm.INITED = True
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))
Example 7
    def _save_ckpt(self, cb_params, force_to_save=False):
        """Save checkpoint files."""
        if cb_params.cur_step_num == self._last_triggered_step:
            return

        save_ckpt = self._check_save_ckpt(cb_params, force_to_save)
        step_num_in_epoch = (cb_params.cur_step_num -
                             1) % cb_params.batch_num + 1

        if save_ckpt:
            cur_ckpoint_file = self._prefix + "-" + str(cb_params.cur_epoch_num) + "_" \
                               + str(step_num_in_epoch) + ".ckpt"
            if _is_role_pserver():
                cur_ckpoint_file = "PServer_" + str(
                    _get_ps_mode_rank()) + "_" + cur_ckpoint_file
            # update checkpoint file list.
            self._manager.update_ckpoint_filelist(self._directory,
                                                  self._prefix)
            # keep the number of checkpoint files within the configured maximum.
            if self._config.keep_checkpoint_max and 0 < self._config.keep_checkpoint_max <= self._manager.ckpoint_num:
                self._manager.remove_oldest_ckpoint_file()
            elif self._config.keep_checkpoint_per_n_minutes and self._config.keep_checkpoint_per_n_minutes > 0:
                self._cur_time_for_keep = time.time()
                if (self._cur_time_for_keep - self._last_time_for_keep) \
                        < self._config.keep_checkpoint_per_n_minutes * 60:
                    self._manager.keep_one_ckpoint_per_minutes(
                        self._config.keep_checkpoint_per_n_minutes,
                        self._cur_time_for_keep)

            # generate the new checkpoint file and rename it.
            global _save_dir
            _save_dir = self._directory
            cur_file = os.path.join(self._directory, cur_ckpoint_file)
            self._last_time_for_keep = time.time()
            self._last_triggered_step = cb_params.cur_step_num

            if context.get_context("enable_ge"):
                set_cur_net(cb_params.train_network)
                cb_params.train_network.exec_checkpoint_graph()

            save_checkpoint(cb_params.train_network, cur_file,
                            self._config.integrated_save,
                            self._config.async_save)

            self._latest_ckpt_file_name = cur_file
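
The retention branch above is driven by two `CheckpointConfig` knobs; a short hedged sketch of each:

from mindspore.train.callback import CheckpointConfig

# Keep at most 10 checkpoint files, removing the oldest first...
cfg_by_count = CheckpointConfig(keep_checkpoint_max=10)
# ...or keep roughly one checkpoint per 5 minutes of training time.
cfg_by_time = CheckpointConfig(keep_checkpoint_per_n_minutes=5)
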
Example 8
    def step_end(self, run_context):
        """
        Save the checkpoint at the end of the step.

        Args:
            run_context (RunContext): Context of the training run.
        """
        if _is_role_pserver():
            self._prefix = "PServer_" + str(
                _get_ps_mode_rank()) + "_" + self._prefix
        cb_params = run_context.original_args()
        # save graph (only once)
        if not self._graph_saved:
            graph_file_name = os.path.join(self._directory,
                                           self._prefix + '-graph.meta')
            _save_graph(cb_params.train_network, graph_file_name)
            self._graph_saved = True
        self._save_ckpt(cb_params)
Example 9
def init(backend_name=None):
    """
    Initialize the distributed backend, e.g. hccl/nccl; it is required before the communication service can be used.

    Note:
        The full name of hccl is Huawei Collective Communication Library.
        The full name of nccl is NVIDIA Collective Communication Library.

    Args:
        backend_name (str): The backend to use, "hccl" or "nccl".

    Raises:
        TypeError: If `backend_name` is not a string.
        RuntimeError: If device target is invalid.
        RuntimeError: If backend is invalid or distributed init fails.
    """
    if _is_role_pserver() or _is_role_sched():
        return
    if backend_name is None:
        device_target = context.get_context("device_target")
        if device_target == "Ascend":
            backend_name = "hccl"
        elif device_target == "GPU":
            backend_name = "nccl"
        else:
            raise RuntimeError(
                "Device target {} is not supported.".format(device_target))
    if not isinstance(backend_name, str):
        raise TypeError("Backend name must be a string, but got {}".format(
            type(backend_name)))

    if backend_name == "hccl":
        init_hccl()
        GlobalComm.BACKEND = Backend("hccl")
        GlobalComm.WORLD_COMM_GROUP = HCCL_WORLD_COMM_GROUP
    elif backend_name == "nccl":
        init_gpu_collective()
        GlobalComm.BACKEND = Backend("nccl")
        GlobalComm.WORLD_COMM_GROUP = NCCL_WORLD_COMM_GROUP
    else:
        raise RuntimeError(
            "Backend name {} is not supported.".format(backend_name))
Example 10
def do_sparse_embedding(ps=False):
    epoch = 10
    net = LeNet5(10)
    if ps:
        net.embedding.embedding_table.set_param_ps()

    optimizer = Adam(filter(lambda x: x.requires_grad, net.get_parameters()))
    optimizer.target = 'CPU'
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_with_criterion = WithLossCell(net, criterion)
    train_network = TrainOneStepCell(net_with_criterion, optimizer)
    train_network.set_train()
    losses = []
    for _ in range(epoch):
        data = Tensor(np.random.randint(0, 15, (32, 3), np.int32))
        label = Tensor(np.random.randint(0, 9, (32), np.int32))
        if _is_role_pserver():
            train_network(data, label)
            sys.exit()
        else:
            loss = train_network(data, label).asnumpy()
            losses.append(loss)
    print(losses)
    return losses
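
In parameter-server mode, the role checked by `_is_role_pserver()` is normally selected through environment variables before launch; a hedged sketch with placeholder values (variable names follow MindSpore's PS-mode setup):

import os

# Placeholder topology: one scheduler, one server, one worker.
os.environ["MS_ROLE"] = "MS_WORKER"   # or "MS_PSERVER" / "MS_SCHED"
os.environ["MS_SERVER_NUM"] = "1"
os.environ["MS_WORKER_NUM"] = "1"
os.environ["MS_SCHED_HOST"] = "127.0.0.1"
os.environ["MS_SCHED_PORT"] = "8081"
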
Example 11

    def get_full_batch(self):
        """Get whether the full batch is loaded on each device."""
        self.check_context_handle()
        if _is_role_pserver():
            return False
        return self._context_handle.get_full_batch()
Example 12

    def get_parallel_mode(self):
        """Get parallel mode."""
        self.check_context_handle()
        if _is_role_pserver():
            return context.ParallelMode.STAND_ALONE
        return self._context_handle.get_parallel_mode()
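
These getters back the public auto-parallel context API; a hedged usage sketch through `mindspore.context`:

from mindspore import context

context.set_auto_parallel_context(parallel_mode="data_parallel", full_batch=False)
print(context.get_auto_parallel_context("parallel_mode"))  # "data_parallel"
print(context.get_auto_parallel_context("full_batch"))     # False
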
Example 13
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x


if __name__ == "__main__":
    epoch = 5
    np.random.seed(0)
    network = LeNet5(10)
    network.set_param_ps()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    if device_target == "GPU":
        context.set_auto_parallel_context(parallel_mode="data_parallel", gradients_mean=True,
                                          device_num=get_group_size())
    net_with_criterion = WithLossCell(network, criterion)
    train_network = TrainOneStepCell(net_with_criterion, net_opt)
    train_network.set_train()
    losses = []
    for _ in range(epoch):
        data = Tensor(np.random.rand(32, 3, 32, 32).astype(np.float32))
        label = Tensor(np.random.randint(0, 9, (32)).astype(np.int32))
        if _is_role_pserver():
            train_network(data, label)
            sys.exit()
        else:
            loss = train_network(data, label).asnumpy()
            losses.append(loss)
    print(losses)
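
The excerpt references `device_target` without defining it; a hedged sketch of the preamble the full script presumably contains (argument parsing here is an assumption):

import argparse
from mindspore import context

# Hypothetical preamble: parse the target device and configure the context
# before the training loop above runs.
parser = argparse.ArgumentParser()
parser.add_argument("--device_target", type=str, default="Ascend")
device_target = parser.parse_args().device_target
context.set_context(mode=context.GRAPH_MODE, device_target=device_target)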