Example 1
def check_finite_and_unscale(x, scale, name=None, float_status=None):
    """
    Check whether the input X contains only finite data; if so, scale it by the input Scale.

    $$Out = X / scale$$

    If any tensor in X contains Inf or NaN, FoundInfinite will be 1 (True) and
    Out will not be scaled. In this case, the data of Out should not be used,
    since it may not be deterministic.
    Otherwise, FoundInfinite will be 0 (False).

    Args:
        x(list|tuple): The input tensors of the check_finite_and_unscale operator.
        scale(Tensor): The scale factor used to unscale x.
        name(str, optional): Name for the operation. Default is None.
        float_status(Tensor): (Only used on NPU) The float status used to check overflow.

    Returns:
        x(list|tuple): The checked tensors, unscaled in place when all values are finite.
        found_inf(Tensor): A bool tensor indicating whether any Inf/NaN was found.
    """
    check_type(x, 'x', (tuple, list), 'check_finite_and_unscale')
    for e in x:
        check_variable_and_dtype(e, "x", ['float16', 'float32', 'float64'],
                                 'check_finite_and_unscale')

    helper = LayerHelper("check_finite_and_unscale", **locals())
    found_inf = helper.create_variable_for_type_inference(dtype='bool')

    inputs = {'X': x, 'Scale': scale}
    if core.is_compiled_with_npu():
        check_variable_and_dtype(float_status, "float_status",
                                 ['float16', 'float32'],
                                 'check_finite_and_unscale')
        inputs['FloatStatus'] = float_status
    outputs = {'Out': x, 'FoundInfinite': found_inf}
    helper.append_op(
        type='check_finite_and_unscale', inputs=inputs, outputs=outputs)

    return x, found_inf
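
For context, a minimal usage sketch (not from the source; it assumes a paddle.static program and that check_finite_and_unscale is importable from this module):

# Hypothetical usage sketch: unscale a gradient tensor inside a static program.
import paddle
import paddle.static as static

paddle.enable_static()
main_prog, startup_prog = static.Program(), static.Program()
with static.program_guard(main_prog, startup_prog):
    grad = static.data(name='grad', shape=[4], dtype='float32')
    # scale holds the loss-scaling factor; Out = X / scale when all values are finite
    scale = paddle.full(shape=[1], fill_value=128.0, dtype='float32')
    out, found_inf = check_finite_and_unscale([grad], scale)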
Example 2
    def __init__(self):
        self._rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        self._world_size = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))

        # imperative mode supports only one device (gpu, xpu, npu, or mlu)
        if core.is_compiled_with_cuda():
            selected_gpus = os.getenv("FLAGS_selected_gpus", "0").split(",")
            self._device_id = int(selected_gpus[0])
        elif core.is_compiled_with_xpu():
            selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
            self._device_id = int(selected_xpus[0])
        elif core.is_compiled_with_npu():
            selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
            self._device_id = int(selected_npus[0])
        elif core.is_compiled_with_mlu():
            selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
            self._device_id = int(selected_mlus[0])

        self._trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS",
                                            "").split(",")
        self._current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT", "")
        self._nrings = int(os.getenv("FLAGS_nccl_nrings", "1"))
        assert self._nrings > 0, \
            "nccl_nrings must be an integer greater than 0."
        assert self._nrings < 9, \
            "nccl_nrings should be less than 9, which is enough in most scenarios."
Example 3
def _is_cpuonly(backend):
    check_backend(backend)
    if backend in [
            'auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'
    ] and (core.is_compiled_with_cuda() or core.is_compiled_with_xpu()
           or core.is_compiled_with_npu() or core.is_compiled_with_mlu()):

        # the backend passes the check and an accelerator build is available,
        # so use the default logic and return False
        return False
    else:
        return True
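
A hypothetical illustration of the resulting behavior (assuming a CUDA build of Paddle): accelerator backends resolve to the device path, while 'gloo' is always treated as CPU-only.

# On a CUDA build (assumption for this sketch):
assert _is_cpuonly('nccl') is False   # accelerator backend -> device path
assert _is_cpuonly('gloo') is True    # gloo is CPU-only regardless of build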
Example 4
def is_compiled_with_npu():
    """
    Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU.

    Returns (bool): `True` if NPU is supported, otherwise `False`.

    Examples:
        .. code-block:: python

            import paddle
            support_npu = paddle.device.is_compiled_with_npu()
    """
    return core.is_compiled_with_npu()
Example 5
 def test_case(self):
     import paddle
     if core.is_compiled_with_npu():
         place = core.NPUPlace(0)
     else:
         place = core.CPUPlace()
     with fluid.dygraph.guard(place):
         input_data = np.random.random((2, 3, 6, 6)).astype("float32")
         scale_np = np.array([2, 2]).astype("int64")
         input_x = paddle.to_tensor(input_data)
         scale = paddle.to_tensor(scale_np)
         expect_res = nearest_neighbor_interp_np(input_data,
                                                 out_h=12,
                                                 out_w=12,
                                                 align_corners=False)
         out = interpolate(x=input_x,
                           scale_factor=scale,
                           mode="nearest",
                           align_corners=False)
         self.assertTrue(np.allclose(out.numpy(), expect_res))
Example 6
 def get_places(self):
     places = [core.CPUPlace()]
     if core.is_compiled_with_npu():
         places.append(core.NPUPlace(0))
     return places
Example 7
    def __init__(self,
                 startup_program,
                 main_program,
                 num_mp=1,
                 num_pp=1,
                 micro_batch_size=1,
                 beam_size=1,
                 init_comm=True,
                 role_maker=None):

        assert isinstance(startup_program, Program)
        assert isinstance(main_program, Program)

        self._device = None
        if core.is_compiled_with_npu():
            self._device = "npu"
        elif core.is_compiled_with_cuda():
            self._device = "gpu"
        assert self._device, "Only gpu and npu are supported."
        assert not _non_static_mode(), "Only static mode is supported."

        op_maker = core.op_proto_and_checker_maker
        self._op_role = op_maker.OpRole
        self._op_role_key = op_maker.kOpRoleAttrName()
        self._op_device_key = op_maker.kOpDeviceAttrName()

        self._param_device_map = dict()

        self._pipeline_pair = []
        self._pipeline_pair_in_while = []
        self._pp_ring_map = dict()
        self.ring_id = 20  # Just a magic number

        self.micro_batch_size = micro_batch_size
        self.beam_size = beam_size
        self.init_comm = init_comm

        self._output_var_to_op = None
        self._input_var_to_op = None
        self._main_program = main_program
        self._startup_program = startup_program

        if role_maker is None:
            self.role_maker = fleet.base.role_maker.PaddleCloudRoleMaker(
                is_collective=True)
        else:
            if isinstance(role_maker, fleet.base.role_maker.RoleMakerBase):
                assert role_maker._is_collective == True
                self.role_maker = role_maker

        # communication_group info
        self.mp_ring_id = 0
        self.global_ring_id = 1

        self.endpoints = self.role_maker._get_trainer_endpoints()
        self.current_endpoint = self.endpoints[self.role_maker._worker_index()]
        self.rank = self.role_maker._worker_index()
        self.nranks = self.role_maker._worker_num()
        assert num_mp * num_pp == self.nranks
        self.num_pp = num_pp
        self.num_mp = num_mp

        # global ring info
        self.global_endpoints = self.endpoints
        self.global_rank = self.rank
        self.global_nranks = self.nranks

        arr = np.arange(0, self.num_pp * self.num_mp).reshape(
            [self.num_pp, self.num_mp])
        ipp, imp = np.where(arr == self.rank)
        ipp = ipp[0]
        imp = imp[0]
        self.mp_group = arr[ipp, :]
        self.pp_group = arr[:, imp]

        self._stage = ipp
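
The rank-to-grid mapping above can be illustrated with a small worked example (hypothetical numbers, not from the source):

import numpy as np

# With num_pp=2 pipeline stages and num_mp=4 model-parallel ranks,
# ranks are laid out row-major on a [num_pp, num_mp] grid:
arr = np.arange(0, 2 * 4).reshape([2, 4])
# arr = [[0, 1, 2, 3],
#        [4, 5, 6, 7]]
ipp, imp = np.where(arr == 6)
assert (ipp[0], imp[0]) == (1, 2)     # rank 6 -> pipeline stage 1, mp index 2
# mp_group (row 1): [4, 5, 6, 7]; pp_group (column 2): [2, 6]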
Example 8
def _convert_to_place(device):
    lower_device = device.lower()
    if lower_device == 'cpu':
        place = core.CPUPlace()
    elif lower_device == 'gpu':
        if not core.is_compiled_with_cuda():
            raise ValueError("The device should not be 'gpu', "
                             "since PaddlePaddle is not compiled with CUDA")
        place = core.CUDAPlace(ParallelEnv().dev_id)
    elif lower_device == 'xpu':
        if not core.is_compiled_with_xpu():
            raise ValueError("The device should not be 'xpu', "
                             "since PaddlePaddle is not compiled with XPU")
        selected_xpus = os.getenv("FLAGS_selected_xpus", "0").split(",")
        device_id = int(selected_xpus[0])
        place = core.XPUPlace(device_id)
    elif lower_device == 'npu':
        if not core.is_compiled_with_npu():
            raise ValueError("The device should not be 'npu', "
                             "since PaddlePaddle is not compiled with NPU")
        selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
        device_id = int(selected_npus[0])
        place = core.NPUPlace(device_id)
    elif lower_device == 'ipu':
        if not core.is_compiled_with_ipu():
            raise ValueError(
                "The device should not be 'ipu', " \
                "since PaddlePaddle is not compiled with IPU")
        place = core.IPUPlace()
    elif lower_device == 'mlu':
        if not core.is_compiled_with_mlu():
            raise ValueError("The device should not be 'mlu', "
                             "since PaddlePaddle is not compiled with MLU")
        selected_mlus = os.getenv("FLAGS_selected_mlus", "0").split(",")
        device_id = int(selected_mlus[0])
        place = core.MLUPlace(device_id)
    elif device in core.get_all_custom_device_type():
        place = core.CustomPlace(device, 0)
    else:
        available_gpu_device = re.match(r'gpu:\d+', lower_device)
        available_xpu_device = re.match(r'xpu:\d+', lower_device)
        available_npu_device = re.match(r'npu:\d+', lower_device)
        available_mlu_device = re.match(r'mlu:\d+', lower_device)
        if not available_gpu_device and not available_xpu_device and not available_npu_device and not available_mlu_device:
            device_info_list = device.split(':', 1)
            device_type = device_info_list[0]
            if device_type in core.get_all_custom_device_type():
                device_id = int(device_info_list[1])
                place = core.CustomPlace(device_type, device_id)
            else:
                raise ValueError(
                    "The device must be a string which is like 'cpu', {}".
                    format(', '.join("'{}', '{}:x'".format(x, x)
                                     for x in ['gpu', 'xpu', 'npu', 'mlu'] +
                                     core.get_all_custom_device_type())))
        if available_gpu_device:
            if not core.is_compiled_with_cuda():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with CUDA".format(available_gpu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.CUDAPlace(device_id)
        if available_xpu_device:
            if not core.is_compiled_with_xpu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with XPU".format(available_xpu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.XPUPlace(device_id)
        if available_npu_device:
            if not core.is_compiled_with_npu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with NPU".format(available_npu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.NPUPlace(device_id)
        if available_mlu_device:
            if not core.is_compiled_with_mlu():
                raise ValueError(
                    "The device should not be {}, since PaddlePaddle is "
                    "not compiled with MLU".format(available_mlu_device))
            device_id = int(device.split(':', 1)[1])
            place = core.MLUPlace(device_id)
    return place
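
A brief usage sketch (hypothetical; _convert_to_place appears to be the helper behind paddle.device.set_device):

place = _convert_to_place('cpu')      # -> core.CPUPlace()
place = _convert_to_place('gpu:1')    # -> core.CUDAPlace(1), CUDA builds only
place = _convert_to_place('npu:0')    # -> core.NPUPlace(0), NPU builds only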
Example 9
def init_parallel_env():
    """
    Initialize parallel training environment in dynamic graph mode.

    .. note::
        Currently, both `NCCL` and `GLOO` contexts are initialized for communication.

    Args:
        backend (string): A string representing the backend used by DataParallel;
            should be one of 'gloo' (for cpu), 'nccl' (for cuda), 'bkcl' (for xpu), or 'auto' (auto detect).
            The auto detection prefers 'nccl' and 'bkcl' over 'gloo'.

    Returns:
        None
        
    Examples:
        .. code-block:: python

            # required: gpu
            import paddle
            import paddle.nn as nn
            import paddle.optimizer as opt
            import paddle.distributed as dist

            class LinearNet(nn.Layer):
                def __init__(self):
                    super(LinearNet, self).__init__()
                    self._linear1 = nn.Linear(10, 10)
                    self._linear2 = nn.Linear(10, 1)
                    
                def forward(self, x):
                    return self._linear2(self._linear1(x))

            def train():
                # 1. initialize parallel environment
                dist.init_parallel_env()

                # 2. create data parallel layer & optimizer
                layer = LinearNet()
                dp_layer = paddle.DataParallel(layer)

                loss_fn = nn.MSELoss()
                adam = opt.Adam(
                    learning_rate=0.001, parameters=dp_layer.parameters())

                # 3. run layer
                inputs = paddle.randn([10, 10], 'float32')
                outputs = dp_layer(inputs)
                labels = paddle.randn([10, 1], 'float32')
                loss = loss_fn(outputs, labels)
                
                loss.backward()

                adam.step()
                adam.clear_grad()

            if __name__ == '__main__':
                dist.spawn(train)
    """

    # 0. get env & check world size
    global _global_parallel_env
    # when init_parallel_env is called, `_global_parallel_env` needs to be updated
    _global_parallel_env = ParallelEnv()
    parallel_env = _global_parallel_env
    # if not in a parallel environment, `init_parallel_env` does nothing
    if parallel_env.world_size < 2:
        warnings.warn(
            "Currently not a parallel execution environment, `paddle.distributed.init_parallel_env` will not do anything."
        )
        return
    # NOTE(xiongkun): support cpu gloo only; add this environment variable to
    #                 enable cpu-only gloo parallel training
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')
    is_cpu_only = _is_cpuonly(backend)
    # 1. device check: must be cpu-only, or compiled with cuda/xpu/npu/mlu
    if not (is_cpu_only or core.is_compiled_with_cuda()
            or core.is_compiled_with_xpu() or core.is_compiled_with_npu()
            or core.is_compiled_with_mlu()):
        raise NotImplementedError(
            "If you want to use CPU-only version, please use 'gloo' as backend"
        )

    if not is_cpu_only and core.is_compiled_with_cuda():
        _check_var_exists("FLAGS_selected_gpus")
        backend = "nccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_xpu():
        _check_var_exists('FLAGS_selected_xpus')
        backend = "bkcl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_npu():
        _check_var_exists('FLAGS_selected_npus')
        backend = "hccl" if backend == "auto" else backend
    elif not is_cpu_only and core.is_compiled_with_mlu():
        _check_var_exists('FLAGS_selected_mlus')
        backend = "cncl" if backend == "auto" else backend

    _check_var_exists("PADDLE_TRAINER_ID")
    _check_var_exists("PADDLE_CURRENT_ENDPOINT")
    _check_var_exists("PADDLE_TRAINERS_NUM")
    _check_var_exists("PADDLE_TRAINER_ENDPOINTS")

    # NOTE(chenweihang): [ why config global place here? ]
    # the dygraph mode will be set to default mode,
    # users will not call `dygraph.guard` or `enable_dygraph`
    # directly, if they want to switch default place,
    # they need to call a function to change default place,
    # here just set correctly place to users
    if is_cpu_only:
        place = core.CPUPlace()
    elif core.is_compiled_with_cuda():
        place = core.CUDAPlace(parallel_env.device_id)
    elif core.is_compiled_with_xpu():
        place = core.XPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_npu():
        place = core.NPUPlace(parallel_env.device_id)
    elif core.is_compiled_with_mlu():
        place = core.MLUPlace(parallel_env.device_id)

    _set_expected_place(place)

    group = None
    if backend in _valid_backend_list and in_dygraph_mode():
        if _default_group_name in _get_group_map_by_name():
            return _get_group_map_by_name()[_default_group_name]
        _set_default_backend(backend)
        rank = int(os.getenv("PADDLE_TRAINER_ID"))
        world_size = int(os.getenv("PADDLE_TRAINERS_NUM"))
        assert rank >= 0 and world_size > rank and world_size > 1, (
            "rank must be non-negative and world_size must be the "
            "maximum rank plus one. Moreover, at least two processes are "
            "required to create a process group.")
        master_addr = os.getenv("MASTER_ADDR", None)
        master_port = os.getenv("MASTER_PORT", None)
        endpoints = ":".join([master_addr, master_port
                              ]) if master_addr and master_port else None
        if endpoints is None:
            endpoints = os.getenv("PADDLE_MASTER", None)
        if endpoints is None:
            endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS").split(',')[0]
        assert endpoints, (
            "The environment variables 'MASTER_ADDR' and 'MASTER_PORT' "
            "must be specified, for example 'export MASTER_ADDR=127.0.0.1' "
            "and 'export MASTER_PORT=54612'. Or you can start your training "
            "with the paddle.distributed.run module.")
        master_addr, master_port = endpoints.split(":")
        master_port = int(master_port)
        is_master = rank == 0
        stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900"))
        default_store = core.TCPStore(master_addr,
                                      master_port,
                                      is_master,
                                      world_size,
                                      stop_check_timeout=stop_check_timeout)
        _set_default_store(default_store)
        pg = _new_process_group_impl(backend,
                                     default_store,
                                     rank,
                                     world_size,
                                     _default_group_name,
                                     pg_options=None)
        ranks = list(range(world_size))
        group = Group(rank,
                      world_size,
                      id=0,
                      ranks=ranks,
                      pg=pg,
                      name=_default_group_name)
        _set_group_map_by_name(_default_group_name, group)
        _set_group_map(0, group)
        parallel_helper._set_parallel_ctx(True)

        paddle.distributed.barrier(group=group)
        return group

    node_num = set([i.split(":")[0] for i in parallel_env.trainer_endpoints])
    # 3: init gloo context (step 1: http server start)
    init_gloo = int(os.getenv("PADDLE_WITH_GLOO", "0"))
    if is_cpu_only or init_gloo or backend == "heter":
        ep_rank_0 = parallel_env.trainer_endpoints[0].split(":")
        manager = Manager()
        # global dict to store status
        http_server_d = manager.dict()
        http_server_d["running"] = False
        if parallel_env.rank == 0:
            # The scope for worker used by http server is '_worker'
            size = {'_worker': parallel_env.world_size}
            if backend == "heter":
                size = {'_worker': len(node_num)}
            http_server = Process(target=_start_kv_server,
                                  args=(int(ep_rank_0[1]), http_server_d,
                                        size))
            http_server.daemon = True
            http_server_d["running"] = True
            http_server.start()

    # 4. init NCCL ParallelStrategy
    strategy = ParallelStrategy()
    if parallel_helper._is_parallel_ctx_initialized():
        warnings.warn("The parallel environment has been initialized.")
    strategy.nranks = parallel_env.world_size
    strategy.local_rank = parallel_env.rank
    strategy.trainer_endpoints = parallel_env.trainer_endpoints
    strategy.current_endpoint = parallel_env.current_endpoint
    strategy.nrings = parallel_env.nrings

    # init nccl or hccl or bkcl or heter context
    if is_cpu_only:
        parallel_helper._set_parallel_ctx(
            core.GLOOParallelContext(strategy, place))
    elif (backend == "heter"):
        parallel_helper._set_parallel_ctx(
            core.HeterParallelContext(strategy, parallel_env.device_id))
    elif core.is_compiled_with_cuda():
        parallel_helper._set_parallel_ctx(
            core.NCCLParallelContext(strategy, place))
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place))
    elif core.is_compiled_with_npu():
        parallel_helper._set_parallel_ctx(
            core.HCCLParallelContext(strategy, place))
    elif core.is_compiled_with_mlu():
        parallel_helper._set_parallel_ctx(
            core.CNCLParallelContext(strategy, place))

    if backend != "heter":
        other_endpoints = strategy.trainer_endpoints[:]
        other_endpoints.remove(strategy.current_endpoint)
        if not is_cpu_only and strategy.local_rank == 0:
            wait_server_ready(other_endpoints)

    parallel_helper._init_parallel_ctx()

    # 5: init gloo context (step 2: gloo init)
    # init_gloo is split into two parts because nccl and gloo
    # separately look for free ports, which sometimes
    # leads to port conflicts.
    if (is_cpu_only or backend == "heter") and parallel_env.rank == 0:
        # compared to the init_gloo branch, we don't need to
        # init gloo here, because it is done in _init_parallel_ctx;
        http_server_d["running"] = False
        http_server.join()

    elif init_gloo:
        wait_server_ready([parallel_env.trainer_endpoints[0]])
        gloo_strategy = core.GlooParallelStrategy()
        gloo_strategy.rank = parallel_env.rank
        gloo_strategy.rank_num = parallel_env.world_size
        gloo_strategy.ip_address = ep_rank_0[0]
        gloo_strategy.ip_port = int(ep_rank_0[1])
        default_init_timeout_seconds = 3600
        default_run_timeout_seconds = 9999999
        gloo_strategy.init_seconds = default_init_timeout_seconds
        gloo_strategy.run_seconds = default_run_timeout_seconds
        gloo = core.GlooParallelContext(gloo_strategy)
        gloo.init()
        if parallel_env.rank == 0:
            http_server_d["running"] = False
            http_server.join()
    return group
Example 10
 def default_pinned():
     if core.is_compiled_with_cuda():
         return PlaceType.CUDA_PINNED
     elif core.is_compiled_with_npu():
         return PlaceType.NPU_PINNED
     return PlaceType.CPU
Example 11
 def default_device():
     if core.is_compiled_with_cuda():
         return PlaceType.CUDA
     elif core.is_compiled_with_npu():
         return PlaceType.NPU
     return PlaceType.CPU
Example 12
 def set_place(self):
     return fluid.CPUPlace() if not core.is_compiled_with_npu(
     ) else paddle.NPUPlace(0)
Example 13
 def test_adam_api(self):
     # NOTE(zhiqiu): cpu and npu have different seeds, so the results should be compared separately.
     self._test_with_place(paddle.CPUPlace())
     if core.is_compiled_with_npu():
         self._test_with_place(paddle.NPUPlace(0))
Example 14
    def _unscale(self, optimizer):
        """
        Unscale the gradients of parameters, multiplies the gradients of parameters by 1/(loss scaling ratio).  
        If this instance of :class:`GradScaler` is not enabled, output are returned unmodified.
        Args:
            optimizer(Optimizer):  The optimizer used to update parameters.
        Returns:
            The unscaled parameters or original parameters.
        """
        if not self._enable:
            return

        optimizer_state = self._optimizer_states[id(optimizer)]

        if optimizer_state["state"] is OptimizerState.UNSCALED:
            raise RuntimeError(
                "unscale_() has already been called on this optimizer since the last update()."
            )
        elif optimizer_state["state"] is OptimizerState.STEPPED:
            raise RuntimeError("unscale_() is being called after step().")

        if getattr(optimizer, '_param_groups', None) and isinstance(
                optimizer._param_groups[0], dict):
            param_grads = []
            param_grads_fp16 = []
            param_grads_fp32 = []
            for group in optimizer._param_groups:
                for param in group['params']:
                    if param._grad_ivar() is not None:
                        param_grads.append(param._grad_ivar())
                        if param._grad_ivar(
                        ).dtype == core.VarDesc.VarType.FP16:
                            param_grads_fp16.append(param._grad_ivar())
                        else:
                            param_grads_fp32.append(param._grad_ivar())
        else:
            param_grads = [
                param._grad_ivar() for param in optimizer._parameter_list
                if param._grad_ivar() is not None
            ]
            param_grads_fp16 = [
                param._grad_ivar() for param in optimizer._parameter_list
                if (param._grad_ivar() is not None) and (
                    param._grad_ivar().dtype == core.VarDesc.VarType.FP16)
            ]
            param_grads_fp32 = [
                param._grad_ivar() for param in optimizer._parameter_list
                if (param._grad_ivar() is not None) and (
                    param._grad_ivar().dtype == core.VarDesc.VarType.FP32)
            ]
        if core.is_compiled_with_npu():
            float_status = _C_ops.alloc_float_status()
            _C_ops.clear_float_status(float_status, float_status)

            if len(param_grads_fp16):
                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                                float_status, param_grads_fp16,
                                                self._temp_found_inf_fp16)
            if len(param_grads_fp32):
                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                                float_status, param_grads_fp32,
                                                self._temp_found_inf_fp32)
        else:
            if len(param_grads_fp16):
                _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale,
                                                param_grads_fp16,
                                                self._temp_found_inf_fp16)
            if len(param_grads_fp32):
                _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale,
                                                param_grads_fp32,
                                                self._temp_found_inf_fp32)

        if len(param_grads_fp16) and len(param_grads_fp32):
            self._found_inf = self._temp_found_inf_fp16 or self._temp_found_inf_fp32
        elif len(param_grads_fp16):
            self._found_inf = self._temp_found_inf_fp16
        else:
            self._found_inf = self._temp_found_inf_fp32

        optimizer_state["state"] = OptimizerState.UNSCALED
Example 15
class BuildExt(build_ext):
    def build_extensions(self):
        if '-Wstrict-prototypes' in self.compiler.compiler_so:
            self.compiler.compiler_so.remove('-Wstrict-prototypes')
        super(BuildExt, self).build_extensions()


# cc flags
paddle_extra_compile_args = [
    '-std=c++14',
    '-shared',
    '-fPIC',
    '-Wno-parentheses',
    '-DPADDLE_WITH_CUSTOM_KERNEL',
]
if core.is_compiled_with_npu():
    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']

# include path
site_packages_path = site.getsitepackages()
paddle_custom_kernel_include = list(
    map(lambda path: os.path.join(path, 'paddle', 'include'),
        site_packages_path))

# include path third_party
compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'],
                                        'build/third_party')
paddle_custom_kernel_include += [
    os.path.join(compile_third_party_path, 'boost/src/extern_boost'),  # boost
    os.path.join(compile_third_party_path, 'install/gflags/include'),  # gflags
    os.path.join(compile_third_party_path, 'install/glog/include'),  # glog
]
Example 16
    def _init_communicator(self,
                           program,
                           current_endpoint,
                           endpoints,
                           rank,
                           ring_id,
                           wait_port,
                           global_ring_id=None,
                           sync=True):
        # if current_endpoint is None, it means just for sync,
        # no group is created.
        if current_endpoint:
            nranks = len(endpoints)
            other_endpoints = endpoints[:]
            other_endpoints.remove(current_endpoint)

        if rank == 0 and wait_port:
            wait_server_ready(other_endpoints)

        def _add_sync_by_allreduce(block):
            sync_var = block.create_var(
                name=unique_name.generate('sync_var'),
                dtype=core.VarDesc.VarType.INT32,
                persistable=False,
                stop_gradient=True)
            block.append_op(
                type='fill_constant',
                inputs={},
                outputs={'Out': [sync_var]},
                attrs={
                    'shape': [1],
                    'dtype': sync_var.dtype,
                    'value': 1,
                    'force_cpu': False,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_allreduce_sum',
                inputs={'X': [sync_var]},
                outputs={'Out': [sync_var]},
                attrs={
                    'ring_id': global_ring_id,
                    'use_calc_stream': True,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_sync_calc_stream',
                inputs={'X': sync_var},
                outputs={'Out': sync_var},
                attrs={OP_ROLE_KEY: OpRole.Forward})

        block = program.global_block()
        if current_endpoint is None:
            assert endpoints is None
            assert sync
            _add_sync_by_allreduce(block)
            return

        comm_id_var = block.create_var(
            name=unique_name.generate('comm_id'),
            persistable=True,
            type=core.VarDesc.VarType.RAW)
        if core.is_compiled_with_cuda():
            block.append_op(
                type='c_gen_nccl_id',
                inputs={},
                outputs={'Out': comm_id_var},
                attrs={
                    'rank': rank,
                    'endpoint': current_endpoint,
                    'other_endpoints': other_endpoints,
                    'ring_id': ring_id,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_comm_init',
                inputs={'X': comm_id_var},
                outputs={},
                attrs={
                    'nranks': nranks,
                    'rank': rank,
                    'ring_id': ring_id,
                    OP_ROLE_KEY: OpRole.Forward
                })
        elif core.is_compiled_with_xpu():
            block.append_op(
                type='c_gen_bkcl_id',
                inputs={},
                outputs={'Out': comm_id_var},
                attrs={
                    'rank': rank,
                    'endpoint': current_endpoint,
                    'other_endpoints': other_endpoints,
                    'ring_id': ring_id,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_comm_init',
                inputs={'X': comm_id_var},
                outputs={},
                attrs={
                    'nranks': nranks,
                    'rank': rank,
                    'ring_id': ring_id,
                    OP_ROLE_KEY: OpRole.Forward
                })
        elif core.is_compiled_with_npu():
            block.append_op(
                type='c_gen_hccl_id',
                inputs={},
                outputs={'Out': comm_id_var},
                attrs={
                    'rank': rank,
                    'endpoint': current_endpoint,
                    'other_endpoints': other_endpoints,
                    'ring_id': ring_id,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_comm_init_hccl',
                inputs={'X': comm_id_var},
                outputs={},
                attrs={
                    'rank': rank,
                    'ring_id': ring_id,
                    'device_id': int(os.getenv("FLAGS_selected_npus")),
                    'rank_ids': nranks,
                    OP_ROLE_KEY: OpRole.Forward
                })
        else:
            raise ValueError(
                "comm_id must be generated in paddlepaddle-xpu or paddlepaddle-xpu."
            )
        if sync: _add_sync_by_allreduce(block)
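
A usage note inferred from the code above: passing current_endpoint=None performs only the allreduce-based synchronization and creates no communication group.

# Hypothetical sync-only call (argument names taken from the method signature):
# self._init_communicator(program, None, None, rank=0, ring_id=0,
#                         wait_port=False, global_ring_id=1, sync=True)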