    def __init__(self,
                 params,
                 optim,
                 group=None,
                 broadcast_fp16=False,
                 offload=False,
                 device="gpu",
                 **kw):

        super().__init__(optim._learning_rate, params, kw)

        # Segmentation information
        # {dtype: [[rank 0 params], [rank 1 params], ...]}
        self._dtype_rank_params = OrderedDict()
        self._param2rank = {}
        self.__segment_params = []
        self._rank_buffer_size = {}  # {dtype: {rank: numel+alignment}}
        self._param2align = {}  # {param.name: align}

        # Default information
        self._optim_defaults = kw
        self._optim = optim
        self._ori_parameter_list = self._optim._parameter_list
        self._ori_param_groups = self._optim._param_groups

        assert hasattr(self._optim, "_master_weights"
                       ), "Must use optimizer with _master_weights attribute"
        self._local_params = params
        self._default_device = device
        self._pfp16 = len(
            list(
                filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                       self._local_params))) > 0

        self.group = dist.new_group(
            _get_global_group().ranks) if group is None else group

        self.world_size = self.group.nranks
        self.rank = self.group.rank

        self.broadcast_fp16 = broadcast_fp16
        self.param_storages = {}  # {dtype: {rank: InternalStorage}}

        if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
            logging.warning(
                "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed."
            )
            self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
                                                      paddle.get_device(),
                                                      self.group)

        if offload:
            assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"

        self.offload = offload  # whether to offload sharded optimizer states to CPU
        self.offload_device = "cpu"
        self.offload_buffer_size = 0
        self.offload_param2align = {}
        self.offload_params = None
        self.offload_grads = None

        self._master_params = {}

        # Update the optimizer's parameter list and adjust parameter storage
        # and usage according to rank.
        self._update_opt_status()
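
# --- Hedged illustration (not part of the class above) ---
# A minimal sketch of the greedy segmentation that the bookkeeping fields
# above (_param2rank, _rank_buffer_size, _param2align) describe: each
# parameter is assigned to the rank whose accumulated buffer is currently
# smallest, and its size is padded up to an alignment boundary. The helper
# name, the sorting by size and the 256-byte alignment are assumptions for
# illustration only, not the class's actual algorithm.
def _sketch_segment_params(param_numels, world_size, alignment=256):
    """param_numels: {param_name: numel}. Returns ({name: rank}, {rank: padded_size})."""
    rank_buffer_size = {rank: 0 for rank in range(world_size)}
    param2rank = {}
    # Handling the largest parameters first keeps the per-rank buffers balanced.
    for name, numel in sorted(param_numels.items(), key=lambda kv: -kv[1]):
        rank = min(rank_buffer_size, key=rank_buffer_size.get)
        padded = ((numel + alignment - 1) // alignment) * alignment
        param2rank[name] = rank
        rank_buffer_size[rank] += padded
    return param2rank, rank_buffer_size

# Example: two ranks, three parameters.
# _sketch_segment_params({"w0": 1000, "w1": 300, "w2": 700}, world_size=2)
# -> ({'w0': 0, 'w2': 1, 'w1': 1}, {0: 1024, 1: 1280})
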
    def __init__(self,
                 layer,
                 optimizer,
                 group=None,
                 sync_buffers=False,
                 device="gpu",
                 pertrain_sync_models=True,
                 accumulate_grads=False,
                 offload=False,
                 sync_comm=False):
        super().__init__()

        # Default configs
        assert core.is_compiled_with_cuda(), "Only support CUDA."
        self._layer = layer
        self._default_device = device
        self.__sync_buffers = sync_buffers
        self._accumulate_grads = accumulate_grads
        self._offload = offload
        self._sync_comm = sync_comm

        # Communication group establishment
        self._group = dist.new_group(
            _get_global_group().ranks) if group is None else group
        self._world_size_scaling = 1.0 / self._group.nranks
        assert self._group.nranks > 1, "Training must be distributed; the number of ranks must be greater than 1."
        self._rank = self._group.rank
        self._global_root_rank = 0  # picking rank 0 as the reference
        self._global_ranks = self._group.ranks
        self._param2buffer_size = dict()  # {param.name: size}
        # {param.name: [(start0, end0), (start1, end1), ...]}
        self._param2buffer = dict()
        self._trainable_params = dict()  # {layer.name: [trainable_params]}

        assert not isinstance(
            optimizer, list), "Multiple optimizers are not supported now."
        self._optim = _OptimizerWrapper(optimizer, self._offload, self._group,
                                        self._update_params_slice)
        self._ori_parameter_list = self._optim._parameter_list
        self._ori_param_groups = self._optim._param_groups

        # Replace optimizer's _grad_clip
        if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
            logging.warning(
                "While using ClipGradByGlobalNorm in ShardingStage3, the grad clip of original optimizer will be changed."
            )
            self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip,
                                                      paddle.get_device(),
                                                      self._group)

        # Synchronize model parameters and buffers across all ranks
        if pertrain_sync_models:
            self._sync_params_and_buffers()

        self._segment_rank_params(self._layer)

        # In the first step, record the execution order of the layer
        self._order_tracer = OrderedDict()
        self._order_tracer["order"] = 0
        self._order_tracer["layer"] = []
        # Register task flow
        self._task_flow = TaskFlow()
        # Register forward hooks
        self._register_forward_hooks(self._layer)
        # Register backward parameter hooks
        self._register_backward_hooks()
        # Redefine optimizer step and clear function
        self._redefine_opt_step()
        self._redefine_opt_clear()
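
# --- Hedged usage sketch (assumptions, not a verified public API) ---
# How a stage-3 wrapper like the __init__ above is typically driven: wrap the
# layer and optimizer, then run an ordinary dygraph step in which the forward
# and backward hooks gather/release parameter shards and the redefined
# optimizer step/clear functions take over. The class name `ShardingStage3`
# and its import path are assumptions; locate the module that defines the
# __init__ above in your Paddle version. Must be launched with
# `python -m paddle.distributed.launch` on more than one rank.
def _sketch_stage3_training_step():
    import paddle
    import paddle.distributed as dist
    # Hypothetical import path; adjust to your Paddle release.
    from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import (
        ShardingStage3)

    dist.init_parallel_env()
    layer = paddle.nn.Linear(1024, 1024)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=1e-3, parameters=layer.parameters())
    model = ShardingStage3(layer, optimizer=optimizer, sync_comm=False)

    data = paddle.randn([16, 1024])
    loss = model(data).mean()
    loss.backward()         # backward hooks reduce grads to the owning rank
    optimizer.step()        # redefined by _redefine_opt_step()
    optimizer.clear_grad()  # redefined by _redefine_opt_clear()
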
# Example #3
    def __init__(
            self,
            layer,
            sharding_optimizer,
            group=None,
            sync_buffers=False,
            buffer_max_size=2**23,  #8MB
            auto_refresh_trainable=True,
            device="gpu"):
        super().__init__()

        # training options
        self._layer = layer
        self._sharding_optimizers = [
            sharding_optimizer
        ] if not isinstance(sharding_optimizer, list) else sharding_optimizer
        assert all(
            list(
                map(lambda opt: isinstance(opt, GroupShardedOptimizerStage2),
                    self._sharding_optimizers))
        ), "Please use GroupShardedOptimizerStage2 optimizer"
        self._sync_buffers = sync_buffers
        self._auto_refresh_trainable = auto_refresh_trainable

        # Communication related attributes
        self._group = collective.new_group(
            collective._get_global_group().ranks) if group is None else group
        self._world_size_scaling = 1.0 / self._group.nranks
        assert self._group.nranks > 1, "Training must be distributed; the number of ranks must be greater than 1."
        self._rank = self._group.rank
        # Pick the first rank in the group as the reference.
        self._global_root_rank = self._group.ranks[0]
        self._default_device = device

        # Global statistical parameters
        self._all_params = []
        for optim in self._sharding_optimizers:
            self._all_params.extend(list(optim.local_params))

        self._trainable_params = []
        self._grad_reduced = []
        self._trainable_param2rank = {}
        self._trainable_param2align = {}
        self._trainable_mask = list(map(_trainable, self._all_params))
        self._param_grads = []

        # Set the grad storage size and display the param and model sizes
        model_size = sum([p._numel() for p in self._layer.parameters()])
        assert buffer_max_size >= 0, "buffer_max_size must be greater than or equal to 0."
        self._buffer_max_size = self._rank_buffer_size(buffer_max_size,
                                                       model_size)
        self._use_grad_storage = buffer_max_size > 0
        self._grad_storages = {}  # {dtype: {rank: GradStorage}}
        self._has_grad_storage = []
        self._grad_storage_list = []

        # Offload
        # TODO(haohongxiang): The offload strategy is not yet supported with multiple optimizers.
        self._offload_optims = list(
            filter(lambda optim: optim.offload, self._sharding_optimizers))
        if len(self._offload_optims) > 0:
            assert len(self._sharding_optimizers) == 1, \
                "The offload strategy only supports a single optimizer."

        self._offload = len(self._offload_optims) > 0
        self._offload_device = "cpu"

        # Set backward pass hooks
        self._bw_hooks = []

        # TODO(Baibaifan): Make the task flow support asynchronous communication.
        # self._tasks_flow = deque()

        # Define optimizer step and clear_grad
        self._redefine_opt_step()
        self._redefine_opt_clear()
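
# --- Hedged construction sketch (assumptions, not a verified public API) ---
# The stage-2 model wrapper above expects its optimizer(s) to already be
# GroupShardedOptimizerStage2 instances (see the type assert on the
# optimizer). A typical pairing is sketched below; the import paths are
# hypothetical and should be adjusted to the module layout of your Paddle
# release. Run under `python -m paddle.distributed.launch` with more than
# one rank.
def _sketch_stage2_setup():
    import paddle
    import paddle.distributed as dist
    # Hypothetical import paths for illustration only.
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import (
        GroupShardedOptimizerStage2)
    from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import (
        GroupShardedStage2)

    dist.init_parallel_env()
    layer = paddle.nn.Linear(1024, 1024)
    base_opt = paddle.optimizer.AdamW(
        learning_rate=1e-3, parameters=layer.parameters())

    # Shard the optimizer states across the group first ...
    sharded_opt = GroupShardedOptimizerStage2(
        params=layer.parameters(), optim=base_opt)
    # ... then wrap the layer so gradients are reduced to the owning rank.
    model = GroupShardedStage2(
        layer, sharding_optimizer=sharded_opt, buffer_max_size=2**23)
    return model, sharded_opt
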
    def __init__(
            self,
            layer,
            sharding_optimizer,
            group=None,
            sync_buffers=False,
            pertrain_sync_models=True,
            buffer_max_size=2**23,  #8MB
            auto_refresh_trainable=True,
            device="gpu",
            use_grad_storage=True,
            accumulate_grads=False):
        super().__init__()

        # training options
        self._layer = layer
        self._sharding_optimizers = [sharding_optimizer] if not isinstance(
            sharding_optimizer, list) else sharding_optimizer
        assert all(
            list(
                map(lambda opt: isinstance(opt, ShardingOptimizerStage2),
                    self._sharding_optimizers))
        ), "Please use ShardingOptimizerStage2 optimizer"
        self._sync_buffers = sync_buffers
        self._auto_refresh_trainable = auto_refresh_trainable

        # Gradient accumulation, Gradient flip
        self._accumulate_grads = accumulate_grads

        # Communication related attributes
        self._group = dist.new_group(
            _get_global_group().ranks) if group is None else group
        self._world_size_scaling = 1.0 / self._group.nranks
        assert self._group.nranks > 1, "Training must be distributed; the number of ranks must be greater than 1."
        self._rank = self._group.rank
        self._global_root_rank = 0  # picking rank 0 as the reference
        self._default_device = device

        # Global statistical parameters
        self._all_params = list(
            chain(*[optim.local_params for optim in self._sharding_optimizers]))
        self._trainable_params = []
        self._grad_reduced = []
        self._trainable_param2rank = {}
        self._trainable_param2align = {}
        self._trainable_mask = list(map(_trainable, self._all_params))
        self._param_grads = []

        # Set the grad storage size and display the param and model sizes
        model_size = sum(
            [np.prod(p.shape) for p in self._layer.parameters()]).item()
        self._buffer_max_size = self._rank_buffer_size(buffer_max_size,
                                                       model_size)
        self._use_grad_storage = use_grad_storage
        self._grad_storages = {}  # {dtype: {rank: GradStorage}}
        self._has_grad_storage = []
        self._grad_storage_list = []

        # Offload
        # TODO(haohongxiang): The offload strategy is not yet supported with multiple optimizers.
        self._offload_optims = list(
            filter(lambda optim: optim.offload, self._sharding_optimizers))
        if len(self._offload_optims) > 0:
            assert len(self._sharding_optimizers) == 1, \
                "The offload strategy only supports a single optimizer."

        self._offload = self._sharding_optimizers[0].offload
        self._offload_device = "cpu"

        # Set backward pass hooks
        self._bw_hooks = []

        # Synchronize model parameters and buffers across all ranks
        if pertrain_sync_models:
            self._sync_params_and_buffers()

        # Set tasks flow
        self._tasks_flow = deque()

        # Define optimizer step and clear_grad
        if self._accumulate_grads:
            self._redefine_opt_step()
        self._redefine_opt_clear()
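
# --- Hedged illustration of _world_size_scaling (plain Python) ---
# The wrappers above keep `_world_size_scaling = 1.0 / nranks`. Assuming the
# collective reduction of gradients is a sum over ranks, pre-scaling each
# local gradient by 1/nranks makes the reduced result the mean of the
# per-rank gradients, which is what data-parallel training expects. The
# helper below is a standalone sketch, not framework code.
def _sketch_mean_via_prescaled_sum(local_grads):
    """local_grads: one gradient value per rank. Returns the reduced value."""
    world_size_scaling = 1.0 / len(local_grads)
    return sum(g * world_size_scaling for g in local_grads)

# Example: four ranks produce gradients 1.0, 2.0, 3.0 and 4.0; the pre-scaled
# sum equals their mean, 2.5.
assert _sketch_mean_via_prescaled_sum([1.0, 2.0, 3.0, 4.0]) == 2.5
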
# Example #5
    def __init__(self,
                 params,
                 optim,
                 group=None,
                 offload=False,
                 device="gpu",
                 pertrain_sync_models=True,
                 **kw):

        super().__init__(learning_rate=optim._learning_rate, parameters=params)
        assert core.is_compiled_with_cuda(), "Only GPU is supported now"

        # Segmentation information
        # {dtype: [[rank 0 params], [rank 1 params], ...]}
        self._dtype_rank_params = OrderedDict()
        self._param2rank = {}
        self.__segment_params = []
        self._rank_buffer_size = {}  # {dtype: {rank: numel+alignment}}
        self._param2align = {}  # {param.name: align}

        # Default information
        self._optim = optim

        assert hasattr(self._optim, "_master_weights"
                       ), "Must use optimizer with _master_weights attribute"

        # Support both parameter groups and a flat parameter list
        self._local_params = []
        if isinstance(params[0], dict):
            for param_group in params:
                self._local_params.extend(list(param_group["params"]))
        else:
            self._local_params.extend(list(params))

        self._default_device = device
        self._pfp16 = len(
            list(
                filter(lambda x: x.trainable and x.dtype == Type.fp16.value,
                       self._local_params))) > 0

        self._group = new_group(
            _get_global_group().ranks) if group is None else group

        self.world_size = self._group.nranks
        self._rank = self._group.rank
        self._global_root_rank = self._group.ranks[0]

        # Synchronize model parameters and buffers across all ranks
        if pertrain_sync_models:
            self._sync_params_and_buffers()

        self.param_storages = {}  # {dtype: {rank: InternalStorage}}

        if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
            logging.warning(
                "While using ClipGradByGlobalNorm in GroupShardedOptimizerStage2, the grad clip of original optimizer will be changed."
            )

            self._optim._grad_clip = GroupShardedClipGrad(
                self._optim._grad_clip, paddle.get_device(), self._group)
            if self._optim._parameter_list and isinstance(
                    self._optim._parameter_list[0], dict):
                for item in self._optim._param_groups:
                    if "grad_clip" in item.keys():
                        item["grad_clip"] = self._optim._grad_clip

        if offload:
            assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16"

        self.offload = offload  # whether to offload sharded optimizer states to CPU
        self.offload_device = "cpu"
        self.offload_buffer_size = 0
        self.offload_param2align = {}
        self.offload_params = None
        self.offload_grads = None
        self.dev_id = int(paddle.get_device().split(":")[1])

        self._master_params = {}

        # Update the optimizer's parameter list and adjust parameter storage
        # and usage according to rank.
        self._update_opt_status()
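
# --- Hedged sketch of a public entry point (assumption) ---
# Recent Paddle releases document `paddle.distributed.sharding.group_sharded_parallel`,
# which composes GroupShardedOptimizerStage2 with the stage-2/stage-3 model
# wrappers so they do not have to be constructed by hand. Whether the helper
# exists, and the exact levels and return values it supports, depends on your
# Paddle version; treat this as a sketch rather than a guaranteed API. It must
# also be launched on more than one rank.
def _sketch_group_sharded_parallel():
    import paddle
    import paddle.distributed as dist
    from paddle.distributed.sharding import group_sharded_parallel

    dist.init_parallel_env()
    layer = paddle.nn.Linear(1024, 1024)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=1e-3, parameters=layer.parameters())
    scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    # level="os_g" shards optimizer states and gradients (stage 2);
    # "p_g_os" would additionally shard the parameters (stage 3).
    model, optimizer, scaler = group_sharded_parallel(
        model=layer, optimizer=optimizer, level="os_g", scaler=scaler)
    return model, optimizer, scaler
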
    def __init__(self,
                 layer,
                 optimizer,
                 group=None,
                 sync_buffers=False,
                 device="gpu",
                 segment_size=2**20,
                 pertrain_sync_models=True,
                 offload=False,
                 sync_comm=False):
        super().__init__()

        # Default configs
        assert core.is_compiled_with_cuda(), "Only support CUDA."
        self._layer = layer
        self._default_device = device
        self.__sync_buffers = sync_buffers
        self._offload = offload
        self._sync_comm = sync_comm
        # segmentation size
        assert segment_size >= 0, "segment_size must be greater than or equal to 0."
        self._segment_size = segment_size

        global DEV
        DEV = "cpu" if paddle.get_device() == "cpu" else paddle.get_device(
        ).split(":")[0]
        global DEV_ID
        DEV_ID = 0 if paddle.get_device() == "cpu" else int(
            paddle.get_device().split(":")[1])
        global param2dtype
        param2dtype = dict()

        # Communication group establishment
        self._group = collective.new_group(
            collective._get_global_group().ranks) if group is None else group
        self._world_size_scaling = 1.0 / self._group.nranks
        assert self._group.nranks > 1, "Training must be distributed; the number of ranks must be greater than 1."
        self._rank = self._group.rank
        # Pick the first rank in the group as the reference.
        self._global_root_rank = self._group.ranks[0]

        # Parameter segmentation for global ranks
        # After flatten -> self._param2buffer_size, self._param2buffer, self._trainable_params
        self._param2buffer_size = dict()  # {param.name: size}
        # {param.name: [(start0, end0), (start1, end1), ...]}
        self._param2buffer = dict()
        self._trainable_params = dict()  # {id(layer): [trainable_params]}
        self._unslice_params = set()  # param's numel <= segment_size
        self._unslice_params2align = dict()  # {param.name: param's align}
        self._grad_storages = dict()  # {param.dtype: GradStorage}

        assert not isinstance(
            optimizer, list), "Multiple optimizers are not supported now."
        self._optim = _OptimizerWrapper(optimizer, self._offload, self._group,
                                        self._update_params_slice)
        self._ori_parameter_list = self._optim._parameter_list
        self._ori_param_groups = self._optim._param_groups

        # Replace optimizer's _grad_clip
        if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm):
            logging.warning(
                "While using ClipGradByGlobalNorm in GroupShardedStage3, the grad clip of original optimizer will be changed."
            )
            self._optim._grad_clip = GroupShardedClipGrad(
                self._optim._grad_clip, paddle.get_device(), self._group)
            if self._optim._parameter_list and isinstance(
                    self._optim._parameter_list[0], dict):
                for item in self._optim._param_groups:
                    if "grad_clip" in item.keys():
                        item["grad_clip"] = self._optim._grad_clip

        # Synchronize model parameters and buffers across all ranks
        if pertrain_sync_models:
            self._sync_params_and_buffers()

        self._segment_rank_params(self._layer)

        # Add unsliced params to the master weights when using fp16
        self._handle_unslice_params()

        # In the first step, record the execution order of the layer
        self._order_tracer = OrderedDict()
        self._order_tracer["order"] = 0
        self._order_tracer["layer"] = list()

        # Register task flow
        self._task_flow = TaskFlow()

        # Register forward hooks
        self._register_forward_hooks(self._layer)

        # Register backward parameter hooks
        self._register_backward_hooks()

        # Redefine optimizer step and clear function
        self._redefine_opt_step()
        self._redefine_opt_clear()
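
# --- Hedged illustration of _param2buffer (plain Python) ---
# A minimal sketch of the per-rank slicing recorded in _param2buffer above:
# a parameter's flattened storage is padded so it divides evenly by the
# number of ranks, and each rank owns one contiguous (start, end) slice.
# Parameters no larger than segment_size stay whole, matching the
# _unslice_params comment above. The helper name and padding rule are
# assumptions for illustration only.
def _sketch_param2buffer(numel, world_size, segment_size=2**20):
    """Return (padded_size, [(start, end) per rank]) or None if kept unsliced."""
    if numel <= segment_size:
        return None  # kept whole, handled like _unslice_params
    # Pad so every rank receives an equally sized slice.
    padded = ((numel + world_size - 1) // world_size) * world_size
    per_rank = padded // world_size
    slices = [(rank * per_rank, (rank + 1) * per_rank)
              for rank in range(world_size)]
    return padded, slices

# Example: a 10-element parameter across 4 ranks is padded to 12 elements,
# giving slices [(0, 3), (3, 6), (6, 9), (9, 12)].
assert _sketch_param2buffer(10, 4, segment_size=4) == (12, [(0, 3), (3, 6),
                                                            (6, 9), (9, 12)])
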