Example #1
    def _sanity_check(self, zero_config_dict):
        deprecated_dict = dict(
            ZERO_OPTIMIZATION_CPU_OFFLOAD=ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER,
            ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS=
            ZERO_OPTIMIZATION_OFFLOAD_PARAM,
            ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY=
            f'{ZERO_OPTIMIZATION_OFFLOAD_PARAM} or {ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER}'
        )

        for old_key, new_key in deprecated_dict.items():
            if old_key in zero_config_dict:
                logger.warning(
                    f'DeepSpeedConfig: {old_key} is deprecated. Please use {new_key}.'
                )
Example #2
    def read_zero_config_deprecated(self, param_dict):
        zero_config_dict = {}
        zero_config_dict[
            ZERO_OPTIMIZATION_STAGE] = 1 if param_dict[ZERO_OPTIMIZATION] else 0
        if zero_config_dict[ZERO_OPTIMIZATION_STAGE] > 0:
            zero_config_dict[ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE] = get_scalar_param(
                param_dict,
                ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED,
                ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)

        logger.warning(
            'DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}'
            .format(ZERO_FORMAT))
        return zero_config_dict
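For reference, here is a hedged sketch of the two config shapes this helper bridges: the deprecated boolean form it maps to stage 1, and the nested-dict form the deprecation warning points to (the key names follow DeepSpeed's public zero_optimization config; the bucket size value is illustrative only).

# Deprecated shape: "zero_optimization" given as a bare boolean (mapped to stage 1 above).
deprecated_style = {"zero_optimization": True}

# Current shape suggested by the warning (see ZERO_FORMAT for the authoritative template).
current_style = {
    "zero_optimization": {
        "stage": 1,
        "allgather_bucket_size": 5e8,
    }
}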
Example #3
    def __init__(self,
                 optimizer: Optimizer,
                 total_num_steps: int,
                 warmup_min_lr: float = 0.0,
                 warmup_max_lr: float = 0.001,
                 warmup_num_steps: int = 1000,
                 last_batch_iteration: int = -1):

        self.total_num_steps = total_num_steps
        super(WarmupDecayLR,
              self).__init__(optimizer, warmup_min_lr, warmup_max_lr,
                             warmup_num_steps, last_batch_iteration)
        if self.total_num_steps < self.warmup_num_steps:
            logger.warning(
                'total_num_steps {} is less than warmup_num_steps {}'.format(
                    total_num_steps, warmup_num_steps))
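A minimal usage sketch for the constructor above, assuming WarmupDecayLR is importable from deepspeed.runtime.lr_schedules (the import path and hyperparameter values are assumptions); choosing total_num_steps smaller than warmup_num_steps triggers the warning shown.

import torch
from deepspeed.runtime.lr_schedules import WarmupDecayLR  # assumed import path

model = torch.nn.Linear(16, 16)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

# total_num_steps < warmup_num_steps, so the scheduler logs the warning above.
scheduler = WarmupDecayLR(optimizer,
                          total_num_steps=500,
                          warmup_min_lr=0.0,
                          warmup_max_lr=0.001,
                          warmup_num_steps=1000)

for _ in range(5):
    optimizer.step()
    scheduler.step()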
Example #4
 def enable_weight_quantization(self, start_bits, target_bits,
                                quantization_period,
                                weight_quantization_enabled_in_forward,
                                quantization_type, num_groups):
     self.weight.start_bits = start_bits
     self.weight.target_bits = target_bits
     self.weight.q_period = quantization_period
     self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward
     if self.weight_quantization_enabled_in_forward:
         assert self.weight.target_bits >= 4, 'Only >=4 bits weight quantization are supported during forward pass for now'
         logger.warning(
             "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************"
         )
         if quantization_type == 'symmetric':
             self.weight_quantizer = SymQuantizer.apply
         else:
             self.weight_quantizer = AsymQuantizer.apply
         self.weight_quantize_num_groups = num_groups
Example #5
 def _process_deprecated_field(self, pydantic_config, field):
     fields_set = pydantic_config.__fields_set__
     dep_param = field.name
     if dep_param in fields_set:
         kwargs = field.field_info.extra
         new_param = kwargs.get("new_param", "")
         logger.warning(f"Config parameter {dep_param} is deprecated" +
                        (f" use {new_param} instead" if new_param else ""))
         if new_param and kwargs.get("set_new_param", True):
             assert (
                 new_param not in fields_set
             ), f"Cannot provide deprecated parameter '{dep_param}' and replacing parameter '{new_param}' together"
             new_param_fn = kwargs.get("new_param_fn", lambda x: x)
             param_value = new_param_fn(getattr(pydantic_config, dep_param))
             try:
                 setattr(pydantic_config, new_param, param_value)
             except Exception as e:
                 logger.error(
                     f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'"
                 )
                 raise e
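A hedged sketch of how a deprecated field might be declared so that _process_deprecated_field can route its value (pydantic v1 style, matching the __fields_set__ and field_info.extra usage above; the model and field names are hypothetical).

from pydantic import BaseModel, Field  # assumes pydantic < 2


class HypotheticalConfig(BaseModel):
    # Keyword arguments such as new_param / new_param_fn are not pydantic options;
    # in pydantic v1 they land in field_info.extra, where the method above looks
    # them up.
    cpu_offload: bool = Field(False,
                              deprecated=True,
                              new_param="offload_device",
                              new_param_fn=lambda v: "cpu" if v else "none")
    offload_device: str = "none"


# Setting the deprecated field would log the warning and, because set_new_param
# defaults to True, copy the converted value onto offload_device when
# _process_deprecated_field is run over the model's fields.
config = HypotheticalConfig(cpu_offload=True)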
Example #6
    def _do_warning_check(self):
        fp16_enabled = self.fp16_enabled or self.zero_enabled

        vocabulary_size = self._param_dict.get(VOCABULARY_SIZE, VOCABULARY_SIZE_DEFAULT)
        if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
            logger.warning(
                "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may import tensor core utilization."
                .format(vocabulary_size,
                        TENSOR_CORE_ALIGN_SIZE))

        if self.optimizer_params is not None and \
            MAX_GRAD_NORM in self.optimizer_params.keys() and \
                self.optimizer_params[MAX_GRAD_NORM] > 0:
            if fp16_enabled:
                if self.global_rank == 0:
                    logger.warning(
                        'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
                        .format(MAX_GRAD_NORM,
                                self.optimizer_params[MAX_GRAD_NORM]))
            else:
                if self.global_rank == 0:
                    logger.warning(
                        'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
                        .format(self.optimizer_params[MAX_GRAD_NORM]))
                self.optimizer_params[MAX_GRAD_NORM] = 0.0
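The alignment check above only warns; padding the embedding table is left to the user. A small helper sketch for rounding a vocabulary size up to the alignment boundary (TENSOR_CORE_ALIGN_SIZE is assumed to be 8 here, matching the FP16 tensor core requirement).

TENSOR_CORE_ALIGN_SIZE = 8  # assumed value; DeepSpeed defines its own constant


def pad_vocab_size(vocab_size: int, align: int = TENSOR_CORE_ALIGN_SIZE) -> int:
    """Round vocab_size up to the next multiple of align."""
    return ((vocab_size + align - 1) // align) * align


# Example: GPT-2's 50257-token vocabulary would be padded to 50264.
assert pad_vocab_size(50257) == 50264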
Example #7
 def enable_weight_quantization(self, start_bits, target_bits,
                                quantization_period,
                                weight_quantization_enabled_in_forward,
                                quantization_type, num_groups):
     self.weight.start_bits = start_bits
     self.weight.target_bits = target_bits
     self.weight.q_period = quantization_period
     self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward
     if self.weight_quantization_enabled_in_forward:
         logger.warning(
             "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************"
         )
         if self.weight.target_bits >= 3:
             if quantization_type == 'symmetric':
                 self.weight_quantizer = SymQuantizer.apply
             else:
                 self.weight_quantizer = AsymQuantizer.apply
         elif self.weight.target_bits == 2:
             assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for ternary weight quantization'
             self.weight_quantizer = TernaryQuantizer.apply
         elif self.weight.target_bits == 1:
             assert quantization_type == 'symmetric', 'Only symmetric quantization is supported for binary weight quantization'
             self.weight_quantizer = BinaryQuantizer.apply
         self.weight_quantize_num_groups = num_groups
Example #8
    def load(self, path: str, map_location=None):
        tag = _get_tag_from_path(path)
        first_load_flag = self.tag_flag is None or self.tag_flag == tag
        if not self.enable_nebula_load and first_load_flag:
            self.tag_flag = tag
            logger.info(
                f"[Nebula] Disable nebula load. Loading checkpoint from {path}..."
            )
            partition = torch.load(path, map_location=map_location)
            logger.info(
                f"[Nebula] Disable nebula load. Loaded checkpoint from {path}..."
            )
            return partition

        partition_name = os.path.basename(path)
        logger.info(
            f"[Nebula] Loading {path} under tag {tag} from {self.nebula_load_path}..."
        )

        checkpoint = None
        if tag is None:
            checkpoint = torch_nebula.get_latest_checkpoint(
                persist_path=self.nebula_load_path)
            if checkpoint is None or checkpoint.tag == '':
                logger.warning(
                    "Unable to find latest valid checkpoint from Nebula!")
                return None
        else:
            checkpoint = torch_nebula.get_checkpoint(
                tag=tag, persist_path=self.nebula_load_path)
        partition = checkpoint.load(partition_name, map_location=map_location)
        logger.info(
            f"[Nebula] Loaded {path} under tag {tag} from {self.nebula_load_path}."
        )
        return partition
Example #9
 def __init__(self, config):
     super().__init__()
     self.state = {}
     assert "curriculum_type" in config, "Curriculum learning requires the config 'curriculum_type'"
     assert "min_difficulty" in config, "Curriculum learning requires the config 'min_difficulty'"
     assert "max_difficulty" in config, "Curriculum learning requires the config 'max_difficulty'"
     assert "schedule_type" in config, "Curriculum learning requires the config 'schedule_type'"
     self.state['min_difficulty'] = config['min_difficulty']
     self.state['max_difficulty'] = config['max_difficulty']
     self.state['current_difficulty'] = config['min_difficulty']
     self.state['schedule_type'] = config['schedule_type']
     if config['schedule_type'] == 'fixed_discrete':
         """
         The schedule_config is a list of difficulty and a list of max
         step belonging to each difficulty. Example json config:
         "schedule_config": {
           "difficulty": [1,2,3],
           "max_step": [5,10]
         }
         The "max_step" has one less element than "difficulty", because
         the last difficulty will be used for all following steps.
         The self.state['schedule'] is a dictionary of
         difficulty : [max step for this difficulty, next difficulty].
         """
         assert "difficulty" in config[
             'schedule_config'], "Curriculum learning with fixed_discrete schedule requires the schedule_config 'difficulty'"
         assert "max_step" in config[
             'schedule_config'], "Curriculum learning with fixed_discrete schedule requires the schedule_config 'max_step'"
         assert len(config['schedule_config']['max_step']) > 0
         assert len(config['schedule_config']['difficulty']) > 0
         assert len(config['schedule_config']['difficulty']) == len(
             config['schedule_config']['max_step']) + 1
         self.state['schedule'] = {}
         for i in range(len(config['schedule_config']['max_step'])):
             self.state['schedule'][config['schedule_config']['difficulty'][i]] = \
                 [config['schedule_config']['max_step'][i],
                  config['schedule_config']['difficulty'][i+1]]
     elif config['schedule_type'] == 'fixed_root':
         """
         The schedule_config includes:
         total_curriculum_step: how many steps the curriculum learning takes to go
         from min difficulty to max difficulty.
          difficulty_step: the difficulty determined at every step must be
          a multiple of this value. This is used to determine the granularity
          of difficulty increases, and to ensure the use of NVIDIA Tensor
          Core acceleration (which requires a multiple of 8 for FP16 or
          16 for INT8).
         root_degree: the degree of the root function. Degree of 2 means
         square root and degree of 3 means cube root. Degree of 1 is
         equivalent to linear.
         "schedule_config": {
           "total_curriculum_step": 30000,
           "difficulty_step": 8,
           "root_degree": 2
         }
         """
         assert "total_curriculum_step" in config[
             'schedule_config'], "Curriculum learning with fixed_root schedule requires the schedule_config 'total_curriculum_step'"
         assert "difficulty_step" in config[
             'schedule_config'], "Curriculum learning with fixed_root schedule requires the schedule_config 'difficulty_step'"
         assert "root_degree" in config[
             'schedule_config'], "Curriculum learning with fixed_root schedule requires the schedule_config 'root_degree'"
         if config['schedule_config']['difficulty_step'] % 8 != 0:
             logger.warning(
                 f'The difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your hardware.'
             )
         self.state['schedule'] = config['schedule_config']
     elif config['schedule_type'] == 'fixed_linear':
         """
         The schedule_config is the same as 'fixed_root' but without the
         root_degree.
         "schedule_config": {
           "total_curriculum_step": 30000,
           "difficulty_step": 8
         }
         """
         assert "total_curriculum_step" in config[
             'schedule_config'], "Curriculum learning with fixed_linear schedule requires the schedule_config 'total_curriculum_step'"
         assert "difficulty_step" in config[
             'schedule_config'], "Curriculum learning with fixed_linear schedule requires the schedule_config 'difficulty_step'"
         if config['schedule_config']['difficulty_step'] % 8 != 0:
             logger.warning(
                 f'The difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your hardware.'
             )
         self.state['schedule'] = config['schedule_config']
     else:
         raise RuntimeError('Unsupported curriculum schedule type')
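For reference, here is a config dict that satisfies the assertions above for the 'fixed_linear' schedule (the curriculum_type and difficulty values are illustrative only; the code only asserts that the keys are present).

curriculum_config = {
    "curriculum_type": "seqlen",        # illustrative; only presence is asserted
    "min_difficulty": 8,
    "max_difficulty": 1024,
    "schedule_type": "fixed_linear",
    "schedule_config": {
        "total_curriculum_step": 30000,
        "difficulty_step": 8,           # a multiple of 8, so no Tensor Core warning
    },
}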
Example #10
    def __init__(self,
                 model_params,
                 lr=1e-3,
                 bias_correction=True,
                 betas=(0.9, 0.999),
                 eps=1e-8,
                 weight_decay=0,
                 amsgrad=False,
                 adamw_mode=True,
                 fp32_optimizer_states=True):
        """Fast vectorized implementation of two variations of Adam optimizer on CPU:

        * Adam: A Method for Stochastic Optimization: (https://arxiv.org/abs/1412.6980);
        * AdamW: Fixing Weight Decay Regularization in Adam (https://arxiv.org/abs/1711.05101)

        DeepSpeed CPU Adam(W) provides a 5x to 7x speedup over torch.optim.Adam(W).
        To use this optimizer, the model's master parameters (in FP32) must
        reside in CPU memory.

        To train on a heterogeneous system, such as coordinating CPU and GPU, DeepSpeed offers
        the ZeRO-Offload technology which efficiently offloads the optimizer states into CPU memory,
        with minimal impact on training throughput. DeepSpeedCPUAdam plays an important role in minimizing
        the optimizer's latency overhead on the CPU. Please refer to the ZeRO-Offload tutorial
        (https://www.deepspeed.ai/tutorials/zero-offload/) for more information on how to enable this technology.

        When calling the step function, two options are available: (1) update the optimizer's states, or
        (2) update the optimizer's states and copy the parameters back to the GPU at the same time. We have
        seen that the second option can bring 30% higher throughput than doing the copy separately with option one.


        .. note::
                We recommend using our `config
                <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
                to allow :meth:`deepspeed.initialize` to build this optimizer
                for you.


        Arguments:
            model_params (iterable): iterable of parameters to optimize or dicts defining
                parameter groups.
            lr (float, optional): learning rate. (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square. (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability. (default: 1e-8)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
            amsgrad (boolean, optional): whether to use the AMSGrad variant of this
                algorithm from the paper `On the Convergence of Adam and Beyond`_
                (default: False) NOT SUPPORTED in DeepSpeed CPUAdam!
            adamw_mode: select between Adam and AdamW implementations (default: AdamW)
            fp32_optimizer_states: creates momentum and variance in full precision regardless of
                        the precision of the parameters (default: True)
        """

        default_args = dict(lr=lr,
                            betas=betas,
                            eps=eps,
                            weight_decay=weight_decay,
                            bias_correction=bias_correction,
                            amsgrad=amsgrad)
        super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)

        self.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower()
        if "amd" in self.cpu_vendor:
            for group_id, group in enumerate(self.param_groups):
                for param_id, p in enumerate(group['params']):
                    if p.dtype == torch.half:
                        logger.warning(
                            "FP16 params for CPUAdam may not work on AMD CPUs")
                        break
                else:
                    continue
                break

        self.opt_id = DeepSpeedCPUAdam.optimizer_id
        DeepSpeedCPUAdam.optimizer_id = DeepSpeedCPUAdam.optimizer_id + 1
        self.adam_w_mode = adamw_mode
        self.fp32_optimizer_states = fp32_optimizer_states
        self.ds_opt_adam = CPUAdamBuilder().load()

        self.ds_opt_adam.create_adam(self.opt_id, lr, betas[0], betas[1], eps,
                                     weight_decay, adamw_mode,
                                     should_log_le("info"))
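Following the docstring's recommendation to let deepspeed.initialize build this optimizer, here is a hedged sketch of a DeepSpeed config that selects CPU Adam via ZeRO optimizer offload (keys follow the config documentation linked above; the batch size and hyperparameters are illustrative).

ds_config = {
    "train_batch_size": 8,
    "fp16": {"enabled": True},
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 1e-3,
            "betas": [0.9, 0.999],
            "eps": 1e-8,
            "weight_decay": 0,
        },
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},  # CPU offload selects DeepSpeedCPUAdam
    },
}

# model_engine, optimizer, _, _ = deepspeed.initialize(model=model,
#                                                      model_parameters=model.parameters(),
#                                                      config=ds_config)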