def _sanity_check(self, zero_config_dict):
    """Warn about deprecated ZeRO offload keys found in ``zero_config_dict``.

    For every deprecated key that is present in the given config dictionary a
    warning naming the replacement key is logged. The dictionary itself is
    not modified and nothing is returned.

    Args:
        zero_config_dict: mapping of ZeRO configuration keys to values.
    """
    # The mapping must be keyed by the *values* of the deprecated-name
    # constants. Building it with ``dict(NAME=...)`` would use the literal
    # identifier text (e.g. 'ZERO_OPTIMIZATION_CPU_OFFLOAD') as the key, so the
    # membership test below would only ever match a config that contained
    # that identifier verbatim.
    # NOTE(review): assumes the ZERO_OPTIMIZATION_CPU_OFFLOAD* constants are
    # defined in this module next to ZERO_OPTIMIZATION_OFFLOAD_* - confirm.
    deprecated_dict = {
        ZERO_OPTIMIZATION_CPU_OFFLOAD:
        ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER,
        ZERO_OPTIMIZATION_CPU_OFFLOAD_PARAMS:
        ZERO_OPTIMIZATION_OFFLOAD_PARAM,
        ZERO_OPTIMIZATION_CPU_OFFLOAD_USE_PIN_MEMORY:
        f'{ZERO_OPTIMIZATION_OFFLOAD_PARAM} or {ZERO_OPTIMIZATION_OFFLOAD_OPTIMIZER}',
    }

    for old_key, new_key in deprecated_dict.items():
        if old_key in zero_config_dict:
            logger.warning(
                f'DeepSpeedConfig: {old_key} is deprecated. Please use {new_key}.'
            )
def read_zero_config_deprecated(self, param_dict):
    """Translate the legacy ZeRO setting into a ZeRO config dictionary.

    The value stored under ``ZERO_OPTIMIZATION`` in ``param_dict`` is treated
    as a truthy flag: a truthy value selects stage 1, anything else stage 0.
    When the stage is non-zero, the allgather bucket size is read from its
    deprecated key (falling back to the default). A deprecation warning that
    shows the expected format is always logged.

    Args:
        param_dict: the user supplied configuration dictionary.

    Returns:
        A new dict holding the stage and, for stage > 0, the allgather bucket
        size.
    """
    stage = 1 if param_dict[ZERO_OPTIMIZATION] else 0
    zero_config_dict = {ZERO_OPTIMIZATION_STAGE: stage}

    if stage > 0:
        bucket_size = get_scalar_param(
            param_dict,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEPRECATED,
            ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE_DEFAULT)
        zero_config_dict[ZERO_OPTIMIZATION_ALLGATHER_BUCKET_SIZE] = bucket_size

    deprecation_notice = (
        'DeepSpeedConfig: this format of ZeRO optimization setup is deprecated. Please use the following format: {}'
        .format(ZERO_FORMAT))
    logger.warning(deprecation_notice)
    return zero_config_dict
def __init__(self,
             optimizer: Optimizer,
             total_num_steps: int,
             warmup_min_lr: float = 0.0,
             warmup_max_lr: float = 0.001,
             warmup_num_steps: int = 1000,
             last_batch_iteration: int = -1):
    """Set up a warmup schedule that also knows the total step count.

    Args:
        optimizer: the wrapped optimizer, forwarded to the parent class.
        total_num_steps: total number of training steps.
        warmup_min_lr: forwarded to the parent class.
        warmup_max_lr: forwarded to the parent class.
        warmup_num_steps: forwarded to the parent class.
        last_batch_iteration: forwarded to the parent class.
    """
    # Recorded before the parent initializer runs, so the attribute already
    # exists while the parent sets itself up.
    self.total_num_steps = total_num_steps

    super(WarmupDecayLR, self).__init__(
        optimizer,
        warmup_min_lr,
        warmup_max_lr,
        warmup_num_steps,
        last_batch_iteration,
    )

    # A schedule shorter than its own warmup phase is only reported, not
    # rejected.
    if self.total_num_steps < self.warmup_num_steps:
        message = 'total_num_steps {} is less than warmup_num_steps {}'.format(
            total_num_steps, warmup_num_steps)
        logger.warning(message)
def enable_weight_quantization(self, start_bits, target_bits,
                               quantization_period,
                               weight_quantization_enabled_in_forward,
                               quantization_type, num_groups):
    """Record weight-quantization settings and pick the quantizer.

    The bit widths and period are attached to ``self.weight``. When
    quantization in the forward pass is requested, the target width must be
    at least 4 bits, a warning is logged, and a symmetric or asymmetric
    quantizer is selected according to ``quantization_type``.

    Args:
        start_bits: stored as ``self.weight.start_bits``.
        target_bits: stored as ``self.weight.target_bits``.
        quantization_period: stored as ``self.weight.q_period``.
        weight_quantization_enabled_in_forward: whether to quantize in the
            forward pass.
        quantization_type: ``'symmetric'`` selects ``SymQuantizer``; any other
            value selects ``AsymQuantizer``.
        num_groups: stored as ``self.weight_quantize_num_groups``.
    """
    weight = self.weight
    weight.start_bits = start_bits
    weight.target_bits = target_bits
    weight.q_period = quantization_period
    self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward

    # NOTE(review): the quantizer and group count are configured only when
    # forward-pass quantization is enabled - confirm against forward().
    if not self.weight_quantization_enabled_in_forward:
        return

    assert weight.target_bits >= 4, 'Only >=4 bits weight quantization are supported during forward pass for now'
    logger.warning(
        "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************"
    )

    use_symmetric = quantization_type == 'symmetric'
    self.weight_quantizer = SymQuantizer.apply if use_symmetric else AsymQuantizer.apply
    self.weight_quantize_num_groups = num_groups
def _process_deprecated_field(self, pydantic_config, field):
    """Warn about a deprecated config field and forward its value.

    Nothing happens unless the deprecated field was explicitly set on
    ``pydantic_config``. If it was, a deprecation warning is logged. When the
    field's extra metadata names a ``new_param`` (and ``set_new_param`` is not
    disabled), the deprecated value is passed through the optional
    ``new_param_fn`` converter and assigned to the replacement field.

    Args:
        pydantic_config: the config object; its ``__fields_set__`` is used to
            tell which fields were provided.
        field: the field descriptor; ``field.name`` and
            ``field.field_info.extra`` are read.

    Raises:
        AssertionError: if both the deprecated and the replacement field were
            provided.
        Exception: whatever ``setattr`` on the replacement field raises, after
            it has been logged.
    """
    provided_fields = pydantic_config.__fields_set__
    dep_param = field.name
    if dep_param not in provided_fields:
        return

    extra = field.field_info.extra
    new_param = extra.get("new_param", "")
    replacement_hint = f" use {new_param} instead" if new_param else ""
    logger.warning(f"Config parameter {dep_param} is deprecated" +
                   replacement_hint)

    # Only forward the value when a replacement exists and forwarding has not
    # been switched off for this field.
    if not (new_param and extra.get("set_new_param", True)):
        return

    assert (
        new_param not in provided_fields
    ), f"Cannot provide deprecated parameter '{dep_param}' and replacing parameter '{new_param}' together"

    convert = extra.get("new_param_fn", lambda x: x)
    converted_value = convert(getattr(pydantic_config, dep_param))
    try:
        setattr(pydantic_config, new_param, converted_value)
    except Exception as e:
        logger.error(
            f"Tried setting value for '{new_param}' with value from deprecated '{dep_param}'"
        )
        raise e
def _do_warning_check(self):
    """Log configuration warnings and neutralize gradient clipping in FP32.

    Two checks are performed:

    * If a vocabulary size is configured and it is not a multiple of
      ``TENSOR_CORE_ALIGN_SIZE``, a warning is logged.
    * If the optimizer parameters contain a positive ``MAX_GRAD_NORM``:
      in FP16 mode (``fp16_enabled`` or ``zero_enabled``) rank 0 logs that the
      value is passed on to the FP16 wrapper; otherwise rank 0 logs that the
      value is not permitted and the entry is reset to ``0.0``.
    """
    fp16_enabled = self.fp16_enabled or self.zero_enabled

    vocabulary_size = self._param_dict.get(VOCABULARY_SIZE,
                                           VOCABULARY_SIZE_DEFAULT)
    if vocabulary_size and vocabulary_size % TENSOR_CORE_ALIGN_SIZE != 0:
        # Message previously read "may import tensor core utilization".
        logger.warning(
            "DeepSpeedConfig: vocabulary size {} is not aligned to {}, may impact tensor core utilization."
            .format(vocabulary_size, TENSOR_CORE_ALIGN_SIZE))

    optimizer_params = self.optimizer_params
    if optimizer_params is None or MAX_GRAD_NORM not in optimizer_params:
        return
    if not optimizer_params[MAX_GRAD_NORM] > 0:
        return

    if fp16_enabled:
        if self.global_rank == 0:
            logger.warning(
                'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
                .format(MAX_GRAD_NORM, optimizer_params[MAX_GRAD_NORM]))
    else:
        if self.global_rank == 0:
            logger.warning(
                'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
                .format(optimizer_params[MAX_GRAD_NORM]))
        # Reset on every rank (only the log line is rank-0 gated) so that all
        # ranks end up with the same optimizer parameters.
        optimizer_params[MAX_GRAD_NORM] = 0.0
def enable_weight_quantization(self, start_bits, target_bits,
                               quantization_period,
                               weight_quantization_enabled_in_forward,
                               quantization_type, num_groups):
    """Record weight-quantization settings and pick a quantizer by bit width.

    The bit widths and period are attached to ``self.weight``. When
    quantization in the forward pass is requested, a warning is logged and
    the quantizer is chosen from the target bit width:

    * 3 bits or more: symmetric or asymmetric, per ``quantization_type``;
    * 2 bits: ternary (symmetric only);
    * 1 bit: binary (symmetric only).

    Any other target width leaves ``self.weight_quantizer`` untouched.

    Args:
        start_bits: stored as ``self.weight.start_bits``.
        target_bits: stored as ``self.weight.target_bits``.
        quantization_period: stored as ``self.weight.q_period``.
        weight_quantization_enabled_in_forward: whether to quantize in the
            forward pass.
        quantization_type: ``'symmetric'`` or another value meaning
            asymmetric; must be ``'symmetric'`` for 1 and 2 bits.
        num_groups: stored as ``self.weight_quantize_num_groups``.
    """
    weight = self.weight
    weight.start_bits = start_bits
    weight.target_bits = target_bits
    weight.q_period = quantization_period
    self.weight_quantization_enabled_in_forward = weight_quantization_enabled_in_forward

    # NOTE(review): the quantizer and group count are configured only when
    # forward-pass quantization is enabled - confirm against forward().
    if not self.weight_quantization_enabled_in_forward:
        return

    logger.warning(
        "************ A lot of MoQ features are not supported in quantize_weight_in_forward mode, please consider to use DS-FP16 optimizer************"
    )

    bits = weight.target_bits
    is_symmetric = quantization_type == 'symmetric'
    if bits >= 3:
        self.weight_quantizer = SymQuantizer.apply if is_symmetric else AsymQuantizer.apply
    elif bits == 2:
        assert is_symmetric, 'Only symmetric quantization is supported for ternary weight quantization'
        self.weight_quantizer = TernaryQuantizer.apply
    elif bits == 1:
        assert is_symmetric, 'Only symmetric quantization is supported for binary weight quantization'
        self.weight_quantizer = BinaryQuantizer.apply

    self.weight_quantize_num_groups = num_groups
def load(self, path: str, map_location=None):
    """Load a checkpoint partition, either from disk or through Nebula.

    When Nebula loading is disabled and this is the first tag seen (or the
    same tag as before), the file at ``path`` is read directly with
    ``torch.load``. Otherwise the partition named after the basename of
    ``path`` is fetched from the Nebula checkpoint for the tag derived from
    ``path``, or from the latest checkpoint when no tag can be derived.

    Args:
        path: checkpoint file path; also used to derive the tag and the
            partition name.
        map_location: forwarded to the underlying load call.

    Returns:
        The loaded partition, or ``None`` when no latest valid Nebula
        checkpoint could be found.
    """
    tag = _get_tag_from_path(path)
    is_first_load = self.tag_flag is None or self.tag_flag == tag

    if not self.enable_nebula_load and is_first_load:
        self.tag_flag = tag
        logger.info(
            f"[Nebula] Disable nebula load. Loading checkpoint from {path}..."
        )
        partition = torch.load(path, map_location=map_location)
        logger.info(
            f"[Nebula] Disable nebula load. Loaded checkpoint from {path}..."
        )
        return partition

    partition_name = os.path.basename(path)
    logger.info(
        f"[Nebula] Loading {path} under tag{tag} from {self.nebula_load_path}..."
    )

    if tag is not None:
        # NOTE(review): the result is used unchecked here; confirm that
        # get_checkpoint cannot return None for an unknown tag.
        checkpoint = torch_nebula.get_checkpoint(
            tag=tag, persist_path=self.nebula_load_path)
    else:
        checkpoint = torch_nebula.get_latest_checkpoint(
            persist_path=self.nebula_load_path)
        # A missing checkpoint or one with an empty tag counts as "not found".
        if checkpoint is None or checkpoint.tag == '':
            logger.warning(
                "Unable to find latest valid checkpoint from Nebula!")
            return None

    partition = checkpoint.load(partition_name, map_location=map_location)
    logger.info(
        f"[Nebula] Loaded {path} under tag{tag} from {self.nebula_load_path}."
    )
    return partition
def __init__(self, config):
    """Validate the curriculum config and build the scheduler state.

    ``config`` must contain ``curriculum_type``, ``min_difficulty``,
    ``max_difficulty`` and ``schedule_type``. The current difficulty starts at
    ``min_difficulty``. ``schedule_type`` selects how ``schedule_config`` is
    validated and stored in ``self.state['schedule']``:

    * ``fixed_discrete``: ``schedule_config`` holds a list ``difficulty`` and
      a list ``max_step`` with exactly one element fewer (the last difficulty
      is used for all following steps), e.g.
      ``{"difficulty": [1, 2, 3], "max_step": [5, 10]}``. The stored schedule
      maps each difficulty to ``[max step for it, next difficulty]``.
    * ``fixed_root``: ``schedule_config`` holds ``total_curriculum_step`` (how
      many steps it takes to go from min to max difficulty),
      ``difficulty_step`` (every difficulty must be a multiple of it) and
      ``root_degree`` (2 is square root, 3 is cube root, 1 is equivalent to
      linear), e.g. ``{"total_curriculum_step": 30000, "difficulty_step": 8,
      "root_degree": 2}``. It is stored as is.
    * ``fixed_linear``: same as ``fixed_root`` without ``root_degree``.

    Args:
        config: the curriculum learning configuration mapping.

    Raises:
        AssertionError: if a required key is missing or the ``fixed_discrete``
            lists have inconsistent lengths.
        RuntimeError: if ``schedule_type`` is not one of the supported values.
    """
    super().__init__()
    self.state = {}

    for required_key in ("curriculum_type", "min_difficulty",
                         "max_difficulty", "schedule_type"):
        assert required_key in config, f"Curriculum learning requires the config '{required_key}'"

    schedule_type = config['schedule_type']
    self.state['min_difficulty'] = config['min_difficulty']
    self.state['max_difficulty'] = config['max_difficulty']
    self.state['current_difficulty'] = config['min_difficulty']
    self.state['schedule_type'] = schedule_type

    if schedule_type == 'fixed_discrete':
        schedule_config = config['schedule_config']
        for required_key in ("difficulty", "max_step"):
            assert required_key in schedule_config, f"Curriculum learning with fixed_discrete schedule requires the schedule_config '{required_key}'"

        difficulties = schedule_config['difficulty']
        max_steps = schedule_config['max_step']
        assert len(max_steps) > 0
        assert len(difficulties) > 0
        assert len(difficulties) == len(max_steps) + 1

        # difficulty -> [max step for this difficulty, next difficulty]
        self.state['schedule'] = {
            difficulties[idx]: [step_limit, difficulties[idx + 1]]
            for idx, step_limit in enumerate(max_steps)
        }
    elif schedule_type == 'fixed_root':
        schedule_config = config['schedule_config']
        for required_key in ("total_curriculum_step", "difficulty_step",
                             "root_degree"):
            assert required_key in schedule_config, f"Curriculum learning with fixed_root schedule requires the schedule_config '{required_key}'"

        # Misaligned steps are only reported; the schedule is still accepted.
        if schedule_config['difficulty_step'] % 8 != 0:
            logger.warning(
                'The difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your hardware.'
            )
        self.state['schedule'] = schedule_config
    elif schedule_type == 'fixed_linear':
        schedule_config = config['schedule_config']
        for required_key in ("total_curriculum_step", "difficulty_step"):
            assert required_key in schedule_config, f"Curriculum learning with fixed_linear schedule requires the schedule_config '{required_key}'"

        # Misaligned steps are only reported; the schedule is still accepted.
        if schedule_config['difficulty_step'] % 8 != 0:
            logger.warning(
                'The difficulty_step for curriculum learning has to be multiple of 8 (for FP16 data) or 16 (for INT8 data) to enable NVIDIA Tensor Core acceleration. Disregard this warning if this is unrelated to your hardware.'
            )
        self.state['schedule'] = schedule_config
    else:
        raise RuntimeError('Unsupported curriculum schedule type')
def __init__(self,
             model_params,
             lr=1e-3,
             bias_correction=True,
             betas=(0.9, 0.999),
             eps=1e-8,
             weight_decay=0,
             amsgrad=False,
             adamw_mode=True,
             fp32_optimizer_states=True):
    """Fast vectorized CPU implementation of two Adam variants.

    * Adam: A Method for Stochastic Optimization
      (https://arxiv.org/abs/1412.6980)
    * AdamW: Fixing Weight Decay Regularization in Adam
      (https://arxiv.org/abs/1711.05101)

    DeepSpeed CPU Adam(W) provides between 5x to 7x speedup over
    torch.optim.adam(W). To use it, the model's master parameters (in FP32)
    have to reside in CPU memory.

    For heterogeneous CPU/GPU training DeepSpeed offers ZeRO-Offload, which
    offloads the optimizer states into CPU memory with minimal impact on
    training throughput; DeepSpeedCPUAdam keeps the optimizer latency on the
    CPU low. See the ZeRO-Offload tutorial
    (https://www.deepspeed.ai/tutorials/zero-offload/) for how to enable it.

    The step function can either (1) update the optimizer states only, or
    (2) update the states and copy the parameters back to GPU at the same
    time. The second option has been seen to give 30% higher throughput than
    doing the copy separately.

    .. note::
        We recommend using our `config
        <https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`_
        to allow :meth:`deepspeed.initialize` to build this optimizer for you.

    Arguments:
        model_params (iterable): iterable of parameters to optimize or dicts
            defining parameter groups.
        lr (float, optional): learning rate. (default: 1e-3)
        bias_correction (bool, optional): stored in the parameter group
            defaults. (default: True)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square.
            (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability. (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty)
            (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant from
            the paper "On the Convergence of Adam and Beyond"
            (default: False). NOT SUPPORTED in DeepSpeed CPUAdam!
        adamw_mode: select between Adam and AdamW implementations
            (default: AdamW)
        fp32_optimizer_states: creates momentum and variance in full
            precision regardless of the precision of the parameters
            (default: True)
    """
    default_args = {
        'lr': lr,
        'betas': betas,
        'eps': eps,
        'weight_decay': weight_decay,
        'bias_correction': bias_correction,
        'amsgrad': amsgrad,
    }
    super(DeepSpeedCPUAdam, self).__init__(model_params, default_args)

    self.cpu_vendor = get_cpu_info()["vendor_id_raw"].lower()
    if "amd" in self.cpu_vendor:
        # Warn at most once, as soon as the first half-precision parameter is
        # encountered.
        has_half_param = any(p.dtype == torch.half
                             for group in self.param_groups
                             for p in group['params'])
        if has_half_param:
            logger.warning(
                "FP16 params for CPUAdam may not work on AMD CPUs")

    # Each instance takes the next id from the class-wide counter.
    self.opt_id = DeepSpeedCPUAdam.optimizer_id
    DeepSpeedCPUAdam.optimizer_id += 1

    self.adam_w_mode = adamw_mode
    self.fp32_optimizer_states = fp32_optimizer_states

    self.ds_opt_adam = CPUAdamBuilder().load()
    self.ds_opt_adam.create_adam(self.opt_id,
                                 lr,
                                 betas[0],
                                 betas[1],
                                 eps,
                                 weight_decay,
                                 adamw_mode,
                                 should_log_le("info"))