def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
    """
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer args.

    If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be
    made.

    Args:
        trainer: Trainer object
        num_training_steps: per single gpu
        resume_from_checkpoint: path to a checkpoint to resume from, applied after the normal DeepSpeedEngine load

    Returns: model, optimizer, lr_scheduler

    """
    import deepspeed
    from deepspeed.utils import logger as ds_logger

    model = trainer.model
    args = trainer.args

    hf_deepspeed_config = args.hf_deepspeed_config
    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)

    # resume config update - some bits like `model` and `num_training_steps` only become available during train
    config = hf_deepspeed_config.config

    # Optimizer + Scheduler
    # Currently supported combos:
    # 1. DS scheduler + DS optimizer: Yes
    # 2. HF scheduler + HF optimizer: Yes
    # 3. DS scheduler + HF optimizer: Yes
    # 4. HF scheduler + DS optimizer: No
    #
    # Unless Offload is enabled in which case it's:
    # 1. DS scheduler + DS optimizer: Yes
    # 2. HF scheduler + HF optimizer: Mostly*
    # 3. DS scheduler + HF optimizer: Mostly*
    # 4. HF scheduler + DS optimizer: No
    #
    # Mostly*: all non-native DeepSpeed optimizers that have both CPU and GPU implementations should work (except LAMB)

    optimizer = None
    if "optimizer" in config:
        if args.adafactor:
            raise ValueError(
                "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. "
                "Only one optimizer can be configured."
            )
    else:
        if hf_deepspeed_config.is_offload():
            logger.info(
                "Detected ZeRO Offload and non-DeepSpeed optimizers: this combination should work as long as the "
                "custom optimizer has both CPU and GPU implementations (except LAMB)"
            )

        # DS supports Adam, OneBitAdam, and LAMB optimizers and can import other optimizers from torch,
        # but the Trainer uses AdamW by default.
        trainer.create_optimizer()
        optimizer = trainer.optimizer

        # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer`
        config["zero_allow_untested_optimizer"] = True

    # DS schedulers (deepspeed/runtime/lr_schedules.py):
    #
    # DS name      | --lr_scheduler_type  | HF func                           | Notes
    # -------------| ---------------------|-----------------------------------|--------------------
    # LRRangeTest  | na                   | na                                | LRRT
    # OneCycle     | na                   | na                                | 1CLR
    # WarmupLR     | constant_with_warmup | get_constant_schedule_with_warmup | w/ warmup_min_lr=0
    # WarmupDecayLR| linear               | get_linear_schedule_with_warmup   |

    lr_scheduler = None
    if "scheduler" not in config:
        if "optimizer" in config:
            # to make this option work, we would need to init the DS optimizer first, then init the HF scheduler,
            # and then pass the HF scheduler to DS init, which is not possible at the moment
            raise ValueError("At the moment HF scheduler + DeepSpeed optimizer combination is not possible")
        else:
            trainer.create_scheduler(num_training_steps=num_training_steps)
            lr_scheduler = trainer.lr_scheduler

    # keep for quick debug:
    # from pprint import pprint; pprint(config)

    # set the DeepSpeed log level consistent with the Trainer
    ds_logger.setLevel(args.get_process_log_level())

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())

    model, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
        model_parameters=model_parameters,
        config_params=config,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
    )

    if resume_from_checkpoint is not None:
        # it's possible that the user is trying to resume from model_path, which doesn't necessarily
        # contain a deepspeed checkpoint, e.g. examples just check if the dir exists and assume it's
        # a resume from a checkpoint and not just a local pretrained weight. So we check here if the
        # path contains what looks like a deepspeed checkpoint
        import glob

        deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*"))

        if len(deepspeed_checkpoint_dirs) > 0:
            logger.info(f"Attempting to resume from {resume_from_checkpoint}")
            # this magically updates self.optimizer and self.lr_scheduler
            load_path, _ = model.load_checkpoint(
                resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True
            )
            if load_path is None:
                raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}")
        else:
            logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing")

    return model, optimizer, lr_scheduler
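
# A minimal sketch (not part of the function above) of the kind of DeepSpeed config dict it inspects.
# The `optimizer` and `scheduler` keys select DeepSpeed's own implementations, and ZeRO offload is what
# triggers the "Mostly*" caveats in the combo table. Key names follow DeepSpeed's config schema; the
# concrete values are illustrative only.
_example_ds_config = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},  # ZeRO-Offload
    },
    "optimizer": {  # presence of this key means "DS optimizer" in the combo table
        "type": "AdamW",
        "params": {"lr": 3e-5, "betas": [0.9, 0.999], "eps": 1e-8, "weight_decay": 0.0},
    },
    "scheduler": {  # presence of this key means "DS scheduler"
        "type": "WarmupLR",
        "params": {"warmup_min_lr": 0, "warmup_max_lr": 3e-5, "warmup_num_steps": 500},
    },
    "train_batch_size": 16,
}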
def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inference=False):
    """
    Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer args.

    If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.

    Args:
        trainer: Trainer object
        num_training_steps: per single gpu
        resume_from_checkpoint: path to a checkpoint to resume from, applied after the normal DeepSpeedEngine load
        inference: launch in inference mode (no optimizer and no lr scheduler)

    Returns: model, optimizer, lr_scheduler

    """
    import deepspeed
    from deepspeed.utils import logger as ds_logger

    model = trainer.model
    args = trainer.args

    # resume config update - some bits like `model` and `num_training_steps` only become available during train
    hf_deepspeed_config = args.hf_deepspeed_config
    hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps)
    config = hf_deepspeed_config.config

    # set the DeepSpeed log level consistent with the Trainer
    ds_logger.setLevel(args.get_process_log_level())

    if inference:
        # only ZeRO Stage 3 makes sense for inference
        if not hf_deepspeed_config.is_zero3():
            raise ValueError("ZeRO inference only makes sense with ZeRO Stage 3 - please adjust your config")

        # in case the training config is re-used for inference
        hf_deepspeed_config.del_config_sub_tree("optimizer")
        hf_deepspeed_config.del_config_sub_tree("lr_scheduler")
        optimizer, lr_scheduler = None, None
        model_parameters = None
    else:
        optimizer, lr_scheduler = deepspeed_optim_sched(trainer, hf_deepspeed_config, args, num_training_steps)
        model_parameters = list(filter(lambda p: p.requires_grad, model.parameters()))

    # keep for quick debug:
    # from pprint import pprint; pprint(config)

    kwargs = dict(
        model=model,
        model_parameters=model_parameters,
        config_params=config,
        optimizer=optimizer,
        lr_scheduler=lr_scheduler,
    )

    deepspeed_engine, optimizer, _, lr_scheduler = deepspeed.initialize(**kwargs)

    # stash kwargs to enable a later deepspeed_reinit
    trainer.deepspeed_initialize_kwargs = kwargs

    if resume_from_checkpoint is not None:
        # it's possible that the user is trying to resume from model_path, which doesn't necessarily
        # contain a deepspeed checkpoint, e.g. examples just check if the dir exists and assume it's
        # a resume from a checkpoint and not just a local pretrained weight. So we check here if the
        # path contains what looks like a deepspeed checkpoint
        import glob

        deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*"))

        if len(deepspeed_checkpoint_dirs) > 0:
            logger.info(f"Attempting to resume from {resume_from_checkpoint}")
            # this magically updates self.optimizer and self.lr_scheduler
            load_path, _ = deepspeed_engine.load_checkpoint(
                resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True
            )
            if load_path is None:
                raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}")
        else:
            logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing")

    return deepspeed_engine, optimizer, lr_scheduler
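
# A minimal usage sketch, assuming a `trainer` whose args carry an `hf_deepspeed_config` (i.e. a Trainer
# launched with a DeepSpeed config). The helper name and the `num_training_steps=0` value are illustrative,
# not part of the code above.
def _deepspeed_inference_init_sketch(trainer):
    # With `inference=True` the function requires a ZeRO Stage 3 config, drops the optimizer/lr_scheduler
    # sections, and returns None for both; only the wrapped DeepSpeedEngine is of interest.
    deepspeed_engine, optimizer, lr_scheduler = deepspeed_init(trainer, num_training_steps=0, inference=True)
    assert optimizer is None and lr_scheduler is None
    return deepspeed_engine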
from typing import Union, Dict, Any, NamedTuple
from copy import deepcopy
import os
import json
import tempfile
import logging

import torch

from deepspeed.utils import logger as ds_logger

# quiet DeepSpeed's logger before importing deepspeed itself
ds_logger.setLevel(logging.WARNING)
ds_logger.propagate = False

import deepspeed

from allennlp.common import Params, FromParams

JsonDict = Dict[str, Any]


class DeepspeedConfig(FromParams):
    """Holds the DeepSpeed-specific sections of a training configuration, constructed via AllenNLP `FromParams`."""

    def __init__(
        self,
        optimizer: JsonDict,
        fp16: JsonDict = {'enabled': False},
        amp: JsonDict = {'enabled': False},
        zero_optimization: Union[bool, Dict] = False,
        zero_allow_untested_optimizer: bool = True,
    ):
        self.optimizer = optimizer
        self.fp16 = fp16
        self.amp = amp
        self.zero_optimization = zero_optimization
        self.zero_allow_untested_optimizer = zero_allow_untested_optimizer
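
# A minimal construction sketch, assuming AllenNLP's standard `FromParams.from_params` behavior; the keys
# mirror the DeepspeedConfig constructor above and the values are illustrative only.
if __name__ == "__main__":
    example_params = Params(
        {
            "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
            "fp16": {"enabled": True},
            "zero_optimization": {"stage": 2},
        }
    )
    ds_config = DeepspeedConfig.from_params(example_params)
    print(ds_config.optimizer, ds_config.fp16, ds_config.zero_optimization)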