def get_optimizer(model): """Set up the optimizer.""" args = get_args() # Build parameter groups (weight decay and non-decay). while isinstance(model, (torchDDP, LocalDDP, FP16_Module)): model = model.module param_groups = get_params_for_weight_decay_optimization(model) # Add model parallel attribute if it is not set. for param_group in param_groups: for param in param_group['params']: if not hasattr(param, 'model_parallel'): param.model_parallel = False # Use Adam. optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) # Wrap into fp16 optimizer. if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, dynamic_loss_args={ 'scale_window': args.loss_scale_window, 'min_scale': args.min_scale, 'delayed_shift': args.hysteresis}) return optimizer
def get_optimizer(model, neox_args):
    """Set up the optimizer."""
    if neox_args.no_load_optim:
        return None, None
    # Build parameter groups (weight decay and non-decay).
    param_groups = get_params_for_weight_decay_optimization(model, neox_args)
    print_rank_0(
        f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}'
    )

    # Add model parallel attribute if it is not set.
    for param_group in param_groups:
        for param in param_group['params']:
            if not hasattr(param, 'model_parallel'):
                param.model_parallel = False

    if neox_args.optimizer_type.lower() in ["cpu_adam", "cpu_torch_adam"]:
        # Check the type string, not the optimizer config dict.
        if neox_args.optimizer_type.lower() == "cpu_torch_adam":
            cpu_adam_optimizer = torch.optim.Adam
        else:
            from deepspeed.ops.adam import DeepSpeedCPUAdam
            cpu_adam_optimizer = DeepSpeedCPUAdam
        optimizer = cpu_adam_optimizer(param_groups,
                                       weight_decay=neox_args.weight_decay,
                                       **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "onebitadam":
        assert neox_args.deepspeed
        optimizer = None
        # onebitadam needs to be instantiated within the deepspeed engine to work :|
    elif neox_args.optimizer_type.lower() == "sm3":
        from .optimizers import SM3
        optimizer = SM3(param_groups, **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "madgrad_wd":
        from .optimizers import madgrad_wd
        optimizer = madgrad_wd(param_groups,
                               weight_decay=neox_args.weight_decay,
                               **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "adam":
        # Use Adam
        try:
            # default to apex as it's slightly faster
            from apex.optimizers import FusedAdam as Adam
        except ImportError:
            # if apex isn't installed, use deepspeed's FusedAdam
            print(
                "WARNING: APEX not installed - defaulting to deepspeed's fused adam"
            )
            from deepspeed.ops.adam import FusedAdam as Adam
        optimizer = Adam(param_groups,
                         weight_decay=neox_args.weight_decay,
                         **neox_args.optimizer["params"])
    else:
        raise ValueError(
            f"Optimizer type {neox_args.optimizer_type} not recognized")

    if neox_args.deepspeed:
        # fp16 wrapper is not required for DeepSpeed.
        return optimizer, param_groups
    else:
        raise ValueError("Must be using deepspeed to run neox")
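# For the "onebitadam" branch above, the function returns optimizer=None and defers
# construction to the DeepSpeed engine. A hedged sketch of what the corresponding
# "optimizer" section of a DeepSpeed config might look like; the exact parameter
# names and values are illustrative, so check the DeepSpeed OneBitAdam docs.
ds_optimizer_config_sketch = {
    "optimizer": {
        "type": "OneBitAdam",
        "params": {
            "lr": 1.0e-4,
            "weight_decay": 0.01,
            "freeze_step": 1000,   # vanilla Adam steps before 1-bit compression starts
            "cuda_aware": False,
        },
    }
}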
def get_optimizer(model): """Set up the optimizer.""" args = get_args() # Build parameter groups (weight decay and non-decay). while isinstance(model, (torchDDP, LocalDDP, FP16_Module)): model = model.module param_groups = get_params_for_weight_decay_optimization(model) # Add model parallel attribute if it is not set. for param_group in param_groups: for param in param_group['params']: if not hasattr(param, 'model_parallel'): param.model_parallel = False if args.cpu_optimizer: if args.cpu_torch_adam: cpu_adam_optimizer = torch.optim.Adam else: from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam optimizer = cpu_adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay) elif args.onebitadam: assert args.deepspeed optimizer = None # onebitadam needs to be instantiated within the deepspeed engine to work :| else: # Use Adam optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) if args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer, param_groups # Wrap into fp16 optimizer. if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, dynamic_loss_args={ 'scale_window': args.loss_scale_window, 'min_scale': args.min_scale, 'delayed_shift': args.hysteresis }) return optimizer, param_groups
def get_optimizer(model): """Set up the optimizer.""" args = get_args() # Build parameter groups (weight decay and non-decay). while isinstance(model, (torchDDP, FP16_Module)): model = model.module param_groups = get_params_for_weight_decay_optimization(model, args) # Add model parallel attribute if it is not set. for param_group in param_groups: for param in param_group['params']: if not hasattr(param, 'model_parallel'): param.model_parallel = False if args.cpu_optimizer: if args.cpu_torch_adam: cpu_adam_optimizer = torch.optim.Adam else: from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam optimizer = cpu_adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay) elif args.onebitadam: assert args.deepspeed optimizer = None # onebitadam needs to be instantiated within the deepspeed engine to work :| elif args.sm3: from .optimizers import SM3 optimizer = SM3( param_groups, lr=args.lr, momentum=args.momentum, beta=args.adam_beta1, eps=args.adam_eps, ) else: # Use Adam optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps, adam_w_mode=not args.no_adamw) if args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer, param_groups else: raise ValueError("Must be using deepspeed to run neox")
def get_optimizer(model, neox_args):
    """Set up the optimizer."""
    if neox_args.no_load_optim:
        return None, None
    # Build parameter groups (weight decay and non-decay).
    param_groups = get_params_for_weight_decay_optimization(model, neox_args)
    print_rank_0(
        f'Configuring Optimizer type: {neox_args.optimizer_type} with params: {neox_args.optimizer["params"]}'
    )

    # Add model parallel attribute if it is not set.
    for param_group in param_groups:
        for param in param_group["params"]:
            if not hasattr(param, "model_parallel"):
                param.model_parallel = False

    # Filter out params that don't require a grad (for soft prompt tuning, etc.)
    _param_groups = []
    for param_group in param_groups:
        trainable_params = [p for p in param_group["params"] if p.requires_grad]
        param_group["params"] = trainable_params
        _param_groups.append(param_group)
    param_groups = _param_groups

    if neox_args.optimizer_type.lower() in ["cpu_adam", "cpu_torch_adam"]:
        # Check the type string, not the optimizer config dict.
        if neox_args.optimizer_type.lower() == "cpu_torch_adam":
            cpu_adam_optimizer = torch.optim.Adam
        else:
            from deepspeed.ops.adam import DeepSpeedCPUAdam
            cpu_adam_optimizer = DeepSpeedCPUAdam
        optimizer = cpu_adam_optimizer(
            param_groups,
            weight_decay=neox_args.weight_decay,
            **neox_args.optimizer["params"],
        )
    elif neox_args.optimizer_type.lower() == "onebitadam":
        assert neox_args.deepspeed
        optimizer = None
        # onebitadam needs to be instantiated within the deepspeed engine to work :|
    elif neox_args.optimizer_type.lower() == "sm3":
        from .optimizers import SM3
        optimizer = SM3(param_groups, **neox_args.optimizer["params"])
    elif neox_args.optimizer_type.lower() == "madgrad_wd":
        from .optimizers import madgrad_wd
        optimizer = madgrad_wd(
            param_groups,
            weight_decay=neox_args.weight_decay,
            **neox_args.optimizer["params"],
        )
    elif neox_args.optimizer_type.lower() == "adam":
        # Use Adam
        if neox_args.use_bnb_optimizer:
            try:
                import bitsandbytes as bnb
                adam_optimizer = bnb.optim.Adam8bit
            except ModuleNotFoundError:
                print(
                    "Please install bitsandbytes following https://github.com/facebookresearch/bitsandbytes."
                )
                raise Exception
        else:
            try:
                # default to apex as it's slightly faster
                from apex.optimizers import FusedAdam as Adam
            except ImportError:
                # if apex isn't installed, use deepspeed's FusedAdam
                print(
                    "WARNING: APEX not installed - defaulting to deepspeed's fused adam"
                )
                from deepspeed.ops.adam import FusedAdam as Adam
            adam_optimizer = Adam
        optimizer = adam_optimizer(
            param_groups,
            weight_decay=neox_args.weight_decay,
            **neox_args.optimizer["params"],
        )
    else:
        raise ValueError(
            f"Optimizer type {neox_args.optimizer_type} not recognized")

    if neox_args.deepspeed:
        # fp16 wrapper is not required for DeepSpeed.
        return optimizer, param_groups
    else:
        raise ValueError("Must be using deepspeed to run neox")
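# A hedged sketch of how the (optimizer, param_groups) pair returned above might be
# handed to the DeepSpeed engine. Variable names and the config dict are illustrative;
# the real NeoX setup also wires in mpu, lr_scheduler, dist_init_required, etc., and
# older DeepSpeed versions take the dict via config_params rather than config.
import deepspeed


def build_engine_sketch(model, neox_args, ds_config):
    optimizer, param_groups = get_optimizer(model, neox_args)
    # For "onebitadam", optimizer is None and DeepSpeed constructs it from the
    # "optimizer" section of ds_config instead.
    model_engine, optimizer, _, lr_scheduler = deepspeed.initialize(
        model=model,
        optimizer=optimizer,
        model_parameters=param_groups,
        config=ds_config,
    )
    return model_engine, optimizer, lr_scheduler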