def get_optimizer(model, args): """Set up the optimizer.""" # Build parameter groups (weight decay and non-decay). while isinstance(model, (DDP, FP16_Module)): model = model.module param_groups = gpt2_get_params_for_weight_decay_optimization(model) # Add model parallel attribute if it is not set. for param_group in param_groups: for param in param_group['params']: if not hasattr(param, 'model_parallel'): param.model_parallel = False # Use FusedAdam. optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay) print(f'Optimizer = {optimizer.__class__.__name__}') if args.deepspeed: return optimizer, param_groups # Wrap into fp16 optimizer. if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, dynamic_loss_args={ 'scale_window': args.loss_scale_window, 'min_scale': args.min_scale, 'delayed_shift': args.hysteresis}) return optimizer, param_groups
def get_optimizer_param_groups(model): # Build parameter groups (weight decay and non-decay). while isinstance(model, (DDP, FP16_Module)): model = model.module param_groups = gpt2_get_params_for_weight_decay_optimization(model) # Add model parallel attribute if it is not set. for param_group in param_groups: for param in param_group['params']: if not hasattr(param, 'model_parallel'): param.model_parallel = False return param_groups
def get_optimizer(model, args): """Set up the optimizer.""" # Build parameter groups (weight decay and non-decay). while isinstance(model, (DDP, FP16_Module)): model = model.module param_groups = gpt2_get_params_for_weight_decay_optimization(model) # Add model parallel attribute if it is not set. for param_group in param_groups: for param in param_group['params']: if not hasattr(param, 'model_parallel'): param.model_parallel = False if args.cpu_optimizer: #Apex FusedAdam uses decoupled weight decay so use the same here if args.cpu_torch_adam: cpu_adam_optimizer = torch.optim.AdamW else: #TODO add option for decoupled weight decay in DeepCPUAdam from deepspeed.ops.adam import DeepSpeedCPUAdam cpu_adam_optimizer = DeepSpeedCPUAdam optimizer = cpu_adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay) else: # Use FusedAdam. optimizer = Adam(param_groups, lr=args.lr, weight_decay=args.weight_decay) print(f'Optimizer = {optimizer.__class__.__name__}') if args.deepspeed: # fp16 wrapper is not required for DeepSpeed. return optimizer # Wrap into fp16 optimizer. if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, dynamic_loss_args={ 'scale_window': args.loss_scale_window, 'min_scale': args.min_scale, 'delayed_shift': args.hysteresis }) return optimizer