def configure_optimizers(self):
    optimizer = {
        "sgd": FusedSGD(self.parameters(), lr=self.learning_rate, momentum=self.args.momentum),
        "adam": FusedAdam(self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
    }[self.args.optimizer.lower()]
    if self.args.scheduler:
        scheduler = {
            "scheduler": WarmupCosineSchedule(
                optimizer=optimizer,
                warmup_steps=250,
                t_total=self.args.epochs * len(self.trainer.datamodule.train_dataloader()),
            ),
            "interval": "step",
            "frequency": 1,
        }
        return {"optimizer": optimizer, "monitor": "val_loss", "lr_scheduler": scheduler}
    return {"optimizer": optimizer, "monitor": "val_loss"}
def SGD(params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False,
        wd_after_momentum=False, materialize_master_grads=True):
    try:
        from apex.optimizers import FusedSGD
        return FusedSGD(params, lr, momentum, dampening, weight_decay, nesterov,
                        wd_after_momentum, materialize_master_grads)
    except ImportError:
        return optim.SGD(params, lr, momentum, dampening, weight_decay, nesterov)
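# A minimal usage sketch (hypothetical model and hyperparameters) for the wrapper
# above: the call site is identical whether apex's FusedSGD or the torch.optim.SGD
# fallback is selected. Note that the fused path only applies to parameters living
# on a CUDA device; the fallback runs anywhere.
import torch
import torch.nn as nn

model = nn.Linear(16, 4)
opt = SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)

for _ in range(3):
    loss = model(torch.randn(8, 16)).sum()
    opt.zero_grad()
    loss.backward()
    opt.step()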
def configure_optimizers(self):
    optimizer = {
        "sgd": FusedSGD(self.parameters(), lr=self.lr, momentum=self.args.momentum),
        "adam": FusedAdam(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adamw": torch.optim.AdamW(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "radam": RAdam(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adabelief": AdaBelief(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adabound": AdaBound(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "adamp": AdamP(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
        "novograd": FusedNovoGrad(self.parameters(), lr=self.lr, weight_decay=self.args.weight_decay),
    }[self.args.optimizer.lower()]
    if not self.args.use_scheduler:
        return optimizer
    scheduler = {
        "scheduler": NoamLR(
            optimizer=optimizer,
            warmup_epochs=self.args.warmup,
            total_epochs=self.args.epochs,
            steps_per_epoch=len(self.train_dataloader()) // self.args.gpus,
            init_lr=self.args.init_lr,
            max_lr=self.args.lr,
            final_lr=self.args.final_lr,
        ),
        "interval": "step",
        "frequency": 1,
    }
    return {"optimizer": optimizer, "lr_scheduler": scheduler}
def get_optimizer_for_params(cfg_opt, params):
    r"""Return the optimizer object.

    Args:
        cfg_opt (obj): Config for the specific optimization module (gen/dis).
        params (obj): Parameters to be optimized.

    Returns:
        (obj): Optimizer
    """
    # We will use fused optimizers by default.
    fused_opt = cfg_opt.fused_opt
    if cfg_opt.type == 'adam':
        if fused_opt:
            opt = FusedAdam(params,
                            lr=cfg_opt.lr, eps=cfg_opt.eps,
                            betas=(cfg_opt.adam_beta1, cfg_opt.adam_beta2))
        else:
            opt = Adam(params,
                       lr=cfg_opt.lr, eps=cfg_opt.eps,
                       betas=(cfg_opt.adam_beta1, cfg_opt.adam_beta2))
    elif cfg_opt.type == 'madam':
        g_bound = getattr(cfg_opt, 'g_bound', None)
        opt = Madam(params, lr=cfg_opt.lr, scale=cfg_opt.scale, g_bound=g_bound)
    elif cfg_opt.type == 'fromage':
        opt = Fromage(params, lr=cfg_opt.lr)
    elif cfg_opt.type == 'rmsprop':
        opt = RMSprop(params, lr=cfg_opt.lr,
                      eps=cfg_opt.eps, weight_decay=cfg_opt.weight_decay)
    elif cfg_opt.type == 'sgd':
        if fused_opt:
            opt = FusedSGD(params,
                           lr=cfg_opt.lr,
                           momentum=cfg_opt.momentum,
                           weight_decay=cfg_opt.weight_decay)
        else:
            opt = SGD(params,
                      lr=cfg_opt.lr,
                      momentum=cfg_opt.momentum,
                      weight_decay=cfg_opt.weight_decay)
    else:
        raise NotImplementedError(
            'Optimizer {} is not yet implemented.'.format(cfg_opt.type))
    return opt
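# Hypothetical config sketch for get_optimizer_for_params above. The real project
# builds cfg_opt from its own configuration system; SimpleNamespace is used here
# only to illustrate the attributes the 'adam' branch reads.
from types import SimpleNamespace

import torch.nn as nn

model = nn.Linear(8, 2)  # placeholder network
cfg_opt = SimpleNamespace(
    type='adam',        # one of: 'adam', 'madam', 'fromage', 'rmsprop', 'sgd'
    fused_opt=False,    # True requires apex (FusedAdam / FusedSGD)
    lr=1e-4,
    eps=1e-8,
    adam_beta1=0.9,
    adam_beta2=0.999,
)
opt = get_optimizer_for_params(cfg_opt, model.parameters())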
def get_optimizer(optimizer_name: str, parameters, learning_rate: float, weight_decay=0.0, **kwargs):
    if optimizer_name.lower() == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "adam":
        return Adam(parameters, learning_rate, weight_decay=weight_decay, eps=1e-5, **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay, eps=1e-5, **kwargs)

    if optimizer_name.lower() == "radam":
        return RAdam(parameters, learning_rate, weight_decay=weight_decay, eps=1e-5, **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "ranger":
        return Ranger(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    # if optimizer_name.lower() == "qhadamw":
    #     return QHAdamW(parameters, learning_rate, weight_decay=weight_decay,
    #                    **kwargs)
    #
    if optimizer_name.lower() == "lamb":
        return Lamb(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB
        return FusedLAMB(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam
        return FusedAdam(parameters, learning_rate, eps=1e-5, weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD
        return FusedSGD(parameters, learning_rate, momentum=0.9, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=1e-5, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "novograd":
        return Novograd(parameters, learning_rate, eps=1e-5, weight_decay=weight_decay, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
def configure_optimizers(self):
    optimizer = {
        "sgd": FusedSGD(self.parameters(), lr=self.learning_rate, momentum=self.args.momentum),
        "adam": FusedAdam(self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
        "radam": RAdam(self.parameters(), lr=self.learning_rate, weight_decay=self.args.weight_decay),
    }[self.args.optimizer.lower()]
    scheduler = {
        "none": None,
        "multistep": torch.optim.lr_scheduler.MultiStepLR(optimizer, self.args.steps, gamma=self.args.factor),
        "cosine": torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, self.args.max_epochs),
        "plateau": torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, factor=self.args.factor, patience=self.args.lr_patience),
    }[self.args.scheduler.lower()]
    opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
    if scheduler is not None:
        opt_dict.update({"lr_scheduler": scheduler})
    return opt_dict
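# Hypothetical hyperparameter namespace showing the attributes this
# configure_optimizers implementation reads; the values are illustrative only.
# When "plateau" is selected, the "monitor": "val_loss" entry is what tells the
# Lightning trainer which logged metric ReduceLROnPlateau should track (the exact
# placement of that key depends on the Lightning version this snippet targets).
from types import SimpleNamespace

args = SimpleNamespace(
    optimizer="adam",      # "sgd", "adam", or "radam"
    momentum=0.9,          # used by the SGD branch
    weight_decay=1e-4,     # used by the Adam/RAdam branches
    scheduler="plateau",   # "none", "multistep", "cosine", or "plateau"
    steps=[30, 60],        # MultiStepLR milestones
    factor=0.1,            # MultiStepLR gamma / ReduceLROnPlateau factor
    max_epochs=90,         # CosineAnnealingLR T_max
    lr_patience=5,         # ReduceLROnPlateau patience
)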
def create_optimizer(optimizer_config, model, master_params=None):
    """Creates optimizer and schedule from configuration

    Parameters
    ----------
    optimizer_config : dict
        Dictionary containing the configuration options for the optimizer.
    model : Model
        The network model.

    Returns
    -------
    optimizer : Optimizer
        The optimizer.
    scheduler : LRScheduler
        The learning rate scheduler.
    """
    if optimizer_config["classifier_lr"] != -1:
        # Separate classifier parameters from all others
        net_params = []
        classifier_params = []
        for k, v in model.named_parameters():
            if not v.requires_grad:
                continue
            if k.find("encoder") != -1:
                net_params.append(v)
            else:
                classifier_params.append(v)
        params = [
            {"params": net_params},
            {"params": classifier_params, "lr": optimizer_config["classifier_lr"]},
        ]
    else:
        if master_params:
            params = master_params
        else:
            params = model.parameters()

    if optimizer_config["type"] == "SGD":
        optimizer = optim.SGD(params,
                              lr=optimizer_config["learning_rate"],
                              momentum=optimizer_config["momentum"],
                              weight_decay=optimizer_config["weight_decay"],
                              nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "FusedSGD":
        optimizer = FusedSGD(params,
                             lr=optimizer_config["learning_rate"],
                             momentum=optimizer_config["momentum"],
                             weight_decay=optimizer_config["weight_decay"],
                             nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "Adam":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "FusedAdam":
        optimizer = FusedAdam(params,
                              lr=optimizer_config["learning_rate"],
                              weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "AdamW":
        optimizer = optim.AdamW(params,
                                lr=optimizer_config["learning_rate"],
                                weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "RmsProp":
        optimizer = optim.RMSprop(params,
                                  lr=optimizer_config["learning_rate"],
                                  weight_decay=optimizer_config["weight_decay"])
    else:
        raise KeyError("unrecognized optimizer {}".format(optimizer_config["type"]))

    if optimizer_config["schedule"]["type"] == "step":
        scheduler = LRStepScheduler(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "multistep":
        scheduler = MultiStepLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "exponential":
        scheduler = ExponentialLRScheduler(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "poly":
        scheduler = PolyLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "constant":
        scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0)
    elif optimizer_config["schedule"]["type"] == "linear":
        def linear_lr(it):
            return it * optimizer_config["schedule"]["params"]["alpha"] + optimizer_config["schedule"]["params"]["beta"]

        scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr)
    else:
        raise KeyError("unrecognized schedule {}".format(optimizer_config["schedule"]["type"]))

    return optimizer, scheduler
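# Illustrative optimizer_config for create_optimizer above. The keys mirror what
# the function reads; the values (and the MultiStepLR params) are placeholders,
# and `model` stands in for the user's network.
import torch.nn as nn

model = nn.Linear(32, 10)
optimizer_config = {
    "type": "SGD",            # SGD, FusedSGD, Adam, FusedAdam, AdamW or RmsProp
    "learning_rate": 0.01,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "nesterov": True,
    "classifier_lr": -1,      # -1 keeps a single parameter group
    "schedule": {
        "type": "multistep",
        "params": {"milestones": [30, 60], "gamma": 0.1},
    },
}
optimizer, scheduler = create_optimizer(optimizer_config, model)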
def get_optimizer(optimizer_name: str, parameters, learning_rate: float,
                  weight_decay=1e-5, eps=1e-5, **kwargs) -> Optimizer:
    from torch.optim import SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    if optimizer_name.lower() == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9, nesterov=True, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "adam":
        return Adam(parameters, learning_rate, weight_decay=weight_decay, eps=eps, **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay, eps=eps, **kwargs)

    if optimizer_name.lower() == "radam":
        return RAdam(parameters, learning_rate, weight_decay=weight_decay, eps=eps, **kwargs)  # As Jeremy suggests

    # Optimizers from torch-optimizer
    if optimizer_name.lower() == "ranger":
        return Ranger(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "lamb":
        return Lamb(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "novograd":
        return NovoGrad(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)

    # Optimizers from Apex (Fused version is faster on GPU with tensor cores)
    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB
        return FusedLAMB(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD
        return FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam
        return FusedAdam(parameters, learning_rate, eps=eps, weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
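# Usage sketch (assumed names and values) for get_optimizer above: request the
# fused optimizer when apex is available and fall back to a stock torch.optim
# implementation otherwise.
import torch.nn as nn

model = nn.Linear(32, 10)
try:
    optimizer = get_optimizer("fused_adam", model.parameters(), learning_rate=3e-4)
except ImportError:
    optimizer = get_optimizer("adamw", model.parameters(), learning_rate=3e-4)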
def test_3models2losses2optimizers(self): model0 = MyModel(1) model1 = MyModel(2) model2 = MyModel(3) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 1.0}], momentum=0.5) optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], momentum=0.25) # Again, can't do this: reference_grads = [[]]*9 reference_grads = [[], [], [], [], [], [], [], [], []] final_params = [None, None, None, None, None, None, None, None, None] for i in range(2): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) + model1(self.x) loss1 = model2(self.x) + model1(self.x) loss0.backward() loss1.backward() reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) optimizer0.step() optimizer1.step() final_params[0] = \ [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] + \ [param.data.clone() for param in model2.parameters()] def what_got_skipped(which_iter, which_backward, which_model): if which_iter == 0: if which_backward == 0: if which_model == 0: return 1 if which_model == 1: return 2 if which_backward == 1: if which_model == 2: return 3 if which_model == 1: return 4 if which_iter == 1: if which_backward == 0: if which_model == 0: return 5 if which_model == 1: return 6 if which_backward == 1: if which_model == 2: return 7 if which_model == 1: return 8 return 0 for which_iter in (0,1): for which_backward in (0,1): if which_backward == 0: which_models = (0,1) if which_backward == 1: which_models = (2,1) for which_model in which_models: model0 = MyModel(1) model1 = MyModel(2) model2 = MyModel(3) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 1.0}], momentum=0.5) optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}], momentum=0.25) for i in range(3): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) + model1(self.x) loss1 = model2(self.x) + model1(self.x) loss0.backward() loss1.backward() if i != which_iter: reference_grads[what_got_skipped(which_iter, which_backward, which_model)].append( [param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) if i == which_iter: if which_backward == 0: # if which_model == 0: optimizer1.step() # if which_model == 1: # optimizer1.step() if which_backward == 1: # if which_model == 2: # optimizer0.step() # if which_model == 1: continue else: optimizer0.step() optimizer1.step() final_params[what_got_skipped(which_iter, which_backward, which_model)] = \ [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] + \ [param.data.clone() for param in model2.parameters()] for materialize_master_grads in (False, True): for opt_level in ("O0", "O1", "O2", "O3"): for how_to_zero in ("none", "model", "optimizer"): for use_multiple_loss_scalers in (False, True): if opt_level == "O1" or opt_level == "O2": inject_inf_iters = (-1, 0, 1) else: inject_inf_iters = (-1,) for inject_inf in inject_inf_iters: if inject_inf >= 0: inject_inf_locs = ("fp16", "fp32") which_backwards = (0, 1) else: inject_inf_locs = ("fdsa",) which_backwards = (None,) for inject_inf_loc in inject_inf_locs: for which_backward in which_backwards: if use_multiple_loss_scalers: num_losses = 2 loss_ids = [0, 1] else: num_losses = 1 loss_ids = [0, 0] if inject_inf 
>= 0: iters = 3 if which_backward == 0: which_models = (0, 1) elif which_backward == 1: which_models = (2, 1) else: iters = 2 which_models = (None,) for which_model in which_models: model0 = MyModel(1) model1 = MyModel(2) model2 = MyModel(3) models = [model0, model1, model2] optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 1.0}], momentum=0.5, materialize_master_grads=materialize_master_grads) optimizer1 = FusedSGD([{'params' : model2.parameters(), 'lr' : 0.5}], momentum=0.25, materialize_master_grads=materialize_master_grads) _amp_state.allow_incoming_model_not_fp32 = True [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize( [model0, model1, model2], [optimizer0, optimizer1], opt_level=opt_level, verbosity=0, cast_model_type=False, num_losses=num_losses) _amp_state.allow_incoming_model_not_fp32 = False _amp_state.loss_scalers[0]._loss_scale = 4.0 if use_multiple_loss_scalers: _amp_state.loss_scalers[1]._loss_scale = 16.0 unskipped = 0 for i in range(iters): if how_to_zero == "none": for model in models: for param in model.parameters(): param.grad = None elif how_to_zero == "model": for model in models: model.zero_grad() else: optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) + model1(self.x) loss1 = model2(self.x) + model1(self.x) with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 0: if which_model == 0: inj_model = model0 elif which_model == 1: inj_model = model1 else: raise RuntimeError(which_model + " invalid for loss 0") if inject_inf_loc == "fp32": inj_model.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": inj_model.weight1.grad[0] = float('inf') with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 1: if which_model == 2: inj_model = model2 elif which_model == 1: inj_model = model1 else: raise RuntimeError(which_model + " invalid for loss 1 ") if inject_inf_loc == "fp32": inj_model.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": inj_model.weight1.grad[0] = float('inf') if i != inject_inf: master_params = list(amp.master_params(optimizer0)) + \ list(amp.master_params(optimizer1)) for param, reference_grad in zip(master_params, reference_grads[what_got_skipped(inject_inf, which_backward, which_model)][unskipped]): if opt_level == "O2" and not materialize_master_grads: continue else: self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) unskipped += 1 optimizer0.step() optimizer1.step() model_params = [p for p in model0.parameters()] + \ [p for p in model1.parameters()] + \ [p for p in model2.parameters()] master_params = [p for p in amp.master_params(optimizer0)] + \ [p for p in amp.master_params(optimizer1)] # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model)) for model, master, reference in zip( model_params, master_params, final_params[what_got_skipped(inject_inf, which_backward, which_model)]): self.assertTrue(torch.allclose(model, reference)) self.assertTrue(torch.allclose(model, master.to(model.dtype))) if opt_level == "O1": _amp_state.handle._deactivate()
def create_optimizer(args, model, filter_bias_and_bn=True): opt_lower = args.opt.lower() weight_decay = args.weight_decay if 'adamw' in opt_lower or 'radam' in opt_lower: # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay # I don't believe they follow the paper or original Torch7 impl which schedules weight # decay based on the ratio of current_lr/initial_lr weight_decay /= args.lr if weight_decay and filter_bias_and_bn: print("has weight decay and filter bias") parameters = add_weight_decay(model, weight_decay) weight_decay = 0. else: print("Comes here to unfrozen params inside optim") parameters = unfrozen_params(model) if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available( ), 'APEX and CUDA required for fused optimizers' opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=True) elif opt_lower == 'momentum': optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=False) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'adamw': optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'nadam': optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'radam': optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps, momentum=args.momentum, weight_decay=weight_decay) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps, momentum=args.momentum, weight_decay=weight_decay) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedsgd': optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=True) elif opt_lower == 'fusedmomentum': print("my optimizer") optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=False) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusednovograd': optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98), weight_decay=weight_decay, eps=args.opt_eps) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
def create_optimizer(args, model, filter_bias_and_bn=True): opt_lower = args.opt.lower() weight_decay = args.weight_decay if weight_decay and filter_bias_and_bn: skip = {} if hasattr(model, 'no_weight_decay'): skip = model.no_weight_decay() parameters = add_weight_decay(model, weight_decay, skip) weight_decay = 0. else: parameters = model.parameters() if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available( ), 'APEX and CUDA required for fused optimizers' opt_args = dict(lr=args.lr, weight_decay=weight_decay) if hasattr(args, 'opt_eps') and args.opt_eps is not None: opt_args['eps'] = args.opt_eps if hasattr(args, 'opt_betas') and args.opt_betas is not None: opt_args['betas'] = args.opt_betas if hasattr(args, 'opt_args') and args.opt_args is not None: opt_args.update(args.opt_args) opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'momentum': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, **opt_args) elif opt_lower == 'adamw': optimizer = optim.AdamW(parameters, **opt_args) elif opt_lower == 'nadam': optimizer = Nadam(parameters, **opt_args) elif opt_lower == 'radam': optimizer = RAdam(parameters, **opt_args) elif opt_lower == 'adamp': # ================================ # optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) print(' ') print('Gradient centralization is enabled for AdamP optimizer.') print(' ') optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, use_gc=True, gc_conv_only=True, gc_loc=False, **opt_args) # ================================ elif opt_lower == 'sgdp': optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, **opt_args) elif opt_lower == 'adafactor': if not args.lr: opt_args['lr'] = None optimizer = Adafactor(parameters, **opt_args) elif opt_lower == 'adahessian': optimizer = Adahessian(parameters, **opt_args) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=args.momentum, **opt_args) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, **opt_args) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, **opt_args) elif opt_lower == 'fusedsgd': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'fusedmomentum': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, **opt_args) elif opt_lower == 'fusednovograd': opt_args.setdefault('betas', (0.95, 0.98)) optimizer = FusedNovoGrad(parameters, **opt_args) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
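# Hypothetical `args` namespace for the create_optimizer variant above; the
# attribute names follow what the function reads and the values are illustrative
# only. Prefixing the optimizer name with 'lookahead_' (e.g. 'lookahead_adamp')
# wraps the chosen optimizer in Lookahead.
from types import SimpleNamespace

import torch.nn as nn

model = nn.Linear(16, 4)  # placeholder model
args = SimpleNamespace(
    opt='adamp',          # selects the gradient-centralized AdamP branch
    lr=1e-3,
    weight_decay=0.05,
    momentum=0.9,
    opt_eps=1e-8,
    opt_betas=None,
    opt_args=None,
)
optimizer = create_optimizer(args, model)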
def create_optimizer_v2( model: nn.Module, optimizer_name: str = 'sgd', learning_rate: Optional[float] = None, weight_decay: float = 0., momentum: float = 0.9, filter_bias_and_bn: bool = True, **kwargs): """ Create an optimizer. TODO currently the model is passed in and all parameters are selected for optimization. For more general use an interface that allows selection of parameters to optimize and lr groups, one of: * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion * expose the parameters interface and leave it up to caller Args: model (nn.Module): model containing parameters to optimize optimizer_name: name of optimizer to create learning_rate: initial learning rate weight_decay: weight decay to apply in optimizer momentum: momentum for momentum based optimizers (others may use betas via kwargs) filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay **kwargs: extra optimizer specific kwargs to pass through Returns: Optimizer """ opt_lower = optimizer_name.lower() if weight_decay and filter_bias_and_bn: skip = {} if hasattr(model, 'no_weight_decay'): skip = model.no_weight_decay() parameters = add_weight_decay(model, weight_decay, skip) weight_decay = 0. else: parameters = model.parameters() if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' opt_args = dict(lr=learning_rate, weight_decay=weight_decay, **kwargs) opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) elif opt_lower == 'momentum': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, **opt_args) elif opt_lower == 'adabelief': optimizer = AdaBelief(parameters, rectify = False, print_change_log = False,**opt_args) elif opt_lower == 'adamw': optimizer = optim.AdamW(parameters, **opt_args) elif opt_lower == 'nadam': optimizer = Nadam(parameters, **opt_args) elif opt_lower == 'radam': optimizer = RAdam(parameters, **opt_args) elif opt_lower == 'adamp': optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) elif opt_lower == 'sgdp': optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, **opt_args) elif opt_lower == 'adafactor': if not learning_rate: opt_args['lr'] = None optimizer = Adafactor(parameters, **opt_args) elif opt_lower == 'adahessian': optimizer = Adahessian(parameters, **opt_args) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, **opt_args) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, **opt_args) elif opt_lower == 'fusedsgd': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) elif opt_lower == 'fusedmomentum': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) 
elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, **opt_args) elif opt_lower == 'fusednovograd': opt_args.setdefault('betas', (0.95, 0.98)) optimizer = FusedNovoGrad(parameters, **opt_args) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
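# Illustrative call of create_optimizer_v2 above (the model and hyperparameters
# are placeholders). With filter_bias_and_bn=True, bias and normalization
# parameters are put into a no-weight-decay group by add_weight_decay.
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.BatchNorm1d(32), nn.Linear(32, 10))
optimizer = create_optimizer_v2(
    model,
    optimizer_name='adamw',
    learning_rate=1e-3,
    weight_decay=0.05,
)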
def create_optimizer_v2(model_or_params, opt: str = 'sgd', lr: Optional[float] = None, weight_decay: float = 0., momentum: float = 0.9, filter_bias_and_bn: bool = True, layer_decay: Optional[float] = None, param_group_fn: Optional[Callable] = None, **kwargs): """ Create an optimizer. TODO currently the model is passed in and all parameters are selected for optimization. For more general use an interface that allows selection of parameters to optimize and lr groups, one of: * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion * expose the parameters interface and leave it up to caller Args: model_or_params (nn.Module): model containing parameters to optimize opt: name of optimizer to create lr: initial learning rate weight_decay: weight decay to apply in optimizer momentum: momentum for momentum based optimizers (others may use betas via kwargs) filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay **kwargs: extra optimizer specific kwargs to pass through Returns: Optimizer """ if isinstance(model_or_params, nn.Module): # a model was passed in, extract parameters and add weight decays to appropriate layers no_weight_decay = {} if hasattr(model_or_params, 'no_weight_decay'): no_weight_decay = model_or_params.no_weight_decay() if param_group_fn: parameters = param_group_fn(model_or_params) elif layer_decay is not None: parameters = param_groups_layer_decay( model_or_params, weight_decay=weight_decay, layer_decay=layer_decay, no_weight_decay_list=no_weight_decay) weight_decay = 0. elif weight_decay and filter_bias_and_bn: parameters = param_groups_weight_decay(model_or_params, weight_decay, no_weight_decay) weight_decay = 0. else: parameters = model_or_params.parameters() else: # iterable of parameters or param groups passed in parameters = model_or_params opt_lower = opt.lower() opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available( ), 'APEX and CUDA required for fused optimizers' opt_args = dict(weight_decay=weight_decay, **kwargs) if lr is not None: opt_args.setdefault('lr', lr) # basic SGD & related if opt_lower == 'sgd' or opt_lower == 'nesterov': # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) elif opt_lower == 'momentum': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) elif opt_lower == 'sgdp': optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args) # adaptive elif opt_lower == 'adam': optimizer = optim.Adam(parameters, **opt_args) elif opt_lower == 'adamw': optimizer = optim.AdamW(parameters, **opt_args) elif opt_lower == 'adamp': optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) elif opt_lower == 'nadam': try: # NOTE PyTorch >= 1.10 should have native NAdam optimizer = optim.Nadam(parameters, **opt_args) except AttributeError: optimizer = Nadam(parameters, **opt_args) elif opt_lower == 'radam': optimizer = RAdam(parameters, **opt_args) elif opt_lower == 'adamax': optimizer = optim.Adamax(parameters, **opt_args) elif opt_lower == 'adabelief': optimizer = AdaBelief(parameters, rectify=False, **opt_args) elif opt_lower == 'radabelief': optimizer = AdaBelief(parameters, rectify=True, **opt_args) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, **opt_args) elif opt_lower == 'adagrad': 
opt_args.setdefault('eps', 1e-8) optimizer = optim.Adagrad(parameters, **opt_args) elif opt_lower == 'adafactor': optimizer = Adafactor(parameters, **opt_args) elif opt_lower == 'lamb': optimizer = Lamb(parameters, **opt_args) elif opt_lower == 'lambc': optimizer = Lamb(parameters, trust_clip=True, **opt_args) elif opt_lower == 'larc': optimizer = Lars(parameters, momentum=momentum, trust_clip=True, **opt_args) elif opt_lower == 'lars': optimizer = Lars(parameters, momentum=momentum, **opt_args) elif opt_lower == 'nlarc': optimizer = Lars(parameters, momentum=momentum, trust_clip=True, nesterov=True, **opt_args) elif opt_lower == 'nlars': optimizer = Lars(parameters, momentum=momentum, nesterov=True, **opt_args) elif opt_lower == 'madgrad': optimizer = MADGRAD(parameters, momentum=momentum, **opt_args) elif opt_lower == 'madgradw': optimizer = MADGRAD(parameters, momentum=momentum, decoupled_decay=True, **opt_args) elif opt_lower == 'novograd' or opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, **opt_args) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args) # second order elif opt_lower == 'adahessian': optimizer = Adahessian(parameters, **opt_args) # NVIDIA fused optimizers, require APEX to be installed elif opt_lower == 'fusedsgd': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) elif opt_lower == 'fusedmomentum': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, **opt_args) elif opt_lower == 'fusednovograd': opt_args.setdefault('betas', (0.95, 0.98)) optimizer = FusedNovoGrad(parameters, **opt_args) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
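# Illustrative calls of the newer create_optimizer_v2 above (placeholder model
# and values). Either an nn.Module or an iterable of parameters/param groups can
# be passed, and a 'lookahead_' prefix (e.g. 'lookahead_adamw') wraps the chosen
# optimizer in Lookahead.
import torch.nn as nn

model = nn.Linear(16, 4)
opt_a = create_optimizer_v2(model, opt='lamb', lr=1e-3, weight_decay=0.02)
opt_b = create_optimizer_v2(model.parameters(), opt='sgd', lr=0.1, momentum=0.9)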
def test_2models2losses1optimizer(self): model0 = MyModel(1) model1 = MyModel(2) optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.125) reference_grads = [] for i in range(2): optimizer.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) loss0.backward() loss1.backward() reference_grads.append([param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) optimizer.step() final_params = [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] for materialize_master_grads in (False, True): for opt_level in ("O0", "O1", "O2", "O3"): for how_to_zero in ("none", "model", "optimizer"): for use_multiple_loss_scalers in (False, True): if opt_level == "O1" or opt_level == "O2": inject_inf_iters = (-1, 0, 1) else: inject_inf_iters = (-1,) for inject_inf in inject_inf_iters: if inject_inf >= 0: inject_inf_locs = ("fp16", "fp32") which_backwards = (0, 1) else: inject_inf_locs = ("fdsa",) which_backwards = (None,) for inject_inf_loc in inject_inf_locs: for which_backward in which_backwards: if use_multiple_loss_scalers: num_losses = 2 loss_ids = [0, 1] else: num_losses = 1 loss_ids = [0, 0] if inject_inf >= 0: iters = 3 else: iters = 2 model0 = MyModel(1) model1 = MyModel(2) models = [model0, model1] optimizer = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}, {'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.125, materialize_master_grads=materialize_master_grads) _amp_state.allow_incoming_model_not_fp32 = True [model0, model1], optimizer = amp.initialize( [model0, model1], optimizer, opt_level=opt_level, verbosity=0, cast_model_type=False, num_losses=num_losses) _amp_state.allow_incoming_model_not_fp32 = False _amp_state.loss_scalers[0]._loss_scale = 4.0 if use_multiple_loss_scalers: _amp_state.loss_scalers[1]._loss_scale = 16.0 unskipped = 0 for i in range(iters): if how_to_zero == "none": for model in models: for param in model.parameters(): param.grad = None elif how_to_zero == "model": for model in models: model.zero_grad() else: optimizer.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 0: if inject_inf_loc == "fp32": model0.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model0.weight1.grad[0] = float('inf') with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 1: if inject_inf_loc == "fp32": model1.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model1.weight1.grad[0] = float('inf') if i != inject_inf: master_params = amp.master_params(optimizer) for param, reference_grad in zip(master_params, reference_grads[unskipped]): if opt_level == "O2" and not materialize_master_grads: continue else: self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()), "opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) unskipped += 1 optimizer.step() model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] for model, master, reference in zip( model_params, amp.master_params(optimizer), final_params): self.assertTrue(torch.allclose(model, reference)) 
self.assertTrue(torch.allclose(model, master.to(model.dtype))) if opt_level == "O1": _amp_state.handle._deactivate()
def create_optimizer_param(args, parameters): opt_lower = args.opt.lower() if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available( ), 'APEX and CUDA required for fused optimizers' opt_args = dict(lr=args.lr, weight_decay=args.weight_decay) if hasattr(args, 'opt_eps') and args.opt_eps is not None: opt_args['eps'] = args.opt_eps if hasattr(args, 'opt_betas') and args.opt_betas is not None: opt_args['betas'] = args.opt_betas if hasattr(args, 'opt_args') and args.opt_args is not None: opt_args.update(args.opt_args) opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'momentum': opt_args.pop('eps', None) optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, **opt_args) elif opt_lower == 'adamw': optimizer = optim.AdamW(parameters, **opt_args) elif opt_lower == 'nadam': optimizer = Nadam(parameters, **opt_args) elif opt_lower == 'radam': optimizer = RAdam(parameters, **opt_args) elif opt_lower == 'adamp': optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) elif opt_lower == 'sgdp': optimizer = SGDP(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, **opt_args) elif opt_lower == 'adafactor': if not args.lr: opt_args['lr'] = None optimizer = Adafactor(parameters, **opt_args) elif opt_lower == 'adahessian': optimizer = Adahessian(parameters, **opt_args) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=args.momentum, **opt_args) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, alpha=0.9, momentum=args.momentum, **opt_args) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, **opt_args) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, **opt_args) elif opt_lower == 'fusedsgd': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) elif opt_lower == 'fusedmomentum': opt_args.pop('eps', None) optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, **opt_args) elif opt_lower == 'fusednovograd': opt_args.setdefault('betas', (0.95, 0.98)) optimizer = FusedNovoGrad(parameters, **opt_args) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
def create_optimizer(args, model, filter_bias_and_bn=True, freeze_stage=""): opt_lower = args.opt.lower() weight_decay = args.weight_decay if 'adamw' in opt_lower or 'radam' in opt_lower: # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay # I don't believe they follow the paper or original Torch7 impl which schedules weight # decay based on the ratio of current_lr/initial_lr weight_decay /= args.lr if weight_decay and filter_bias_and_bn: if freeze_stage == "stage1": stage1_train_attn(model, layer_names=['fc']) print('stage1, Freeze layer successfully') if freeze_stage == "stage2": stage1_train_attn(model, layer_names=['layer3', 'layer4', 'se', 'fc']) stage2_train_layer4(model) print('stage2, Freeze layer successfully') # 对未冻结的层进行权重衰减 parameters = add_weight_decay(model, weight_decay) weight_decay = 0. else: parameters = model.parameters() for name, param in model.named_parameters(): print(name, param.requires_grad) if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available( ), 'APEX and CUDA required for fused optimizers' opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=True) elif opt_lower == 'momentum': optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=False) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'adamw': optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'nadam': optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'radam': optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'adamp': optimizer = AdamP(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps, delta=0.1, wd_ratio=0.01, nesterov=True) elif opt_lower == 'sgdp': optimizer = SGDP(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, eps=args.opt_eps, nesterov=True) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps, momentum=args.momentum, weight_decay=weight_decay) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps, momentum=args.momentum, weight_decay=weight_decay) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedsgd': optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=True) elif opt_lower == 'fusedmomentum': optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=False) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, lr=args.lr, 
weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusednovograd': optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98), weight_decay=weight_decay, eps=args.opt_eps) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if weight_decay and filter_bias_and_bn:
        skip = {}
        if hasattr(model, 'no_weight_decay'):
            skip = model.no_weight_decay()
        parameters = add_weight_decay(model, weight_decay, skip)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None and opt_lower not in [
            'sgd', 'momentum', 'fusedmomentum', 'fusedsgd'
    ]:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas

    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
    elif opt_lower == 'fusedmomentum':
        optimizer = FusedSGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        assert False and "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
def get_optimizer( model: nn.Module, optimizer_name: str, learning_rate: float, weight_decay: float = 1e-5, no_weight_decay_on_bias: bool = False, eps: float = 1e-5, **kwargs, ) -> Optimizer: """ Construct an Optimizer for given model Args: model: Model to optimize. Only parameters that require_grad will be used optimizer_name: Name of the optimizer. Case-insensitive learning_rate: Target learning rate (regardless of the scheduler) weight_decay: Target weight decay no_weight_decay_on_bias: Whether to disable weight decay on bias parameters eps: Default epsilon for Adam-like optimizers. **kwargs: Additional parameters for optimizer Returns: """ from torch.optim import ASGD, SGD, Adam, RMSprop, AdamW from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger # Optimizer parameter groups default_pg, biases_pg = [], [] for k, v in model.named_parameters(): if v.requires_grad: if str.endswith(k, ".bias"): biases_pg.append(v) # biases else: default_pg.append(v) # all else if no_weight_decay_on_bias: parameters = default_pg else: parameters = default_pg + biases_pg optimizer: Optimizer = None if optimizer_name.lower() == "sgd": optimizer = SGD( parameters, lr=learning_rate, momentum=0.9, nesterov=True, weight_decay=weight_decay, **kwargs, ) elif optimizer_name.lower() == "asgd": optimizer = ASGD( parameters, lr=learning_rate, weight_decay=weight_decay, **kwargs, ) elif optimizer_name.lower() == "adam": optimizer = Adam( parameters, lr=learning_rate, weight_decay=weight_decay, eps=eps, **kwargs, ) elif optimizer_name.lower() == "rms": optimizer = RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs) elif optimizer_name.lower() == "adamw": optimizer = AdamW( parameters, lr=learning_rate, weight_decay=weight_decay, eps=eps, **kwargs, ) elif optimizer_name.lower() == "radam": optimizer = RAdam( parameters, lr=learning_rate, weight_decay=weight_decay, eps=eps, **kwargs, ) elif optimizer_name.lower() == "ranger": optimizer = Ranger( parameters, lr=learning_rate, eps=eps, weight_decay=weight_decay, **kwargs, ) elif optimizer_name.lower() == "lamb": optimizer = Lamb( parameters, lr=learning_rate, eps=eps, weight_decay=weight_decay, **kwargs, ) elif optimizer_name.lower() == "diffgrad": optimizer = DiffGrad( parameters, lr=learning_rate, eps=eps, weight_decay=weight_decay, **kwargs, ) elif optimizer_name.lower() == "novograd": optimizer = NovoGrad( parameters, lr=learning_rate, eps=eps, weight_decay=weight_decay, **kwargs, ) elif optimizer_name.lower() == "fused_lamb": from apex.optimizers import FusedLAMB optimizer = FusedLAMB(parameters, learning_rate, eps=eps, weight_decay=weight_decay, **kwargs) elif optimizer_name.lower() == "fused_sgd": from apex.optimizers import FusedSGD optimizer = FusedSGD(parameters, learning_rate, momentum=0.9, nesterov=True, weight_decay=weight_decay, **kwargs) elif optimizer_name.lower() == "fused_adam": from apex.optimizers import FusedAdam optimizer = FusedAdam(parameters, learning_rate, eps=eps, weight_decay=weight_decay, adam_w_mode=True, **kwargs) else: raise KeyError(f"Cannot get optimizer by name {optimizer_name}") # Currently either no_wd or per-group lr if no_weight_decay_on_bias: optimizer.add_param_group({"params": biases_pg, "weight_decay": 0}) return optimizer
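# Usage sketch (assumed model and values) for get_optimizer above: with
# no_weight_decay_on_bias=True, bias parameters end up in a second param group
# with weight_decay=0, while all other parameters keep the requested decay.
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 10))
optimizer = get_optimizer(model, "adamw", learning_rate=3e-4,
                          weight_decay=1e-2, no_weight_decay_on_bias=True)
print([len(g["params"]) for g in optimizer.param_groups])  # [weights, biases]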
def create_optimizer(args, model, filter_bias_and_bn=True, classification_layer_name=None): opt_lower = args.opt.lower() weight_decay = args.weight_decay if 'adamw' in opt_lower or 'radam' in opt_lower: # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay # I don't believe they follow the paper or original Torch7 impl which schedules weight # decay based on the ratio of current_lr/initial_lr weight_decay /= args.lr if weight_decay and filter_bias_and_bn: # batch norm and bias params if classification_layer_name is not None: parameters = set_lr_per_params(args, model, classification_layer_name, weight_decay) else: parameters = add_weight_decay(model, weight_decay) weight_decay = 0. # reset to 0 else: if classification_layer_name is not None: parameters = set_lr_per_params(args, model, classification_layer_name, weight_decay=0) else: parameters = model.parameters() if 'fused' in opt_lower: assert has_apex and torch.cuda.is_available( ), 'APEX and CUDA required for fused optimizers' opt_split = opt_lower.split('_') opt_lower = opt_split[-1] if opt_lower == 'sgd' or opt_lower == 'nesterov': optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=True) elif opt_lower == 'momentum': optimizer = optim.SGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=False) elif opt_lower == 'adam': optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'adamw': optimizer = AdamW(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'nadam': optimizer = Nadam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'radam': optimizer = RAdam(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'adamp': optimizer = AdamP(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps, delta=0.1, wd_ratio=0.01, nesterov=True) elif opt_lower == 'sgdp': optimizer = SGDP(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, eps=args.opt_eps, nesterov=True) elif opt_lower == 'adadelta': optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'rmsprop': optimizer = optim.RMSprop(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps, momentum=args.momentum, weight_decay=weight_decay) elif opt_lower == 'rmsproptf': optimizer = RMSpropTF(parameters, lr=args.lr, alpha=0.9, eps=args.opt_eps, momentum=args.momentum, weight_decay=weight_decay) elif opt_lower == 'novograd': optimizer = NovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'nvnovograd': optimizer = NvNovoGrad(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedsgd': optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=True) elif opt_lower == 'fusedmomentum': optimizer = FusedSGD(parameters, lr=args.lr, momentum=args.momentum, weight_decay=weight_decay, nesterov=False) elif opt_lower == 'fusedadam': optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=False, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedadamw': optimizer = FusedAdam(parameters, lr=args.lr, adam_w_mode=True, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower == 'fusedlamb': optimizer = FusedLAMB(parameters, lr=args.lr, weight_decay=weight_decay, eps=args.opt_eps) elif opt_lower 
== 'fusednovograd': optimizer = FusedNovoGrad(parameters, lr=args.lr, betas=(0.95, 0.98), weight_decay=weight_decay, eps=args.opt_eps) else: assert False and "Invalid optimizer" raise ValueError if len(opt_split) > 1: if opt_split[0] == 'lookahead': optimizer = Lookahead(optimizer) return optimizer
def test_2models2losses2optimizers(self): model0 = MyModel(1) model1 = MyModel(2) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], momentum=0.125) optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.25) # Don't do it like this: reference_grads = [[]]*5 # because then it creates a list of 5 references to the same "[]" and appending # to any of them effectively makes you append to all of them, which multiplies # the resulting size of reference_grads by 5x and needless to say makes the test fail. reference_grads = [[], [], [], [], []] final_params = [None, None, None, None, None] for i in range(2): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) loss0.backward() loss1.backward() reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) optimizer0.step() optimizer1.step() final_params[0] = [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] def what_got_skipped(which_iter, which_backward): if which_iter == 0 and which_backward == 0: return 1 if which_iter == 0 and which_backward == 1: return 2 if which_iter == 1 and which_backward == 0: return 3 if which_iter == 1 and which_backward == 1: return 4 return 0 for which_iter in (0,1): for which_backward in (0,1): model0 = MyModel(1) model1 = MyModel(2) optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}], momentum=0.125) optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.25) for i in range(3): optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) loss0.backward() loss1.backward() if i != which_iter: reference_grads[what_got_skipped(which_iter, which_backward)].append( [param.grad.data.clone() for param in model0.parameters()] + [param.grad.data.clone() for param in model1.parameters()]) if i == which_iter: if which_backward == 0: optimizer1.step() else: optimizer0.step() else: optimizer0.step() optimizer1.step() final_params[what_got_skipped(which_iter, which_backward)] = \ [param.data.clone() for param in model0.parameters()] + \ [param.data.clone() for param in model1.parameters()] for materialize_master_grads in (False, True): for opt_level in ("O0", "O1", "O2", "O3"): for how_to_zero in ("none", "model", "optimizer"): for use_multiple_loss_scalers in (False, True): if opt_level == "O1" or opt_level == "O2": inject_inf_iters = (-1, 0, 1) else: inject_inf_iters = (-1,) for inject_inf in inject_inf_iters: if inject_inf >= 0: inject_inf_locs = ("fp16", "fp32") which_backwards = (0, 1) else: inject_inf_locs = ("fdsa",) which_backwards = (None,) for inject_inf_loc in inject_inf_locs: for which_backward in which_backwards: if use_multiple_loss_scalers: num_losses = 2 loss_ids = [0, 1] else: num_losses = 1 loss_ids = [0, 0] if inject_inf >= 0: iters = 3 else: iters = 2 model0 = MyModel(1) model1 = MyModel(2) models = [model0, model1] optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}], momentum=0.125, materialize_master_grads=materialize_master_grads) optimizer1 = FusedSGD([{'params' : model1.parameters(), 'lr' : 0.5}], momentum=0.25, materialize_master_grads=materialize_master_grads) _amp_state.allow_incoming_model_not_fp32 = True [model0, model1], [optimizer0, optimizer1] = amp.initialize( [model0, model1], [optimizer0, optimizer1], opt_level=opt_level, verbosity=0, 
cast_model_type=False, num_losses=num_losses) _amp_state.allow_incoming_model_not_fp32 = False _amp_state.loss_scalers[0]._loss_scale = 4.0 if use_multiple_loss_scalers: _amp_state.loss_scalers[1]._loss_scale = 16.0 unskipped = 0 for i in range(iters): if how_to_zero == "none": for model in models: for param in model.parameters(): param.grad = None elif how_to_zero == "model": for model in models: model.zero_grad() else: optimizer0.zero_grad() optimizer1.zero_grad() loss0 = model0(self.x) loss1 = model1(self.x) with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 0: if inject_inf_loc == "fp32": model0.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model0.weight1.grad[0] = float('inf') with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss: scaled_loss.backward() if i == inject_inf and which_backward == 1: if inject_inf_loc == "fp32": model1.weight0.grad[0] = float('inf') elif inject_inf_loc == "fp16": model1.weight1.grad[0] = float('inf') # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers)) if i != inject_inf: master_params = list(amp.master_params(optimizer0)) + \ list(amp.master_params(optimizer1)) for param, reference_grad in zip(master_params, reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]): if opt_level == "O2" and not materialize_master_grads: continue else: self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float())) unskipped += 1 optimizer0.step() optimizer1.step() model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()] master_params = [p for p in amp.master_params(optimizer0)] + \ [p for p in amp.master_params(optimizer1)] for model, master, reference in zip( model_params, master_params, final_params[what_got_skipped(inject_inf, which_backward)]): self.assertTrue(torch.allclose(model, reference)) self.assertTrue(torch.allclose(model, master.to(model.dtype))) if opt_level == "O1": _amp_state.handle._deactivate()