from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import MultiStepLR
# RAdamW, AdaBelief, Apollo and AdaHessian are project-local optimizer
# implementations; the import path below is an assumption and may need
# adjusting to wherever they live in this repository.
from optim import RAdamW, AdaBelief, Apollo, AdaHessian


def get_optimizer(opt, learning_rate, parameters, lr_decay, decay_rate, milestone,
                  warmup_updates, init_lr, rebound):
    """Build an optimizer and a MultiStepLR scheduler for the given name.

    Returns (optimizer, scheduler, opt_param), where opt_param is a
    human-readable summary of the configuration for logging.
    """
    if opt == 'sgd':
        optimizer = SGD(parameters, lr=learning_rate, momentum=0.9,
                        weight_decay=0., nesterov=True)
    elif opt == 'radam':
        optimizer = RAdamW(parameters, lr=learning_rate, betas=(0.9, 0.999),
                           weight_decay=0.)
    elif opt == 'adam':
        optimizer = Adam(parameters, lr=learning_rate, betas=(0.9, 0.999),
                         weight_decay=0.)
    elif opt == 'adabelief':
        optimizer = AdaBelief(parameters, lr=learning_rate, betas=(0.9, 0.999),
                              eps=1e-12, weight_decay=0.)
    elif opt == 'apollo':
        optimizer = Apollo(parameters, lr=learning_rate, beta=0.9, eps=1e-4,
                           rebound=rebound, warmup=warmup_updates,
                           init_lr=init_lr, weight_decay=0.)
    elif opt == 'adahessian':
        optimizer = AdaHessian(parameters, lr=learning_rate, betas=(0.9, 0.999),
                               eps=1e-4, warmup=warmup_updates,
                               init_lr=init_lr, weight_decay=0.)
    else:
        raise ValueError('unknown optimizer: {}'.format(opt))

    opt_param = 'lr decay={} {}, decay rate={:.3f}'.format(lr_decay, milestone, decay_rate)
    scheduler = MultiStepLR(optimizer, milestones=milestone, gamma=decay_rate)
    # Append the optimizer-specific settings to the logged summary.
    if opt == 'apollo':
        opt_param += ', rebound={}'.format(rebound)
    if opt in ['apollo', 'adahessian']:
        opt_param += ', warmup={}, init_lr={:.1e}'.format(warmup_updates, init_lr)

    return optimizer, scheduler, opt_param
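# Minimal usage sketch for the function above (illustrative only): the model,
# milestones and hyper-parameter values are assumptions, not taken from the
# original code. With opt='sgd' only torch built-ins are exercised.
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer, scheduler, opt_param = get_optimizer(
    'sgd', learning_rate=0.1, parameters=model.parameters(),
    lr_decay='milestone', decay_rate=0.1, milestone=[80, 120],
    warmup_updates=0, init_lr=0.0, rebound='constant')
print(opt_param)  # e.g. "lr decay=milestone [80, 120], decay rate=0.100"

# Typical loop: step the optimizer once per batch, the scheduler once per epoch.
# for epoch in range(num_epochs):
#     for batch in loader:
#         optimizer.zero_grad()
#         compute_loss(model, batch).backward()   # hypothetical helper
#         optimizer.step()
#     scheduler.step()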
from torch.optim import SGD, AdamW
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR, CosineAnnealingLR
# RAdamW, AdaBelief, Apollo and AdaHessian are project-local optimizer
# implementations; the import path below is an assumption and may need
# adjusting to wherever they live in this repository.
from optim import RAdamW, AdaBelief, Apollo, AdaHessian


def get_optimizer(opt, learning_rate, parameters, hyper1, hyper2, eps, rebound,
                  lr_decay, decay_rate, milestone, weight_decay, weight_decay_type,
                  warmup_updates, init_lr, last_lr, num_epochs, world_size):
    """Build an optimizer and learning-rate scheduler from the given settings.

    Returns (optimizer, scheduler, opt), where `opt` is reused to accumulate a
    human-readable description of the configuration for logging.
    """
    if opt == 'sgd':
        optimizer = SGD(parameters, lr=learning_rate, momentum=hyper1,
                        weight_decay=weight_decay, nesterov=True)
        opt = 'momentum=%.1f, ' % hyper1
        weight_decay_type = 'L2'
    elif opt == 'radamw':
        optimizer = RAdamW(parameters, lr=learning_rate, betas=(hyper1, hyper2),
                           eps=eps, weight_decay=weight_decay)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
        weight_decay_type = 'decoupled'
    elif opt == 'adamw':
        optimizer = AdamW(parameters, lr=learning_rate, betas=(hyper1, hyper2),
                          eps=eps, weight_decay=weight_decay)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
        weight_decay_type = 'decoupled'
    elif opt == 'adabelief':
        optimizer = AdaBelief(parameters, lr=learning_rate, betas=(hyper1, hyper2),
                              eps=eps, weight_decay=weight_decay,
                              weight_decay_type=weight_decay_type)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
    elif opt == 'apollo':
        optimizer = Apollo(parameters, lr=learning_rate, beta=hyper1, eps=eps,
                           rebound=rebound, warmup=warmup_updates, init_lr=init_lr,
                           weight_decay=weight_decay,
                           weight_decay_type=weight_decay_type)
        opt = 'beta=%.1f, eps=%.1e, rebound=%s, ' % (hyper1, eps, rebound)
    elif opt == 'adahessian':
        optimizer = AdaHessian(parameters, lr=learning_rate, betas=(hyper1, hyper2),
                               eps=eps, warmup=warmup_updates, init_lr=init_lr,
                               weight_decay=weight_decay, num_threads=world_size)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
        weight_decay_type = 'decoupled'
    else:
        raise ValueError('unknown optimizer: {}'.format(opt))

    # Learning-rate schedule: exponential, milestone-based step, or cosine annealing.
    if lr_decay == 'exp':
        opt = opt + 'lr decay={}, decay rate={:.3f}, '.format(lr_decay, decay_rate)
        scheduler = ExponentialLR(optimizer, decay_rate)
    elif lr_decay == 'milestone':
        opt = opt + 'lr decay={} {}, decay rate={:.3f}, '.format(lr_decay, milestone, decay_rate)
        scheduler = MultiStepLR(optimizer, milestones=milestone, gamma=decay_rate)
    elif lr_decay == 'cosine':
        opt = opt + 'lr decay={}, lr_min={}, '.format(lr_decay, last_lr)
        scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=last_lr)
    else:
        raise ValueError('unknown lr decay: {}'.format(lr_decay))

    opt += 'warmup={}, init_lr={:.1e}, wd={:.1e} ({})'.format(
        warmup_updates, init_lr, weight_decay, weight_decay_type)
    return optimizer, scheduler, opt
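# Usage sketch for the extended variant above (illustrative only): the model
# and every hyper-parameter value here are assumptions. With opt='adamw' and
# lr_decay='cosine' only torch built-ins are exercised, so the sketch does not
# depend on the project-local optimizers.
import torch.nn as nn

model = nn.Linear(10, 2)
optimizer, scheduler, description = get_optimizer(
    'adamw', learning_rate=1e-3, parameters=model.parameters(),
    hyper1=0.9, hyper2=0.999, eps=1e-8, rebound='constant',
    lr_decay='cosine', decay_rate=0.1, milestone=[80, 120],
    weight_decay=1e-2, weight_decay_type='decoupled',
    warmup_updates=0, init_lr=0.0, last_lr=1e-5, num_epochs=100, world_size=1)
print(description)
# -> "betas=(0.9, 0.999), eps=1.0e-08, lr decay=cosine, lr_min=1e-05,
#     warmup=0, init_lr=0.0e+00, wd=1.0e-02 (decoupled)"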