Example #1
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import MultiStepLR
# RAdamW, AdaBelief, Apollo and AdaHessian are assumed to come from the
# project's own `optim` package; they are not part of torch.optim.
from optim import RAdamW, AdaBelief, Apollo, AdaHessian


def get_optimizer(opt, learning_rate, parameters, lr_decay, decay_rate,
                  milestone, warmup_updates, init_lr, rebound):
    # Build the requested optimizer; weight decay is disabled for all of them.
    if opt == 'sgd':
        optimizer = SGD(parameters,
                        lr=learning_rate,
                        momentum=0.9,
                        weight_decay=0.,
                        nesterov=True)
    elif opt == 'radam':
        optimizer = RAdamW(parameters,
                           lr=learning_rate,
                           betas=(0.9, 0.999),
                           weight_decay=0.)
    elif opt == 'adam':
        optimizer = Adam(parameters,
                         lr=learning_rate,
                         betas=(0.9, 0.999),
                         weight_decay=0.)
    elif opt == 'adabelief':
        optimizer = AdaBelief(parameters,
                              lr=learning_rate,
                              betas=(0.9, 0.999),
                              eps=1e-12,
                              weight_decay=0.)
    elif opt == 'apollo':
        optimizer = Apollo(parameters,
                           lr=learning_rate,
                           beta=0.9,
                           eps=1e-4,
                           rebound=rebound,
                           warmup=warmup_updates,
                           init_lr=init_lr,
                           weight_decay=0.)
    elif opt == 'adahessian':
        optimizer = AdaHessian(parameters,
                               lr=learning_rate,
                               betas=(0.9, 0.999),
                               eps=1e-4,
                               warmup=warmup_updates,
                               init_lr=init_lr,
                               weight_decay=0.)
    else:
        raise ValueError('unknown optimizer: {}'.format(opt))

    # Summarize the hyper-parameters for logging and attach a milestone-based
    # learning-rate schedule.
    opt_param = 'lr decay={} {}, decay rate={:.3f}'.format(
        lr_decay, milestone, decay_rate)
    scheduler = MultiStepLR(optimizer, milestones=milestone, gamma=decay_rate)

    if opt == 'apollo':
        opt_param += ', rebound={}'.format(rebound)
    if opt in ['apollo', 'adahessian']:
        opt_param += ', warmup={}, init_lr={:.1e}'.format(
            warmup_updates, init_lr)
    return optimizer, scheduler, opt_param
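
A minimal usage sketch for Example #1, assuming a small torch.nn model; the learning rate, milestone list, and epoch count below are illustrative placeholders, not values taken from the source:

import torch.nn as nn

model = nn.Linear(10, 2)  # placeholder model
optimizer, scheduler, opt_param = get_optimizer(
    'sgd', 0.1, model.parameters(),
    lr_decay='milestone', decay_rate=0.1, milestone=[30, 60],
    warmup_updates=0, init_lr=0.0, rebound=None)
print(opt_param)  # lr decay=milestone [30, 60], decay rate=0.100
for epoch in range(90):
    ...               # run the training steps for this epoch
    scheduler.step()  # decay the learning rate at the chosen milestones
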
Example #2
from torch.optim import SGD, AdamW
from torch.optim.lr_scheduler import ExponentialLR, MultiStepLR, CosineAnnealingLR
# RAdamW, AdaBelief, Apollo and AdaHessian are assumed to come from the
# project's own `optim` package; they are not part of torch.optim.
from optim import RAdamW, AdaBelief, Apollo, AdaHessian


def get_optimizer(opt, learning_rate, parameters, hyper1, hyper2, eps, rebound,
                  lr_decay, decay_rate, milestone, weight_decay, weight_decay_type,
                  warmup_updates, init_lr, last_lr, num_epochs, world_size):
    # Build the requested optimizer. From here on, `opt` is reused to
    # accumulate a human-readable description of the chosen hyper-parameters.
    if opt == 'sgd':
        optimizer = SGD(parameters, lr=learning_rate, momentum=hyper1, weight_decay=weight_decay, nesterov=True)
        opt = 'momentum=%.1f, ' % (hyper1)
        weight_decay_type = 'L2'
    elif opt == 'radamw':
        optimizer = RAdamW(parameters, lr=learning_rate, betas=(hyper1, hyper2), eps=eps, weight_decay=weight_decay)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
        weight_decay_type = 'decoupled'
    elif opt == 'adamw':
        optimizer = AdamW(parameters, lr=learning_rate, betas=(hyper1, hyper2), eps=eps, weight_decay=weight_decay)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
        weight_decay_type = 'decoupled'
    elif opt == 'adabelief':
        optimizer = AdaBelief(parameters, lr=learning_rate, betas=(hyper1, hyper2), eps=eps,
                              weight_decay=weight_decay, weight_decay_type=weight_decay_type)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
    elif opt == 'apollo':
        optimizer = Apollo(parameters, lr=learning_rate, beta=hyper1, eps=eps, rebound=rebound,
                           warmup=warmup_updates, init_lr=init_lr, weight_decay=weight_decay,
                           weight_decay_type=weight_decay_type)
        opt = 'beta=%.1f, eps=%.1e, rebound=%s, ' % (hyper1, eps, rebound)
    elif opt == 'adahessian':
        optimizer = AdaHessian(parameters, lr=learning_rate, betas=(hyper1, hyper2), eps=eps,
                               warmup=warmup_updates, init_lr=init_lr, weight_decay=weight_decay, num_threads=world_size)
        opt = 'betas=(%.1f, %.3f), eps=%.1e, ' % (hyper1, hyper2, eps)
        weight_decay_type = 'decoupled'
    else:
        raise ValueError('unknown optimizer: {}'.format(opt))

    # Attach the requested learning-rate schedule and record it in the description.
    if lr_decay == 'exp':
        opt = opt + 'lr decay={}, decay rate={:.3f}, '.format(lr_decay, decay_rate)
        scheduler = ExponentialLR(optimizer, decay_rate)
    elif lr_decay == 'milestone':
        opt = opt + 'lr decay={} {}, decay rate={:.3f}, '.format(lr_decay, milestone, decay_rate)
        scheduler = MultiStepLR(optimizer, milestones=milestone, gamma=decay_rate)
    elif lr_decay == 'cosine':
        opt = opt + 'lr decay={}, lr_min={}, '.format(lr_decay, last_lr)
        scheduler = CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=last_lr)
    else:
        raise ValueError('unknown lr decay: {}'.format(lr_decay))

    opt += 'warmup={}, init_lr={:.1e}, wd={:.1e} ({})'.format(warmup_updates, init_lr, weight_decay, weight_decay_type)
    return optimizer, scheduler, opt
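
A minimal usage sketch for Example #2 exercising the 'adamw' branch with cosine decay; all hyper-parameter values here are illustrative placeholders, not values taken from the source:

import torch.nn as nn

model = nn.Linear(10, 2)  # placeholder model
optimizer, scheduler, description = get_optimizer(
    'adamw', 1e-3, model.parameters(),
    hyper1=0.9, hyper2=0.999, eps=1e-8, rebound=None,
    lr_decay='cosine', decay_rate=0.1, milestone=None,
    weight_decay=1e-2, weight_decay_type=None,
    warmup_updates=0, init_lr=0.0, last_lr=1e-5,
    num_epochs=100, world_size=1)
print(description)  # betas, eps, lr decay and weight-decay summary string
for epoch in range(100):
    ...               # run the training steps for this epoch
    scheduler.step()  # follow the cosine annealing schedule once per epoch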