def test_zaremba():
    from baseline.tf import optz
    tf.reset_default_graph()
    sess = tf.Session()
    lr_sched = create_lr_scheduler(**ZAREMBA_LR_CONFIG)
    bl_zaremba = ZarembaDecayScheduler(**ZAREMBA_LR_CONFIG)
    lr_var = tf.placeholder(tf.float32, shape=(), name='lr')
    step_var = tf.placeholder(tf.int32, shape=(), name='step')
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.global_variables_initializer())
    lrs = []
    lrs_bl = []
    expect_lrs = []
    current_lr = INIT_LR
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lr_bl = bl_zaremba(step)
        lrs += [lr]
        lrs_bl += [lr_bl]
        if step in BOUNDS:
            b = BOUNDS.index(step)
            current_lr = ZAREMBA_DECAY_VALUES[b]
        expect_lrs += [current_lr]
    # Without the asserts these checks were silently discarded
    assert np.allclose(expect_lrs, lrs)
    assert np.allclose(expect_lrs, lrs_bl)

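# A minimal pure-Python reference for the Zaremba-style step decay that the test above
# expects: the rate starts at init_lr and drops to the decay value associated with the
# last boundary reached. This is an illustrative sketch only; the actual keys inside
# ZAREMBA_LR_CONFIG and the scheduler's internals are not shown here, so the argument
# names below are assumptions.
def zaremba_reference(step, init_lr, bounds, decay_values):
    lr = init_lr
    for boundary, value in zip(bounds, decay_values):
        if step >= boundary:
            lr = value
    return lr
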
def test_exp():
    from baseline.tf import optz
    tf.reset_default_graph()
    sess = tf.Session()
    lr_sched = create_lr_scheduler(**EXP_LR_CONFIG)
    bl_exp = ExponentialDecayScheduler(**EXP_LR_CONFIG)
    decay_rate = EXP_LR_CONFIG['decay_rate']
    lr_var = tf.placeholder(tf.float32, shape=(), name='lr')
    step_var = tf.placeholder(tf.int32, shape=(), name='step')
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.global_variables_initializer())
    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lrs += [lr]
        lr_bl = bl_exp(step)
        lrs_bl += [lr_bl]
    inv_times = [(INIT_LR * decay_rate ** (t / 100.)) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)

def __init__(self, model, global_step=0, **kwargs):
    self.global_step = global_step
    if 'lr_function' in kwargs:
        self.lr_function = kwargs['lr_function']
    else:
        if 'lr_scheduler_type' not in kwargs:
            kwargs['lr_scheduler_type'] = 'default'
        self.lr_function = create_lr_scheduler(**kwargs)
    self._init_optimizer(model, **kwargs)

def optimizer(loss_fn, **kwargs):
    global_step = tf.train.get_or_create_global_step()
    clip = kwargs.get('clip', None)
    optim = kwargs.get('optim', 'sgd')
    eta = kwargs.get('lr', kwargs.get('eta', 0.01))
    lr_scheduler = create_lr_scheduler(**kwargs)
    colocate_gradients_with_ops = bool(kwargs.get('colocate_gradients_with_ops', False))
    sgd_mom = float(kwargs.get('mom', 0.9))
    if optim == 'adadelta':
        rho = float(kwargs.get('rho', 0.95))
        eps = float(kwargs.get('epsilon', 1e-6))
        logger.info('adadelta(eta=%f, rho=%f, epsilon=%f)', eta, rho, eps)
        optz = lambda lr: tf.train.AdadeltaOptimizer(lr, rho, eps)
    elif optim == 'adam':
        beta1 = float(kwargs.get('beta1', 0.9))
        beta2 = float(kwargs.get('beta2', 0.999))
        eps = float(kwargs.get('epsilon', 1e-8))
        logger.info('adam(eta=%f, beta1=%f, beta2=%f, eps=%f)', eta, beta1, beta2, eps)
        optz = lambda lr: tf.train.AdamOptimizer(lr, beta1, beta2, eps)
    elif optim == 'adamw':
        wd = float(kwargs.get('weight_decay', 0))
        beta1 = float(kwargs.get('beta1', 0.9))
        beta2 = float(kwargs.get('beta2', 0.999))
        eps = float(kwargs.get('epsilon', 1e-8))
        logger.info('adamw(eta=%f, beta1=%f, beta2=%f, eps=%f)', eta, beta1, beta2, eps)
        optz = lambda lr: AdamWOptimizer(lr, wd, beta1, beta2, eps)
    elif optim == 'rmsprop':
        # Re-read momentum with a different default (0.0) than the SGD default of 0.9
        mom = float(kwargs.get('mom', 0.0))
        logger.info('rmsprop(eta=%f, mom=%f)', eta, mom)
        optz = lambda lr: tf.train.RMSPropOptimizer(lr, momentum=mom)
    elif sgd_mom > 0:
        logger.info('sgd-mom(eta=%f, mom=%f)', eta, sgd_mom)
        optz = lambda lr: tf.train.MomentumOptimizer(lr, sgd_mom)
    else:
        logger.info('sgd(eta=%f)', eta)
        optz = lambda lr: tf.train.GradientDescentOptimizer(lr)
    logger.info('clip gradients at %s', clip)
    return global_step, tf.contrib.layers.optimize_loss(
        loss_fn, global_step, eta, optz,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        clip_gradients=clip,
        learning_rate_decay_fn=lr_scheduler,
        increment_global_step=True)

def optimizer(loss_fn, **kwargs):
    global_step = tf.train.get_or_create_global_step()
    clip = kwargs.get('clip', None)
    optim = kwargs.get('optim', 'sgd')
    eta = kwargs.get('lr', kwargs.get('eta', 0.01))
    lr_scheduler = create_lr_scheduler(**kwargs)
    colocate_gradients_with_ops = bool(kwargs.get('colocate_gradients_with_ops', False))
    sgd_mom = float(kwargs.get('mom', 0.9))
    if optim == 'adadelta':
        optz = lambda lr: tf.train.AdadeltaOptimizer(lr, 0.95, 1e-6)
    elif optim == 'adam':
        optz = lambda lr: tf.train.AdamOptimizer(lr, kwargs.get('beta1', 0.9), kwargs.get('beta2', 0.999), kwargs.get('epsilon', 1e-8))
    elif optim == 'adamw':
        wd = float(kwargs.get('weight_decay', 0))
        optz = lambda lr: AdamWOptimizer(lr, wd, kwargs.get('beta1', 0.9), kwargs.get('beta2', 0.999), kwargs.get('epsilon', 1e-8))
    elif optim == 'rmsprop':
        optz = lambda lr: tf.train.RMSPropOptimizer(lr, momentum=float(kwargs.get('mom', 0.0)))
    elif sgd_mom > 0:
        optz = lambda lr: tf.train.MomentumOptimizer(lr, sgd_mom)
    else:
        optz = lambda lr: tf.train.GradientDescentOptimizer(lr)
    return global_step, tf.contrib.layers.optimize_loss(
        loss_fn, global_step, eta, optz,
        colocate_gradients_with_ops=colocate_gradients_with_ops,
        clip_gradients=clip,
        learning_rate_decay_fn=lr_scheduler,
        increment_global_step=True)

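# A minimal usage sketch for the optimizer() wrapper above, assuming the TF 1.x graph/session
# API used throughout these snippets and that optimizer() is importable from the same module
# (e.g. baseline.tf.optz). The toy quadratic loss and the hyperparameter values are
# illustrative assumptions, not values taken from the original code.
import tensorflow as tf

tf.reset_default_graph()
x = tf.get_variable('x', initializer=5.0)
loss = tf.square(x)

# 'sgd' with mom=0.0 falls through to plain GradientDescentOptimizer; the 'default'
# scheduler (see test_constant) leaves the learning rate fixed at lr.
global_step, train_op = optimizer(loss, optim='sgd', mom=0.0, lr=0.1, lr_scheduler_type='default')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(5):
        _, step, loss_val = sess.run([train_op, global_step, loss])
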
def test_cyclic():
    from baseline.tf import optz
    tf.reset_default_graph()
    sess = tf.Session()
    lr_sched = create_lr_scheduler(**CYCLIC_LR_CONFIG)
    bl_cyclic = CyclicLRScheduler(**CYCLIC_LR_CONFIG)
    lr_var = tf.placeholder(tf.float32, shape=(), name='lr')
    step_var = tf.placeholder(tf.int32, shape=(), name='step')
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.global_variables_initializer())
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lr_bl = bl_cyclic(step)
        assert np.isclose(lr, lr_bl)

def test_constant():
    from baseline.tf import optz
    tf.reset_default_graph()
    sess = tf.Session()
    lr_sched = create_lr_scheduler(lr=INIT_LR, lr_scheduler_type='default')
    bl_const = ConstantScheduler(lr=INIT_LR)
    lr_var = tf.placeholder(tf.float32, shape=(), name='lr')
    step_var = tf.placeholder(tf.int32, shape=(), name='step')
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.global_variables_initializer())
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        assert np.isclose(INIT_LR, lr)
        assert np.isclose(INIT_LR, bl_const(step))

def test_linear_warmup():
    from baseline.tf import optz
    tf.reset_default_graph()
    sess = tf.Session()
    lr_sched = create_lr_scheduler(**LINEAR_WARMUP_LR_CONFIG)
    warmup_steps = LINEAR_WARMUP_LR_CONFIG['warmup_steps']
    lr_var = tf.placeholder(tf.float32, shape=(), name='lr')
    step_var = tf.placeholder(tf.int32, shape=(), name='step')
    gph = lr_sched(lr_var, step_var)
    sess.run(tf.global_variables_initializer())
    lrs = []
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lrs += [lr]
    expected_lrs = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
    assert np.allclose(expected_lrs, lrs)

def test_composite_warmup():
    from baseline.tf import optz
    tf.reset_default_graph()
    warmup_steps = COMPOSITE_LR_CONFIG['warmup_steps']
    decay_rate = EXP_LR_CONFIG['decay_rate']
    with tf.Session() as sess:
        lr_sched = create_lr_scheduler(**COMPOSITE_LR_CONFIG)
        lr_var = tf.placeholder(tf.float32, name='lr')
        step_var = tf.placeholder(tf.int32, name='step')
        out = lr_sched(lr_var, step_var)
        sess.run(tf.global_variables_initializer())
        lrs = [sess.run(out, {lr_var: INIT_LR, step_var: step}) for step in range(NUM_STEPS)]
        warmup_expected = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
        exp_expected = [(INIT_LR * decay_rate ** (t / 100.)) for t in range(NUM_STEPS)]
        for step in range(NUM_STEPS):
            if step < warmup_steps:
                assert np.allclose(lrs[step], warmup_expected[step])
            else:
                assert np.allclose(lrs[step], exp_expected[step - warmup_steps])

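# A pure-Python reference for the composite behaviour asserted above: linear warmup for
# warmup_steps, then exponential decay restarted from the end of warmup (note the
# `step - warmup_steps` offset in the test). The decay period of 100 steps mirrors the
# constant hard-coded in test_exp; treat this as an illustrative sketch rather than the
# library's actual implementation.
def composite_reference(step, init_lr, warmup_steps, decay_rate, decay_steps=100.):
    if step < warmup_steps:
        return init_lr * min(1.0, step / warmup_steps)
    return init_lr * decay_rate ** ((step - warmup_steps) / decay_steps)
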
def test_composite_error():
    pytest.importorskip('torch')
    from baseline.pytorch.optz import CompositeLRSchedulerPyTorch
    with pytest.raises(AssertionError):
        _ = create_lr_scheduler(**{"lr_scheduler_type": ["exponential", "zaremba"]})

def __init__(self, model, global_step=0, **kwargs):
    self.global_step = global_step
    if 'lr_scheduler_type' not in kwargs:
        kwargs['lr_scheduler_type'] = 'default'
    self.lr_function = create_lr_scheduler(**kwargs)
    self._init_optimizer(model, **kwargs)