Example 1
def test_exp():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()

    lr_sched = create_lr_scheduler(**EXP_LR_CONFIG)
    bl_exp = ExponentialDecayScheduler(**EXP_LR_CONFIG)
    decay_rate = EXP_LR_CONFIG["decay_rate"]

    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")

    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())

    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lrs += [lr]
        lr_bl = bl_exp(step)
        lrs_bl += [lr_bl]
    inv_times = [(INIT_LR * decay_rate**(t / 100.0)) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)
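The two assertions check the closed form of exponential decay, lr_t = lr_0 * decay_rate ** (t / decay_steps); the / 100.0 divisor suggests the config uses decay_steps = 100. A minimal standalone sketch of that formula with illustrative numbers (not the test module's INIT_LR or EXP_LR_CONFIG):

init_lr, decay_rate, decay_steps = 0.5, 0.5, 100

def exp_decay(step):
    # lr_t = lr_0 * decay_rate ** (t / decay_steps)
    return init_lr * decay_rate ** (step / decay_steps)

print([round(exp_decay(t), 4) for t in (0, 50, 100, 200)])
# -> [0.5, 0.3536, 0.25, 0.125]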
Example 2
def test_composite_error():
    pytest.importorskip("torch")
    from eight_mile.pytorch.optz import CompositeLRSchedulerPyTorch

    with pytest.raises(AssertionError):
        _ = create_lr_scheduler(
            **{"lr_scheduler_type": ["exponential", "zaremba"]})
Example 3
def get_lr_decay(sched_type,
                 lr,
                 steps_per_epoch,
                 n_epochs,
                 logger,
                 decay_steps=None,
                 decay_rate=None,
                 alpha=None):
    if sched_type == 'cosine':
        decay_steps = decay_steps if decay_steps else steps_per_epoch * n_epochs
        alpha = alpha if alpha else 0.
        params = {'decay_steps': decay_steps, 'alpha': alpha}
    else:
        decay_steps = decay_steps if decay_steps else steps_per_epoch
        if not decay_rate:
            if sched_type == 'exponential':
                decay_rate = 0.5
            elif sched_type == 'invtime':
                decay_rate = 1.0
        params = {'decay_steps': decay_steps, 'decay_rate': decay_rate}
    lr_decay = create_lr_scheduler(lr_scheduler_type=sched_type,
                                   lr=lr,
                                   **params)
    logger.info(
        f"Using {sched_type} decay learning rate with params {params}.")
    return lr_decay
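A hypothetical call into get_lr_decay as defined above (the hyperparameters and logger are illustrative, and create_lr_scheduler must already be importable wherever get_lr_decay lives):

import logging

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("train")

# Cosine decay over the whole run: decay_steps defaults to
# steps_per_epoch * n_epochs = 5000 and alpha defaults to 0.
cosine_decay = get_lr_decay('cosine', lr=1e-3, steps_per_epoch=500, n_epochs=10, logger=log)

# Exponential decay: decay_steps defaults to steps_per_epoch (500)
# and decay_rate defaults to 0.5, halving the learning rate each epoch.
exp_decay = get_lr_decay('exponential', lr=1e-3, steps_per_epoch=500, n_epochs=10, logger=log)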
Example 4
def test_zaremba():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()

    lr_sched = create_lr_scheduler(**ZAREMBA_LR_CONFIG)
    bl_zaremba = ZarembaDecayScheduler(**ZAREMBA_LR_CONFIG)
    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")

    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())

    lrs = []
    lrs_bl = []
    expect_lrs = []
    current_lr = INIT_LR
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lr_bl = bl_zaremba(step)
        lrs += [lr]
        lrs_bl += [lr_bl]
        if step in BOUNDS:
            b = BOUNDS.index(step)
            current_lr = ZAREMBA_DECAY_VALUES[b]
        expect_lrs += [current_lr]
    assert np.allclose(expect_lrs, lrs)
    assert np.allclose(expect_lrs, lrs_bl)
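The expected values are built the way a piecewise-constant (Zaremba-style) schedule works: the rate holds the value of the most recently crossed boundary. A standalone sketch of that logic with illustrative boundaries and values (BOUNDS and ZAREMBA_DECAY_VALUES are fixtures of the test module; values[0] plays the role of INIT_LR before the first boundary):

import bisect

boundaries = [10, 20, 30]
values = [1.0, 0.5, 0.25, 0.125]  # one more entry than boundaries

def zaremba_like(step):
    # The number of boundaries already reached picks the constant to use.
    return values[bisect.bisect_right(boundaries, step)]

print([zaremba_like(s) for s in (0, 9, 10, 25, 99)])
# -> [1.0, 1.0, 0.5, 0.25, 0.125]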
Example 5
def test_composite_warmup():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    warmup_steps = COMPOSITE_LR_CONFIG["warmup_steps"]
    decay_rate = EXP_LR_CONFIG["decay_rate"]
    with tf.compat.v1.Session() as sess:
        lr_sched = create_lr_scheduler(**COMPOSITE_LR_CONFIG)
        lr_var = tf.compat.v1.placeholder(tf.float32, name="lr")
        step_var = tf.compat.v1.placeholder(tf.int32, name="step")

        out = lr_sched(lr_var, step_var)
        sess.run(tf.compat.v1.global_variables_initializer())

        lrs = [
            sess.run(out, {
                lr_var: INIT_LR,
                step_var: step
            }) for step in range(NUM_STEPS)
        ]

        warmup_expected = [
            INIT_LR * min(1.0, step / warmup_steps)
            for step in range(NUM_STEPS)
        ]
        exp_expected = [(INIT_LR * decay_rate**(t / 100.0))
                        for t in range(NUM_STEPS)]

    for step in range(NUM_STEPS):
        if step < warmup_steps:
            assert np.allclose(lrs[step], warmup_expected[step])
        else:
            assert np.allclose(lrs[step], exp_expected[step - warmup_steps])
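The check mirrors how a composite warmup-then-decay schedule is typically stitched together: linear warmup for the first warmup_steps steps, then the wrapped scheduler evaluated as if it restarted from step 0. A minimal sketch with illustrative numbers (the real settings live in COMPOSITE_LR_CONFIG and EXP_LR_CONFIG):

init_lr, warmup_steps, decay_rate, decay_steps = 0.5, 50, 0.5, 100

def warmup(step):
    return init_lr * min(1.0, step / warmup_steps)

def exp_decay(step):
    return init_lr * decay_rate ** (step / decay_steps)

def composite(step):
    # Warmup first; afterwards the decay schedule starts counting from zero.
    return warmup(step) if step < warmup_steps else exp_decay(step - warmup_steps)

print([round(composite(s), 4) for s in (0, 25, 50, 150)])
# -> [0.0, 0.25, 0.5, 0.25]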
Example 6
def optimizer(loss_fn, **kwargs):

    global_step = tf.train.get_or_create_global_step()
    clip = kwargs.get("clip", None)
    optim = kwargs.get("optim", "sgd")
    eta = kwargs.get("lr", kwargs.get("eta", 0.01))
    lr_scheduler = create_lr_scheduler(**kwargs)
    decay_fn = None
    colocate_gradients_with_ops = bool(
        kwargs.get("colocate_gradients_with_ops", False))
    sgd_mom = float(kwargs.get("mom", 0.9))
    if optim == "adadelta":
        rho = float(kwargs.get("rho", 0.95))
        eps = float(kwargs.get("epsilon", 1e-6))
        logger.info("adadelta(eta=%f, rho=%f, epsilon=%f)", eta, rho, eps)
        optz = lambda lr: tf.train.AdadeltaOptimizer(lr, rho, eps)
    elif optim == "adam":
        beta1 = float(kwargs.get("beta1", 0.9))
        beta2 = float(kwargs.get("beta2", 0.999))
        eps = float(kwargs.get("epsilon", 1e-8))
        logger.info("adam(eta=%f beta1=%f, beta2=%f, eps=%f)", eta, beta1,
                    beta2, eps)
        optz = lambda lr: tf.train.AdamOptimizer(lr, beta1, beta2, eps)
    elif optim == "adamw":
        wd = float(kwargs.get("weight_decay", 0))
        beta1 = float(kwargs.get("beta1", 0.9))
        beta2 = float(kwargs.get("beta2", 0.999))
        eps = float(kwargs.get("epsilon", 1e-8))
        logger.info("adamw(eta=%f beta1=%f, beta2=%f, eps=%f)", eta, beta1,
                    beta2, eps)
        optz = lambda lr: AdamWOptimizer(lr, wd, beta1, beta2, eps)
    elif optim == "rmsprop":
        # Get mom again with a different default
        mom = float(kwargs.get("mom", 0.0))
        logger.info("rmsprop(eta=%f, mom=%f)", eta, mom)
        optz = lambda lr: tf.train.RMSPropOptimizer(lr, momentum=mom)
    elif sgd_mom > 0:
        logger.info("sgd-mom(eta=%f, mom=%f)", eta, sgd_mom)
        optz = lambda lr: tf.train.MomentumOptimizer(lr, sgd_mom)
    else:
        logger.info("sgd(eta=%f)", eta)
        optz = lambda lr: tf.train.GradientDescentOptimizer(lr)

    logger.info("clip gradients at %s", clip)
    return (
        global_step,
        tf.contrib.layers.optimize_loss(
            loss_fn,
            global_step,
            eta,
            optz,
            colocate_gradients_with_ops=colocate_gradients_with_ops,
            clip_gradients=clip,
            learning_rate_decay_fn=lr_scheduler,
            increment_global_step=True,
        ),
    )
Example 7
 def __init__(self, model, global_step=0, **kwargs):
     self.global_step = global_step
     if "lr_function" in kwargs:
         self.lr_function = kwargs["lr_function"]
     else:
         if "lr_scheduler_type" not in kwargs:
             kwargs["lr_scheduler_type"] = "default"
         self.lr_function = create_lr_scheduler(**kwargs)
     self._init_optimizer(model, **kwargs)
Example 8
def test_constant():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(lr=INIT_LR, lr_scheduler_type="default")
    bl_const = ConstantScheduler(lr=INIT_LR)

    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        assert np.isclose(INIT_LR, lr)
        assert np.isclose(INIT_LR, bl_const(step))
Example 9
    def __init__(self, loss, optimizer=None, **kwargs):
        self.loss = loss
        self.global_step = kwargs.get('global_step', 0)
        if "lr_function" in kwargs:
            lr_function = kwargs["lr_function"]
        else:
            if "lr_scheduler_type" not in kwargs:
                kwargs["lr_scheduler_type"] = "default"
            lr_function = create_lr_scheduler(**kwargs)
        # decay_fn = None
        # Right now this option is pointless since sparse updates don't work on GPU, so we just turn it off.
        sgd_mom = float(kwargs.get("mom", 0.9))
        self.clip = kwargs.get("clip", 100)

        if optimizer:
            self.optimizer = optimizer
        else:
            optim = kwargs.get("optim", "sgd")
            lr = kwargs.get("lr", kwargs.get("eta", 0.01))

            if optim == "adadelta":
                rho = float(kwargs.get("rho", 0.95))
                eps = float(kwargs.get("epsilon", 1e-6))
                logger.info("adadelta(eta=%f, rho=%f, epsilon=%f)", lr, rho, eps)
                self.optimizer = tf.optimizers.Adadelta(lr, rho, eps)
            elif optim == "adam":
                beta1 = float(kwargs.get("beta1", 0.9))
                beta2 = float(kwargs.get("beta2", 0.999))
                eps = float(kwargs.get("epsilon", 1e-8))
                logger.info("adam(eta=%f beta1=%f, beta2=%f, eps=%f)", lr, beta1, beta2, eps)
                self.optimizer = tf.optimizers.Adam(lr_function, beta1, beta2, eps)
            elif optim == "adamw":
                import tensorflow_addons as tfa

                wd = float(kwargs.get("weight_decay", 0))
                beta1 = float(kwargs.get("beta1", 0.9))
                beta2 = float(kwargs.get("beta2", 0.999))
                eps = float(kwargs.get("epsilon", 1e-8))
                logger.info("adamw(eta=%f beta1=%f, beta2=%f, eps=%f)", lr, beta1, beta2, eps)
                self.optimizer = tfa.optimizers.AdamW(
                    weight_decay=wd, learning_rate=lr_function, beta_1=beta1, beta_2=beta2, epsilon=eps
                )
            elif optim == "rmsprop":
                # Get mom again with a different default
                mom = float(kwargs.get("mom", 0.0))
                logger.info("rmsprop(eta=%f, mom=%f)", lr, mom)
                self.optimizer = tf.optimizers.RMSprop(lr_function, momentum=mom)
            elif sgd_mom > 0:
                logger.info("sgd-mom(eta=%f, mom=%f)", lr, sgd_mom)
                self.optimizer = tf.optimizers.SGD(lr_function, sgd_mom)
            else:
                logger.info("sgd(eta=%f)", lr)
                self.optimizer = tf.optimizers.SGD(lr_function)

        logger.info("clip gradients at %s", self.clip)
Example 10
def test_cyclic():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()

    lr_sched = create_lr_scheduler(**CYCLIC_LR_CONFIG)
    bl_const = CyclicLRScheduler(**CYCLIC_LR_CONFIG)

    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lr_bl = bl_const(step)
        assert np.isclose(lr, lr_bl)
Example 11
def test_linear_warmup():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**LINEAR_WARMUP_LR_CONFIG)
    warmup_steps = LINEAR_WARMUP_LR_CONFIG["warmup_steps"]

    lrs = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]

    expected_lrs = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
    assert np.allclose(expected_lrs, lrs)
Example 12
 def __init__(self, model_or_params, global_step=0, **kwargs):
     if isinstance(model_or_params, torch.nn.Module):
         parameters = model_or_params.parameters()
     else:
         parameters = model_or_params
     self.global_step = global_step
     if "lr_function" in kwargs:
         self.lr_function = kwargs["lr_function"]
     else:
         if "lr_scheduler_type" not in kwargs:
             kwargs["lr_scheduler_type"] = "default"
         self.lr_function = create_lr_scheduler(**kwargs)
     self._init_optimizer(parameters, **kwargs)
     self.current_lr = 0
Example 13
def test_composite_warmup():
    from eight_mile.tf import optz

    warmup_steps = COMPOSITE_LR_CONFIG["warmup_steps"]
    decay_rate = EXP_LR_CONFIG["decay_rate"]
    lr_sched = create_lr_scheduler(**COMPOSITE_LR_CONFIG)
    lrs = [lr_sched(step) for step in range(NUM_STEPS)]

    warmup_expected = [INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)]
    exp_expected = [(INIT_LR * decay_rate ** (t / 100.0)) for t in range(NUM_STEPS)]

    for step in range(NUM_STEPS):
        if step < warmup_steps:
            assert np.allclose(lrs[step], warmup_expected[step])
        else:
            assert np.allclose(lrs[step], exp_expected[step - warmup_steps])
Example 14
def test_invtime():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**INVTIME_LR_CONFIG)
    bl_invtime = InverseTimeDecayScheduler(**INVTIME_LR_CONFIG)
    decay_rate = INVTIME_LR_CONFIG["decay_rate"]

    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
        lr_bl = bl_invtime(step)
        lrs_bl += [lr_bl]
    inv_times = [INIT_LR / (1.0 + decay_rate * t) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)
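For reference, the inverse-time decay being checked is lr_t = lr_0 / (1 + decay_rate * t). A tiny sketch with illustrative numbers (not INVTIME_LR_CONFIG):

init_lr, decay_rate = 0.5, 0.05

def invtime(step):
    return init_lr / (1.0 + decay_rate * step)

print([round(invtime(t), 4) for t in (0, 10, 20, 100)])
# -> [0.5, 0.3333, 0.25, 0.0833]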
Example 15
def test_linear():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**LINEAR_LR_CONFIG)
    bl_sched = LinearDecayScheduler(**LINEAR_LR_CONFIG)

    linear = [INIT_LR * (1.0 - step / NUM_STEPS) for step in range(NUM_STEPS)]
    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
        lr_bl = bl_sched(step)
        lrs_bl += [lr_bl]
    assert np.allclose(lrs_bl, lrs)
    assert np.allclose(linear, lrs_bl)
Example 16
def test_exp():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**EXP_LR_CONFIG)
    bl_exp = ExponentialDecayScheduler(**EXP_LR_CONFIG)
    decay_rate = EXP_LR_CONFIG["decay_rate"]

    lrs = []
    lrs_bl = []
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lrs += [lr]
        lr_bl = bl_exp(step)
        lrs_bl += [lr_bl]
    inv_times = [(INIT_LR * decay_rate**(t / 100.0)) for t in range(NUM_STEPS)]
    assert np.allclose(inv_times, lrs)
    assert np.allclose(inv_times, lrs_bl)
Example 17
def test_cyclic():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()

    lr_sched = create_lr_scheduler(**CYCLIC_LR_CONFIG)
    bl_const = CyclicLRScheduler(**CYCLIC_LR_CONFIG)

    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")

    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())

    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lr_bl = bl_const(step)
        assert np.isclose(lr, lr_bl)
Example 18
def test_constant():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()

    lr_sched = create_lr_scheduler(lr=INIT_LR, lr_scheduler_type="default")
    bl_const = ConstantScheduler(lr=INIT_LR)

    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")

    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())

    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        assert np.isclose(INIT_LR, lr)
        assert np.isclose(INIT_LR, bl_const(step))
Example 19
def test_zaremba():
    from eight_mile.tf import optz

    lr_sched = create_lr_scheduler(**ZAREMBA_LR_CONFIG)
    bl_zaremba = ZarembaDecayScheduler(**ZAREMBA_LR_CONFIG)

    lrs = []
    lrs_bl = []
    expect_lrs = []
    current_lr = INIT_LR
    for step in range(NUM_STEPS):
        lr = lr_sched(step)
        lr_bl = bl_zaremba(step)
        lrs += [lr]
        lrs_bl += [lr_bl]
        if step in BOUNDS:
            b = BOUNDS.index(step)
            current_lr = ZAREMBA_DECAY_VALUES[b]
        expect_lrs += [current_lr]
    assert np.allclose(expect_lrs, lrs)
    assert np.allclose(expect_lrs, lrs_bl)
Example 20
def test_linear_warmup():
    from eight_mile.tf import optz

    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()

    lr_sched = create_lr_scheduler(**LINEAR_WARMUP_LR_CONFIG)
    warmup_steps = LINEAR_WARMUP_LR_CONFIG["warmup_steps"]

    lr_var = tf.compat.v1.placeholder(tf.float32, shape=(), name="lr")
    step_var = tf.compat.v1.placeholder(tf.int32, shape=(), name="step")

    gph = lr_sched(lr_var, step_var)
    sess.run(tf.compat.v1.global_variables_initializer())

    lrs = []
    for step in range(NUM_STEPS):
        lr = sess.run(gph, feed_dict={lr_var: INIT_LR, step_var: step})
        lrs += [lr]

    expected_lrs = [
        INIT_LR * min(1.0, step / warmup_steps) for step in range(NUM_STEPS)
    ]
    assert np.allclose(expected_lrs, lrs)
Example 21
 def __init__(self,
              model_or_params,
              global_step=0,
              weight_decay=0.0,
              **kwargs):
     DONT_DECAY = ['ln.weight', 'bias']
     if isinstance(model_or_params, torch.nn.Module):
         if weight_decay == 0.0:
             parameters = model_or_params.parameters()
         else:
             params_w_wd = [
                 p for n, p in model_or_params.named_parameters()
                 if not any(nd in n for nd in DONT_DECAY)
             ]
             params_wo_wd = [
                 p for n, p in model_or_params.named_parameters()
                 if any(nd in n for nd in DONT_DECAY)
             ]
             parameters = [{
                 'params': params_w_wd,
                 'weight_decay': weight_decay
             }, {
                 'params': params_wo_wd,
                 'weight_decay': 0.0
             }]
     else:
         parameters = model_or_params
     self.global_step = global_step
     if "lr_function" in kwargs:
         self.lr_function = kwargs["lr_function"]
     else:
         if "lr_scheduler_type" not in kwargs:
             kwargs["lr_scheduler_type"] = "default"
         self.lr_function = create_lr_scheduler(**kwargs)
     self._init_optimizer(parameters, **kwargs)
     self.current_lr = 0
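The decay / no-decay split above is the common PyTorch pattern of excluding LayerNorm weights and biases from weight decay via parameter groups. A self-contained sketch of the same idea with a toy module and torch.optim.AdamW (module names and hyperparameters here are illustrative):

import torch

DONT_DECAY = ['ln.weight', 'bias']

class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(16, 16)
        self.ln = torch.nn.LayerNorm(16)

    def forward(self, x):
        return self.ln(self.fc(x))

model = TinyModel()
decay = [p for n, p in model.named_parameters() if not any(nd in n for nd in DONT_DECAY)]
no_decay = [p for n, p in model.named_parameters() if any(nd in n for nd in DONT_DECAY)]

# Two parameter groups: weight decay applies only to the dense weights.
optimizer = torch.optim.AdamW(
    [{'params': decay, 'weight_decay': 0.01},
     {'params': no_decay, 'weight_decay': 0.0}],
    lr=1e-3,
)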