Example 1
def test_resource():
    for i, dtype in enumerate(_dtypes_to_test(use_gpu=is_gpu_available())):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        # A callable learning rate is also accepted by the optimizer.
        def learning_rate():
            return 0.001

        opt = AdaBeliefOptimizer(learning_rate=learning_rate)

        # Run 3 steps of AdaBelief
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            assert_allclose_according_to_type(0.9**(t + 1), beta_1_power)
            assert_allclose_according_to_type(0.999**(t + 1), beta_2_power)

            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = adabelief_update_numpy(var0_np, grads0_np, t, m0,
                                                     v0)
            var1_np, m1, v1 = adabelief_update_numpy(var1_np, grads1_np, t, m1,
                                                     v1)

            # Validate updated params
            assert_allclose_according_to_type(var0_np, var0.numpy())
            assert_allclose_according_to_type(var1_np, var1.numpy())
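
These snippets come from the optimizer's test suite and rely on helper functions defined elsewhere in the same test module (_dtypes_to_test, get_beta_accumulators, adabelief_update_numpy, assert_allclose_according_to_type, run_dense_sample, run_sparse_sample), which are not shown here. For reference only, a minimal NumPy sketch of the core AdaBelief step that adabelief_update_numpy mirrors (ignoring rectification, AMSGrad, warmup, and weight decay, which later examples exercise) could look like the following; the function name and defaults are illustrative, not the actual helper:

import numpy as np

def adabelief_step_reference(param, grad, t, m, s,
                             lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-14):
    # t is the 0-based loop index used in the tests above,
    # so the bias correction uses step = t + 1.
    step = t + 1
    m = beta_1 * m + (1 - beta_1) * grad             # first moment, as in Adam
    s = beta_2 * s + (1 - beta_2) * (grad - m) ** 2  # "belief": spread of grad around m
    m_hat = m / (1 - beta_1 ** step)                 # bias-corrected moments
    s_hat = s / (1 - beta_2 ** step)
    param = param - lr * m_hat / (np.sqrt(s_hat) + epsilon)
    return param, m, s

The real helper additionally accepts lr, epsilon, and weight_decay keyword arguments, as the later examples show.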
Example 2
def test_sharing():
    for dtype in _dtypes_to_test(use_gpu=is_gpu_available()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)
        opt = AdaBeliefOptimizer()

        # Fetch params to validate initial values
        np.testing.assert_allclose(np.asanyarray([1.0, 2.0]), var0.numpy())
        np.testing.assert_allclose(np.asanyarray([3.0, 4.0]), var1.numpy())

        # Run 3 steps of AdaBelief
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
            assert_allclose_according_to_type(0.9**(t + 1), beta_1_power)
            assert_allclose_according_to_type(0.999**(t + 1), beta_2_power)

            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            var0_np, m0, v0 = adabelief_update_numpy(var0_np, grads0_np, t, m0,
                                                     v0)
            var1_np, m1, v1 = adabelief_update_numpy(var1_np, grads1_np, t, m1,
                                                     v1)

            # Validate updated params
            assert_allclose_according_to_type(var0_np, var0.numpy())
            assert_allclose_according_to_type(var1_np, var1.numpy())
Example 3
def test_serialization():
    optimizer = AdaBeliefOptimizer(lr=1e-3,
                                   total_steps=10000,
                                   warmup_proportion=0.1,
                                   min_lr=1e-5)
    config = tf.keras.optimizers.serialize(optimizer)
    new_optimizer = tf.keras.optimizers.deserialize(
        config, custom_objects={"AdaBeliefOptimizer": AdaBeliefOptimizer})
    assert new_optimizer.get_config() == optimizer.get_config()
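
The same custom_objects mapping is what lets Keras rebuild the optimizer when a saved model is loaded. A minimal sketch, assuming AdaBeliefOptimizer is importable from the adabelief_tf package and using a hypothetical model path:

import tensorflow as tf
from adabelief_tf import AdaBeliefOptimizer  # assumed import; adjust to your package

model = tf.keras.models.load_model(
    "model_with_adabelief.h5",  # hypothetical path
    custom_objects={"AdaBeliefOptimizer": AdaBeliefOptimizer},
)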
Example 4
def test_sparse():
    for dtype in _dtypes_to_test(use_gpu=is_gpu_available()):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np)
        var1 = tf.Variable(var1_np)
        grads0_np_indices = np.array([0, 2], dtype=np.int32)
        grads0 = tf.IndexedSlices(
            tf.constant(grads0_np[grads0_np_indices]),
            tf.constant(grads0_np_indices),
            tf.constant([3]),
        )
        grads1_np_indices = np.array([0, 2], dtype=np.int32)
        grads1 = tf.IndexedSlices(
            tf.constant(grads1_np[grads1_np_indices]),
            tf.constant(grads1_np_indices),
            tf.constant([3]),
        )

        epsilon = 1e-7
        optimizer = AdaBeliefOptimizer(epsilon=epsilon)

        # Fetch params to validate initial values
        np.testing.assert_allclose(np.asanyarray([1.0, 1.0, 2.0]),
                                   var0.numpy())
        np.testing.assert_allclose(np.asanyarray([3.0, 3.0, 4.0]),
                                   var1.numpy())

        # Run 3 steps of AdaBelief
        for t in range(3):
            beta_1_power, beta_2_power = get_beta_accumulators(
                optimizer, dtype)
            assert_allclose_according_to_type(0.9**(t + 1), beta_1_power)
            assert_allclose_according_to_type(0.999**(t + 1), beta_2_power)

            optimizer.apply_gradients(zip([grads0, grads1], [var0, var1]))
            var0_np, m0, v0 = adabelief_update_numpy(var0_np,
                                                     grads0_np,
                                                     t,
                                                     m0,
                                                     v0,
                                                     epsilon=epsilon)
            var1_np, m1, v1 = adabelief_update_numpy(var1_np,
                                                     grads1_np,
                                                     t,
                                                     m1,
                                                     v1,
                                                     epsilon=epsilon)
            # Validate updated params
            assert_allclose_according_to_type(var0_np, var0.numpy(), atol=2e-4)
            assert_allclose_according_to_type(var1_np, var1.numpy(), atol=2e-4)
Example 5
def test_basic_with_learning_rate_decay():
    for i, dtype in enumerate(_dtypes_to_test(use_gpu=is_gpu_available())):
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)

        var0 = tf.Variable(var0_np, name="var0_%d" % i)
        var1 = tf.Variable(var1_np, name="var1_%d" % i)
        grads0 = tf.constant(grads0_np)
        grads1 = tf.constant(grads1_np)

        learning_rate = 0.001
        beta_1 = 0.9
        beta_2 = 0.999
        epsilon = 1e-14
        decay = 0.5
        weight_decay = 0.01

        opt = AdaBeliefOptimizer(
            learning_rate=learning_rate,
            beta_1=beta_1,
            beta_2=beta_2,
            epsilon=epsilon,
            weight_decay=weight_decay,
            decay=decay,
        )

        # Run 3 steps of AdaBelief
        for t in range(3):
            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))

            lr_np = learning_rate / (1 + decay * t)

            var0_np, m0, v0 = adabelief_update_numpy(var0_np,
                                                     grads0_np,
                                                     t,
                                                     m0,
                                                     v0,
                                                     lr=lr_np,
                                                     weight_decay=weight_decay)
            var1_np, m1, v1 = adabelief_update_numpy(var1_np,
                                                     grads1_np,
                                                     t,
                                                     m1,
                                                     v1,
                                                     lr=lr_np,
                                                     weight_decay=weight_decay)

            # Validate updated params
            assert_allclose_according_to_type(var0_np, var0.numpy(), atol=2e-4)
            assert_allclose_according_to_type(var1_np, var1.numpy(), atol=2e-4)
Example 6
def test_minimize_mean_square_loss_with_weight_decay():
    w = tf.Variable([0.1, -0.2, -0.1])
    x = tf.constant([0.4, 0.2, -0.5])

    def loss():
        return tf.reduce_mean(tf.square(x - w))

    opt = AdaBeliefOptimizer(0.02, weight_decay=0.01)

    # Run 200 steps
    for _ in range(200):
        opt.minimize(loss, [w])
    # Validate updated params
    np.testing.assert_allclose(w.numpy(),
                               np.asanyarray([0.4, 0.2, -0.5]),
                               rtol=1e-2,
                               atol=1e-2)
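
Beyond calling minimize directly as above, the optimizer drops into the usual Keras training API. A minimal sketch, again assuming the adabelief_tf import; the model and hyperparameters are illustrative:

import tensorflow as tf
from adabelief_tf import AdaBeliefOptimizer  # assumed import; adjust to your package

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(3,)),
    tf.keras.layers.Dense(1),
])
model.compile(optimizer=AdaBeliefOptimizer(learning_rate=0.02, weight_decay=0.01),
              loss="mse")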
Example 7
def test_dense_sample_with_warmup():
    run_dense_sample(
        iterations=100,
        expected=[[0.9811546, 1.9810544], [2.981224, 3.981214]],
        optimizer=AdaBeliefOptimizer(lr=1e-3,
                                     total_steps=100,
                                     warmup_proportion=0.1,
                                     min_lr=1e-5),
    )
Example 8
def test_sparse_sample_with_warmup():
    run_sparse_sample(
        iterations=200,
        expected=[[0.9211433, 2.0], [3.0, 3.9211729]],
        optimizer=AdaBeliefOptimizer(lr=1e-3,
                                     total_steps=200,
                                     warmup_proportion=0.1,
                                     min_lr=1e-5),
    )
Example 9
def test_schedulers():
    lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
        1e-3, 50, 0.5)
    wd_scheduler = tf.keras.optimizers.schedules.InverseTimeDecay(
        2e-3, 25, 0.25)

    run_dense_sample(
        iterations=100,
        expected=[[0.9778532, 1.9773799], [2.977964, 3.977844]],
        optimizer=AdaBeliefOptimizer(learning_rate=lr_scheduler,
                                     weight_decay=wd_scheduler),
    )
Example 10
def test_scheduler_serialization():
    lr_scheduler = tf.keras.optimizers.schedules.ExponentialDecay(
        1e-3, 50, 0.5)
    wd_scheduler = tf.keras.optimizers.schedules.InverseTimeDecay(
        2e-3, 25, 0.25)

    optimizer = AdaBeliefOptimizer(learning_rate=lr_scheduler,
                                   weight_decay=wd_scheduler)
    config = tf.keras.optimizers.serialize(optimizer)
    new_optimizer = tf.keras.optimizers.deserialize(
        config, custom_objects={"AdaBeliefOptimizer": AdaBeliefOptimizer})
    assert new_optimizer.get_config() == optimizer.get_config()

    assert new_optimizer.get_config()["learning_rate"] == {
        "class_name": "ExponentialDecay",
        "config": lr_scheduler.get_config(),
    }

    assert new_optimizer.get_config()["weight_decay"] == {
        "class_name": "InverseTimeDecay",
        "config": wd_scheduler.get_config(),
    }
Example 11
def test_dense_sample():
    run_dense_sample(
        iterations=100,
        expected=[[0.9475605, 1.9471607], [2.94784, 3.9478]],
        optimizer=AdaBeliefOptimizer(lr=1e-3),
    )
Example 12
def test_dense_sample_with_weight_decay():
    run_dense_sample(
        iterations=100,
        expected=[[0.94657826, 1.9451787], [2.9448593, 3.9438188]],
        optimizer=AdaBeliefOptimizer(lr=1e-3, weight_decay=0.01),
    )
Example 13
def test_get_config():
    opt = AdaBeliefOptimizer(lr=1e-4)
    config = opt.get_config()
    assert config["learning_rate"] == 1e-4
    assert config["total_steps"] == 0
Example 14
def test_sparse_sample():
    run_sparse_sample(
        iterations=200,
        expected=[[0.78374314, 2.0], [3.0, 3.7839816]],
        optimizer=AdaBeliefOptimizer(lr=1e-3),
    )
Example 15
def test_sparse_sample_without_rectify():
    run_sparse_sample(
        iterations=200,
        expected=[[0.0538935, 2.0], [3.0, 3.0538921]],
        optimizer=AdaBeliefOptimizer(lr=1e-3, rectify=False),
    )
Example 16
def test_dense_sample_without_rectify():
    run_dense_sample(
        iterations=100,
        expected=[[0.6672395, 1.6672392], [2.667238, 3.6672378]],
        optimizer=AdaBeliefOptimizer(lr=1e-3, rectify=False),
    )
Example 17
def test_dense_sample_with_amsgrad():
    run_dense_sample(
        iterations=100,
        expected=[[0.9485513, 1.9481515], [2.9488308, 3.9487908]],
        optimizer=AdaBeliefOptimizer(lr=1e-3, amsgrad=True),
    )
Example 18
def test_sparse_sample_with_amsgrad():
    run_sparse_sample(
        iterations=200,
        expected=[[0.7947248, 2.0], [3.0, 3.7949643]],
        optimizer=AdaBeliefOptimizer(lr=1e-3, amsgrad=True),
    )
Example 19
def test_sparse_sample_with_weight_decay():
    run_sparse_sample(
        iterations=200,
        expected=[[0.7818859, 2.0], [3.0, 3.7761304]],
        optimizer=AdaBeliefOptimizer(lr=1e-3, weight_decay=0.01),
    )