def test_020_embedding_gradient_descent(caplog):
    """
    Objective:
        Verify the gradient descent, especially np.ufunc.at, is working as expected.

        W:(V, D=3) where all elements are initialized to 1.0.
        dL/dY:(1,E+SL) = I and E=1,SL=1

        X=(target,context)=[3,4,5,6,5] where target_index=3.

    Expected:
        The context index 5 occurs twice so that W[5] should be updated twice
        at the gradient descent as W[5] = W[5] - [lr * (1 + l2)] * 2.
        For other context indices at 4, 6:
        W[3] = W[3] - [lr * (1 + l2) * dWe].
        W[4] = W[4] - [lr * (1 + l2) * dWc].
        W[6] = W[6] - [lr * (1 + l2) * dWc].
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_gradient_multi_lines"
    dictionary: EventIndexing = _instantiate_event_indexing()

    def L(x):
        loss = Function.sum(
            x, axis=None, keepdims=False
        )
        return loss

    target_size = negative_sample_size = TYPE_INT(1)
    context_size = TYPE_INT(4)
    event_vector_size = TYPE_INT(3)
    W: TYPE_TENSOR = np.ones(
        shape=(dictionary.vocabulary_size, event_vector_size),
        dtype=TYPE_FLOAT
    )

    embedding, event_context = _must_succeed(
        name=name,
        num_nodes=(1 + negative_sample_size),
        target_size=target_size,
        context_size=context_size,
        negative_sample_size=negative_sample_size,
        event_vector_size=event_vector_size,
        dictionary=dictionary,
        W=W,
        log_level=logging.DEBUG,
        msg="must succeed"
    )
    del W   # embedding deep-copies W; delete the local reference to avoid accidental use
    embedding.objective = L

    target_context_pairs = np.array([[3, 4, 5, 6, 5]], dtype=TYPE_INT)

    # --------------------------------------------------------------------------------
    # Forward path
    # --------------------------------------------------------------------------------
    Y = embedding.function(target_context_pairs)
    EDWe, EDWs, EDWc = embedding.gradient_numerical()
    print(f"Loss {L(Y)}\n")

    # --------------------------------------------------------------------------------
    # Backward path
    # --------------------------------------------------------------------------------
    dY = Function.ones(shape=Function.tensor_shape(Y))
    embedding.gradient(dY)

    # --------------------------------------------------------------------------------
    # Expected We, Wc (Ws is unknown because negative sampling is stochastic).
    # These expectations assume SGD as the optimizer.
    # --------------------------------------------------------------------------------
    lr = embedding.lr
    l2 = embedding.l2

    expected_dWe = lr * (1+l2) * embedding.dWe
    diff_We = embedding.optimizer.differential(dW=embedding.dWe)
    msg_We = "dWe: expected\n%s\n but actual diff=:\n%s\n" % \
             (expected_dWe, (expected_dWe-diff_We))
    embedding.all_close(
        expected_dWe, diff_We, msg=msg_We
    )
    EWe = embedding.W[3] - expected_dWe

    expected_dWc = lr * (1+l2) * embedding.dWc
    diff_Wc = embedding.optimizer.differential(dW=embedding.dWc)
    msg_Wc = "dWc: expected\n%s\n but actual diff=:\n%s\n" % \
             (expected_dWc, (expected_dWc-diff_Wc))
    embedding.all_close(
        expected_dWc, diff_Wc, msg=msg_Wc
    )
    EWc4 = np.subtract(embedding.W[4], expected_dWc)
    EWc5 = np.subtract(embedding.W[5], expected_dWc * 2)
    EWc6 = np.subtract(embedding.W[6], expected_dWc)

    # --------------------------------------------------------------------------------
    # Backward path: Gradient descent
    # --------------------------------------------------------------------------------
    assert np.array_equal(embedding.target_indices, np.array([3], dtype=TYPE_INT))
    assert np.array_equal(embedding.context_indices, np.array([4, 5, 6, 5], dtype=TYPE_INT))

    dWe, dWs, dWc = embedding.update()

    # ********************************************************************************
    # Constraint:
    # - dW is close to EDW
    # - dL/dWe = dL/dWs = Bc when dL/dY = I
    # ********************************************************************************
    assert Function.all_close(
        EDWe, dWe, msg="Expected (EDWe==dWe)\n%s\ndifference\n%s\n" % (EDWe, EDWe - dWe)
    )
    assert Function.all_close(
        EDWs, dWs, msg="Expected (EDWs==dWs)\n%s\ndifference\n%s\n" % (EDWs, EDWs - dWs)
    )
    assert Function.all_close(
        EDWc, dWc, msg="Expected (EDWc==dWc)\n%s\ndifference\n%s\n" % (EDWc, EDWc - dWc)
    )
    assert Function.all_close(
        dWe, dWs, msg="Expected (dWe==dWs) but dWe:\n%s\ndifference\n%s\n" % (dWe, dWe - dWs)
    )

    # ********************************************************************************
    # Constraint:
    # - The gradients returned from update() match those computed in gradient(),
    #   i.e. expected_dW == lr * (1+l2) * dW for the SGD optimizer.
    # ********************************************************************************
    assert np.array_equal(expected_dWe, lr * (1+l2) * dWe)
    assert np.array_equal(expected_dWc, lr * (1+l2) * dWc)

    # W[3] = W[3] - [lr * (1 + l2) * dWe]
    assert Function.all_close(
        EWe, embedding.WO[3], msg="Expected (EWe==WO[3])\n%s\ndifference\n%s\n" % (EWe, EWe - embedding.WO[3])
    )
    # W[4] = W[4] - [lr * (1 + l2) * dWc]
    assert Function.all_close(
        EWc4, embedding.W[4], msg="Expected (EWc4==W[4])\n%s\ndifference\n%s\n" % (EWc4, EWc4 - embedding.W[4])
    )
    # W[5] = W[5] - [lr * (1 + l2) * 2 * dWc]
    assert Function.all_close(
        EWc5, embedding.W[5], msg="Expected (EWc5==W[5])\n%s\ndifference\n%s\n" % (EWc5, EWc5 - embedding.W[5])
    )
    # W[6] = W[6] - [lr * (1 + l2) * dWc]
    assert Function.all_close(
        EWc6, embedding.W[6], msg="Expected (EWc6==W[6])\n%s\ndifference\n%s\n" % (EWc6, EWc6 - embedding.W[6])
    )


def test_020_embedding_gradient_vs_autodiff(caplog):
    """
    Objective:
        Verify the Embedding analytical gradient with TF autodiff implemented
        in the gradient_numerical() method of the layer.
    Expected:
        Gradients [dWe, dWs, dWc] calculated in gradient() method matches with
        those calculated in the gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)
    name = "test_020_embedding_gradient_multi_lines"
    dictionary: EventIndexing = _instantiate_event_indexing()

    from function import text
    from . test_020_embedding_sample_sentences import (
        bbc_world_us_canada_56988381 as sentences
    )
    max_sentence_length = TYPE_INT(text.Function.max_sentence_length(sentences))
    assert max_sentence_length >= 3

    profiler = cProfile.Profile()
    profiler.enable()

    def L(x):
        loss = Function.sum(
            x, axis=None, keepdims=False
        )
        return loss

    # --------------------------------------------------------------------------------
    # Ye = einsum("nd,ned->n",  Bc:(N,D), We:(N,E,D))
    # dL/dWe:(N,E,D) = dL/dYe * dYe/dWe = dL/dYe * Bc
    #
    # Ys = einsum("nd,nsd->ns", Bc:(N,D), Ws:(N,SL,D))
    # dL/dWs:(N,SL,D) = dL/dYs * dYs/dWs = dL/dYs * Bc
    #
    # By setting
    # 1. dL/dY = np.c_[dL/dYe, dL/dYs] = I and
    # 2. target_size E == negative_sample_size SL
    # the constraint is E * dL/dWe == dL/dWs == Bc because dL/dYe and dL/dYs are I.
    # dL/dWe is normalized by E to be independent of the event (target) size.
    # --------------------------------------------------------------------------------
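    # --------------------------------------------------------------------------------
    # Illustrative sketch (not part of the original test) of the constraint above:
    # with dL/dYs all ones, dL/dWs = einsum("ns,nd->nsd", dL/dYs, Bc) is simply Bc
    # broadcast over the SL axis. The shapes below are made up for the sketch.
    # --------------------------------------------------------------------------------
    _Bc = np.random.randn(2, 3)                         # (N,D)
    _dYs = np.ones(shape=(2, 5))                        # (N,SL)
    _dWs = np.einsum("ns,nd->nsd", _dYs, _Bc)           # (N,SL,D)
    assert np.allclose(_dWs, np.broadcast_to(_Bc[:, None, :], _dWs.shape))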
    for _ in range(NUM_MAX_TEST_TIMES):
        # C must be an even number
        C = TYPE_INT(np.random.randint(1, max_sentence_length / 2) * 2)
        assert C < max_sentence_length

        # E=SL so that (N,E,D) and (N,SL,D) have the same shape
        E = SL = TYPE_INT(
            np.random.randint(
                1,
                min(
                    Embedding.MAX_TARGET_SIZE,
                    Embedding.MAX_NEGATIVE_SAMPLE_SIZE,
                    (max_sentence_length - C)
                )+1
            )
        )

        target_size = negative_sample_size = TYPE_INT(E)
        context_size = TYPE_INT(C)
        event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(1, 100))
        W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size)

        embedding, event_context = _must_succeed(
            name=name,
            num_nodes=(1+negative_sample_size),
            target_size=target_size,
            context_size=context_size,
            negative_sample_size=negative_sample_size,
            event_vector_size=event_vector_size,
            dictionary=dictionary,
            W=W,
            log_level=logging.DEBUG,
            msg="must succeed"
        )
        embedding.objective = L
        sequences = dictionary.function(sentences)
        target_context_pairs = event_context.function(sequences)

        # --------------------------------------------------------------------------------
        # Forward path
        # --------------------------------------------------------------------------------
        Y = embedding.function(target_context_pairs)
        EDWe, EDWs, EDWc = embedding.gradient_numerical()

        # --------------------------------------------------------------------------------
        # Backward path
        # --------------------------------------------------------------------------------
        dY = Function.ones(shape=Function.tensor_shape(Y))
        embedding.gradient(dY)

        # --------------------------------------------------------------------------------
        # Backward path: Gradient descent
        # --------------------------------------------------------------------------------
        dWe, dWs, dWc = embedding.update()

        # ********************************************************************************
        # Constraint:
        # - dW is close to EDW
        # - dL/dWe = dL/dWs = Bc when dL/dY = I
        # ********************************************************************************
        assert Function.all_close(
            EDWe, dWe
        ), "Expected (EDWe==dWe)\n%s\ndifference\n%s\n" % (EDWe, EDWe-dWe)
        assert Function.all_close(
            EDWs, dWs
        ), "Expected (EDWs==dWs)\n%s\ndifference\n%s\n" % (EDWs, EDWs-dWs)
        assert Function.all_close(
            EDWc, dWc
        ), "Expected (EDWc==dWc)\n%s\ndifference\n%s\n" % (EDWc, EDWc-dWc)
        assert Function.all_close(
            dWe * E, dWs
        ), "Expected (dWe*E==dWs) but dWe*E:\n%s\ndifference\n%s\n" % (dWe*E, dWe*E-dWs)

    profiler.disable()
    profiler.print_stats(sort="cumtime")