Example #1
def test_040_objective_instance_properties():
    """
    Objective:
        Verify the layer class validates that its parameters have been initialized before they are accessed.
    Expected:
        Access to a non-initialized parameter is detected and fails.
    """
    msg = "Accessing uninitialized property of the layer must fail."
    name = random_string(np.random.randint(1, 10))
    for _ in range(NUM_MAX_TEST_TIMES):
        M: int = 1
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=1,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG)

        # --------------------------------------------------------------------------------
        # To pass
        # --------------------------------------------------------------------------------
        try:
            if not layer.name == name:
                raise RuntimeError("layer.name == name should be true")
        except AssertionError:
            raise RuntimeError(
                "Access to name should be allowed as already initialized.")

        try:
            if not layer.M == M:
                raise RuntimeError("layer.M == M should be true")
        except AssertionError:
            raise RuntimeError(
                "Access to M should be allowed as already initialized.")

        try:
            if not isinstance(layer.logger, logging.Logger):
                raise RuntimeError(
                    "isinstance(layer.logger, logging.Logger) should be true")
        except AssertionError:
            raise RuntimeError(
                "Access to logger should be allowed as already initialized.")

        # --------------------------------------------------------------------------------
        # To fail
        # --------------------------------------------------------------------------------
        try:
            print(layer.X)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer.X = int(1)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.N)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.dX)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.Y)
            raise RuntimeError(msg)
        except AssertionError:
            pass
        try:
            print(layer.P)
            raise RuntimeError(msg)
        except AssertionError:
            pass
        try:
            layer._Y = int(1)
            print(layer.Y)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.dY)
            raise RuntimeError(msg)
        except AssertionError:
            pass
        try:
            layer._dY = int(1)
            print(layer.dY)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.T)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.L)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            print(layer.J)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer.T = float(1)
            raise RuntimeError(msg)
        except AssertionError:
            pass

        try:
            layer.function(int(1))
            raise RuntimeError("Invoke layer.function(int(1)) must fail.")
        except AssertionError:
            pass

        try:
            layer.function(TYPE_FLOAT(1.0))
            layer.gradient(int(1))
            raise RuntimeError("Invoke layer.gradient(int(1)) must fail.")
        except AssertionError:
            pass
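
# The guarded-property pattern exercised above could look like the minimal
# sketch below. This is an illustrative assumption only: _GuardedPropertySketch
# is hypothetical and the actual CrossEntropyLogLoss base class may validate
# differently. The point is that accessing an attribute before the forward path
# sets it raises AssertionError, which the try/except blocks in the test expect.
class _GuardedPropertySketch:
    def __init__(self, name: str, num_nodes: int):
        self._name = name            # initialized at construction: access allowed
        self._num_nodes = num_nodes
        self._X = None               # set only by the forward path: access guarded

    @property
    def name(self) -> str:
        return self._name

    @property
    def X(self) -> np.ndarray:
        # Fail fast when the forward path has not yet provided X.
        assert self._X is not None, "X is not initialized"
        return self._X

    @X.setter
    def X(self, X: np.ndarray):
        # Reject non-array input such as int(1), as the test expects.
        assert isinstance(X, np.ndarray), "X must be np.ndarray"
        self._X = X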
Example #2
def disabled_test_040_objective_methods_2d_ohe(caplog):
    """
    TODO: Disabled; numerical_jacobian needs to be redesigned for 32-bit floating point.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(J) / N where (J, P) = sigmoid_cross_entropy_log_loss(X, T).
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-T)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    """
    caplog.set_level(logging.DEBUG)

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_objective_methods_2d_ohe"

    profiler = cProfile.Profile()
    profiler.enable()

    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = 1  # node number is 1 for 0/1 binary classification.
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG)

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.zeros_like(X, dtype=TYPE_LABEL)  # OHE labels.
        T[np.arange(N), np.random.randint(0, M, N)] = TYPE_LABEL(1)

        # The log_loss function requires X:(N, M) and T:(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        layer.T = T
        Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T)

        # --------------------------------------------------------------------------------
        # Expected analytical gradient EG = (dL/dX) = (A-T)/N
        # --------------------------------------------------------------------------------
        A = sigmoid(X)
        EG = ((A - T).astype(TYPE_FLOAT) / TYPE_FLOAT(N))
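        # Why EG = (A-T)/N: with A = sigmoid(X), the per-sample loss is
        # (1-T)*X + log(1 + exp(-X)), and differentiating w.r.t. X gives
        # (1-T) - exp(-X)/(1+exp(-X)) = (1-T) - (1-A) = A - T.
        # The 1/N factor comes from averaging the loss over the batch.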

        # --------------------------------------------------------------------------------
        # Total loss Z = np.sum(J)/N
        # Expected loss EL = np.sum((1-T)*X + np.log(1 + np.exp(-X))) / N
        # (J, P) = sigmoid_cross_entropy_log_loss(X, T) where J:shape(N,) is the
        # loss for each input and P is the activation sigmoid(X).
        # --------------------------------------------------------------------------------
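        # The expected loss expression follows from P = sigmoid(X) = 1/(1+exp(-X)):
        #   -T*log(P) - (1-T)*log(1-P)
        #     = T*log(1+exp(-X)) + (1-T)*(X + log(1+exp(-X)))
        #     = (1-T)*X + log(1+exp(-X))
        # which is the EL expression computed below (summed and divided by N).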
        L = layer.function(X)
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        EL = np.array(np.sum((1 - T) * X + logarithm(1 + np.exp(-X))) / N,
                      dtype=TYPE_FLOAT)

        # Constraint: A == P as they are sigmoid(X)
        assert np.all(np.abs(A-P) < ACTIVATION_DIFF_ACCEPTANCE_VALUE), \
            f"Need A==P==sigmoid(X) but A=\n{A}\n P=\n{P}\n(A-P)=\n{(A-P)}\n"

        # Constraint: Log loss layer output L == sum(J) from the log loss function
        Z = np.array(np.sum(J) / N, dtype=TYPE_FLOAT)
        assert np.array_equal(L, Z), \
            f"Need log loss layer output L == sum(J) but L=\n{L}\nZ=\n{Z}."

        # Constraint: L/loss is close to expected loss EL.
        assert np.all(np.abs(EL-L) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            "Need EL close to L but \nEL=\n{EL}\nL=\n{L}\n"

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical_jacobian(objective, X)
        # TODO: compare the diff to accommodate numerical errors.
        # --------------------------------------------------------------------------------
        GN = layer.gradient_numerical()  # [dL/dX] from the layer

        def objective(x):
            """Function to calculate the scalar loss L for cross entropy log loss"""
            j, p = sigmoid_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective, X)  # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN)

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # constraint: Analytical gradient G: gradient() == (P-T)/N.
        dY = TYPE_FLOAT(1)
        G = layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        assert \
            np.allclose(GN[0], G, atol=GRADIENT_DIFF_ACCEPTANCE_VALUE, rtol=GRADIENT_DIFF_ACCEPTANCE_RATIO), \
            f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRDiff is \n{G-GN[0]}.\n"

        # constraint: Gradient g of the log loss layer needs -1 < g < 1
        # abs(P-T) = abs(sigmoid(X)-T) cannot be > 1.
        assert np.all(np.abs(G) < 1), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{G}"
        assert np.all(np.abs(GN[0]) < (1+GRADIENT_DIFF_ACCEPTANCE_RATIO)), \
            f"Log loss layer gradient cannot be < -1 nor > 1 but\n{GN[0]}"

    profiler.disable()
    profiler.print_stats(sort="cumtime")
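
# The numerical_jacobian helper used above is assumed to be a central-difference
# approximation along the lines of this sketch. This is an assumption for
# illustration: the project's actual helper may pick the step size and dtype
# differently, which is what the 32-bit floating point TODO refers to, and
# _numerical_jacobian_sketch is a hypothetical local name.
def _numerical_jacobian_sketch(f, X, h=1e-5):
    """Approximate dL/dX of a scalar-valued f(X) by central differences."""
    X = np.array(X, dtype=float)      # work on a float64 copy
    J = np.zeros_like(X)
    it = np.nditer(X, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        original = X[idx]
        X[idx] = original + h
        fp = f(X)                     # f(x + h)
        X[idx] = original - h
        fm = f(X)                     # f(x - h)
        X[idx] = original             # restore the element
        J[idx] = (fp - fm) / (2.0 * h)
        it.iternext()
    return J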
Example #3
def test_040_objective_instantiation():
    """
    Objective:
        Verify the initialized layer instance provides its properties.
    Expected:
        * name, num_nodes, M, log_level are the same as initialized.
        * X, T, dY, objective return what is set.
        * N, M properties are provided after X is set.
        * Y, P, L properties are provided after function(X).
        * gradient() returns the analytical gradient (P-T)/N.
        * objective() of the output/last layer is an identity function.
    """
    name = "test_040_objective_instantiation"
    for _ in range(NUM_MAX_TEST_TIMES):
        N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M: int = 1
        # For the sigmoid log loss layer, the number of features D in X equals the number of nodes M.
        D: int = M
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=M,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG)

        # --------------------------------------------------------------------------------
        # Properties
        # --------------------------------------------------------------------------------
        assert layer.name == name
        assert layer.num_nodes == layer.M == M

        layer._D = D
        assert layer.D == D

        X = np.random.randn(N, D).astype(TYPE_FLOAT)
        layer.X = X
        assert np.array_equal(layer.X, X)
        assert layer.N == N == X.shape[0]
        # For the sigmoid log loss layer, the number of features D in X equals the number of nodes M.
        assert layer.M == X.shape[1]

        layer._dX = X
        assert np.array_equal(layer.dX, X)

        T = np.random.randint(0, M, N).astype(TYPE_LABEL)
        layer.T = T
        assert np.array_equal(layer.T, T)

        # layer.function() gives the total loss L in shape ().
        # The log_loss function requires X:(N, M) and T:(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        L = layer.function(X)
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        assert \
            L.shape == () and np.allclose(L, (np.sum(J) / N).astype(TYPE_FLOAT)) and L == layer.Y, \
            "After setting T, layer.function(X) generates the total loss L but %s" % L

        # layer.function(X) sets layer.P to sigmoid_cross_entropy_log_loss(X, T)
        # P is nearly equal to sigmoid(X).
        assert \
            np.array_equal(layer.P, P) and \
            np.all(np.abs(layer.P - sigmoid(X)) < LOSS_DIFF_ACCEPTANCE_VALUE), \
            "layer.function(X) needs to set P as sigmoid_cross_entropy_log_loss(X, T) " \
            "which is close to sigmoid(X) but layer.P=\n%s\nP=\n%s\nsigmoid(X)=%s" \
            % (layer.P, P, sigmoid(X))

        # gradient of sigmoid cross entropy log loss layer is (P-T)/N
        G = layer.gradient()
        assert \
            np.all(np.abs(G - ((P-T)/N)) < GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            "Gradient G needs (P-T)/N but G=\n%s\n(P-T)/N=\n%s\n" % (G, (P-T)/N)

        layer.logger.debug("This is a pytest")

        # pylint: disable=not-callable
        assert \
            layer.objective(np.array(1.0, dtype=TYPE_FLOAT)) \
            == np.array(1.0, dtype=TYPE_FLOAT), \
            "Objective function of the output/last layer is an identity function."
Example #4
def disabled_test_040_objective_methods_1d_ohe():
    """
    TODO: Disabled; numerical_jacobian needs to be redesigned for 32-bit floating point.

    Objective:
        Verify the forward path constraints:
        1. Layer output L/loss is np.sum(cross_entropy_log_loss(sigmoid(X), T, f=logistic_log_loss)) / N.
        2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X).

        Verify the backward path constraints:
        1. Analytical gradient G: gradient() == (P-T)/N
        2. Analytical gradient G is close to GN: gradient_numerical().
    Expected:
        Access to a non-initialized parameter is detected and fails.

        For X.ndim > 0, the layer transforms X into 2D so as to use numpy
        tuple-like indexing:
        P[
            (0,3),
            (2,4)
        ]
        Hence, the shapes of GN and G are 2D.
    """
    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    name = "test_040_objective_methods_1d_ohe"
    N = 1

    for _ in range(NUM_MAX_TEST_TIMES):
        layer = CrossEntropyLogLoss(
            name=name,
            num_nodes=1,
            log_loss_function=sigmoid_cross_entropy_log_loss,
            log_level=logging.DEBUG)

        # ================================================================================
        # Layer forward path
        # ================================================================================
        X = TYPE_FLOAT(
            np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))
        T = TYPE_LABEL(np.random.randint(0, 2))  # OHE labels.

        # The log_loss function requires X:(N, M) and T:(N, M) in OHE label format.
        X, T = transform_X_T(X, T)
        layer.T = T

        # Expected analytical gradient dL/dX = (P-T)/N of shape (N,M)
        A = sigmoid(X)
        EG = ((A - T) / N).reshape(1, -1).astype(TYPE_FLOAT)

        Logger.debug("%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T,
                     A, EG)

        # --------------------------------------------------------------------------------
        # constraint: L/loss == np.sum(J) / N.
        # J, P = sigmoid_cross_entropy_log_loss(X, T)
        # --------------------------------------------------------------------------------
        L = layer.function(X)  # L is shape ()
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        Z = np.array(np.sum(J), dtype=TYPE_FLOAT) / TYPE_FLOAT(N)
        assert np.array_equal(L, Z), f"LogLoss output should be {L} but {Z}."

        # --------------------------------------------------------------------------------
        # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X)
        # Use a dummy layer for the objective function because using the "layer"
        # updates X and Y, which can interfere with the independence of the layer.
        # --------------------------------------------------------------------------------
        GN = layer.gradient_numerical()  # [dL/dX] from the layer

        # --------------------------------------------------------------------------------
        # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L
        # because it would apply transform_X_T multiple times.
        # Internally, transform_X_T(X, T) has already transformed T into an index label
        # in 1D with length 1 via "T = T.reshape(-1)".
        # Providing X in 1D to "dummy.function(x)" would re-run "transform_X_T(X, T)".
        # For an input with (X.ndim == T.ndim == 1), T must be an OHE label and
        # T.shape == P.shape must hold for OHE labels.
        # However, T has already been converted into the index format by transform_X_T
        # (applying transform_X_T multiple times), and (T.shape=(1,1), X.shape=(1, > 1))
        # violates the (X.shape == T.shape) constraint.
        # --------------------------------------------------------------------------------
        # dummy = CrossEntropyLogLoss(
        #     name="dummy",
        #     num_nodes=M,
        #     log_level=logging.DEBUG
        # )
        # dummy.T = T
        # dummy.objective = objective
        # dummy.function(X)
        # --------------------------------------------------------------------------------
        def objective(x):
            j, p = sigmoid_cross_entropy_log_loss(x, T)
            return np.array(np.sum(j) / N, dtype=TYPE_FLOAT)

        EGN = numerical_jacobian(objective,
                                 X).reshape(1, -1)  # Expected numerical dL/dX
        assert np.array_equal(GN[0], EGN), \
            f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}."

        # ================================================================================
        # Layer backward path
        # ================================================================================
        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G: gradient() == (P-T)/N.
        # --------------------------------------------------------------------------------
        dY = TYPE_FLOAT(1)
        G = layer.gradient(dY)
        assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \
            f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}."

        # --------------------------------------------------------------------------------
        # constraint: Analytical gradient G is close to GN: gradient_numerical().
        # --------------------------------------------------------------------------------
        assert \
            np.all(np.abs(G-GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \
            np.all(np.abs(G-GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \
            "dX is \n%s\nGN is \n%s\nG-GN is \n%s\n Ratio * GN[0] is \n%s.\n" \
            % (G, GN[0], G-GN[0], GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])
def train_binary_classifier(N: int,
                            D: int,
                            M: int,
                            X: np.ndarray,
                            T: np.ndarray,
                            W: np.ndarray,
                            log_loss_function: Callable,
                            optimizer: Optimizer,
                            num_epochs: int = 100,
                            test_numerical_gradient: bool = False,
                            log_level: int = logging.ERROR,
                            callback: Callable = None):
    """Test case for binary classification with matmul + log loss.
    Args:
        N: Batch size
        D: Number of features
        M: Number of nodes. 1 for sigmoid and >= 2 for softmax
        X: train data
        T: labels
        W: weight
        log_loss_function: cross entropy log loss function
        optimizer: Optimizer
        num_epochs: Number of epochs to run
        test_numerical_gradient: Flag to test the analytical gradient against the numerical one.
        log_level: logging level
        callback: callback function to invoke at the end of each epoch.
    """
    name = __name__
    assert isinstance(T, np.ndarray) and np.issubdtype(
        T.dtype, np.integer) and T.ndim == 1 and T.shape[0] == N
    assert isinstance(
        X, np.ndarray) and X.dtype == TYPE_FLOAT and X.ndim == 2 and X.shape[
            0] == N and X.shape[1] == D
    assert isinstance(
        W, np.ndarray) and W.dtype == TYPE_FLOAT and W.ndim == 2 and W.shape[
            0] == M and W.shape[1] == D + 1
    assert num_epochs > 0 and N > 0 and D > 0

    assert ((log_loss_function == sigmoid_cross_entropy_log_loss and M == 1) or
            (log_loss_function == softmax_cross_entropy_log_loss and M >= 2))

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(name="loss",
                               num_nodes=M,
                               log_loss_function=log_loss_function,
                               log_level=log_level)

    # --------------------------------------------------------------------------------
    # Instantiate a Matmul layer
    # --------------------------------------------------------------------------------
    matmul = Matmul(name="matmul",
                    num_nodes=M,
                    W=W,
                    optimizer=optimizer,
                    log_level=log_level)
    matmul.objective = loss.function

    num_no_progress: int = 0  # number of times the loss L has not decreased
    loss.T = T
    history: List[np.ndarray] = [loss.function(matmul.function(X))]

    for i in range(num_epochs):
        # --------------------------------------------------------------------------------
        # Layer forward path
        # Calculate the matmul output Y=f(X), and get the loss L = objective(Y)
        # Test the numerical gradient dL/dX=matmul.gradient_numerical().
        # --------------------------------------------------------------------------------
        Y = matmul.function(X)
        L = loss.function(Y)

        if not (i % 50): print(f"iteration {i} Loss {L}")
        Logger.info("%s: iteration[%s]. Loss is [%s]", name, i, L)

        # --------------------------------------------------------------------------------
        # Constraint: 1. Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # --------------------------------------------------------------------------------
        if L >= history[-1] and (i % 20) == 1:
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s].",
                i, L, history[-1])
            if (num_no_progress := num_no_progress + 1) > 20:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress)
                # break
        else:
            num_no_progress = 0

        history.append(L)

        # --------------------------------------------------------------------------------
        # Expected dL/dW.T = X.T @ dL/dY = X.T @ (P-T) / N, and dL/dX = dL/dY @ W
        # P = sigmoid(X @ W.T) or softmax(X @ W.T)
        # dL/dX = dL/dY @ W uses W BEFORE it is updated.
        # --------------------------------------------------------------------------------
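        # With index labels T, dL/dZ at the matmul output Z = X @ W.T is
        # P - onehot(T): for sigmoid this is sigmoid(Z) - T, for softmax it is
        # softmax(Z) with 1 subtracted at each sample's true-class column.
        # The two branches below compute exactly that before dividing by N.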
        P = None
        if log_loss_function == sigmoid_cross_entropy_log_loss:
            # P = sigmoid(np.matmul(X, W.T))
            P = sigmoid(np.matmul(matmul.X, matmul.W.T))
            P = P - T.reshape(-1, 1)  # T(N,) -> T(N,1) to align with P(N,1)
            assert P.shape == (
                N, 1), "P.shape is %s T.shape is %s" % (P.shape, T.shape)

        elif log_loss_function == softmax_cross_entropy_log_loss:
            # matmul.X.shape is (N, D+1), matmul.W.T.shape is (D+1, M)
            P = softmax(np.matmul(matmul.X, matmul.W.T))  # (N, M)
            P[np.arange(N), T] -= 1

        EDX = np.matmul(P / N, matmul.W)  # (N,M) @ (M, D+1) -> (N, D+1)
        EDX = EDX[::, 1:]  # Hide the bias    -> (N, D)
        EDW = np.matmul(matmul.X.T,
                        P / N).T  # ((D+1,N) @ (N, M)).T -> (M, D+1)
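        # The trailing .T on EDW is needed because W (and hence dL/dW) is stored
        # as (M, D+1) while X.T @ dL/dY naturally comes out as (D+1, M).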

        # --------------------------------------------------------------------------------
        # Layer backward path
        # 1. Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dL/dY.
        # 2. Gradient descent to update Wn+1 = Wn - lr * dL/dX.
        # --------------------------------------------------------------------------------
        before = copy.deepcopy(matmul.W)
        dY = loss.gradient(TYPE_FLOAT(1))
        dX = matmul.gradient(dY)

        # Gradient descent; matmul.update() returns the analytical gradients dS,
        # with dW = dS[0].
        # dL/dX.shape = (N, D)
        # dL/dW.shape = (M, D+1)
        dS = matmul.update()
        dW = dS[0]
        # --------------------------------------------------------------------------------
        #  Constraint 1. W in the matmul has been updated by the gradient descent.
        # --------------------------------------------------------------------------------
        Logger.debug("W after is \n%s", matmul.W)
        assert not np.array_equal(before, matmul.W), "W has not been updated."

        if not validate_against_expected_gradient(EDX, dX):
            Logger.warning("Expected dL/dX \n%s\nDiff\n%s", EDX, EDX - dX)
        if not validate_against_expected_gradient(EDW, dW):
            Logger.warning("Expected dL/dW \n%s\nDiff\n%s", EDW, EDW - dW)

        if test_numerical_gradient:
            # --------------------------------------------------------------------------------
            # Numerical gradients gn=[dL/dX, dL/dW]
            # dL/dX.shape = (N, D)
            # dL/dW.shape = (M, D+1)
            # --------------------------------------------------------------------------------
            gn = matmul.gradient_numerical()
            validate_against_numerical_gradient([dX] + dS, gn, Logger)

        if callback:
            # if W.shape[1] == 1 else callback(W=np.average(matmul.W, axis=0))
            callback(W=matmul.W[0])
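
# A hypothetical invocation of train_binary_classifier for the sigmoid (M == 1)
# case, shown only to illustrate the expected argument shapes. "optimizer" is
# assumed to be an already constructed Optimizer instance from the project; its
# construction is omitted because the concrete optimizer API is not shown here.
def _example_train_binary_classifier(optimizer: Optimizer):
    N, D, M = 100, 2, 1
    X = np.random.randn(N, D).astype(TYPE_FLOAT)        # (N, D) train data
    T = np.random.randint(0, 2, N).astype(TYPE_LABEL)   # (N,) 0/1 labels
    W = (np.random.randn(M, D + 1) * 0.01).astype(TYPE_FLOAT)  # (M, D+1) incl. bias
    return train_binary_classifier(
        N=N, D=D, M=M, X=X, T=T, W=W,
        log_loss_function=sigmoid_cross_entropy_log_loss,
        optimizer=optimizer,
        num_epochs=50,
        test_numerical_gradient=False,
        log_level=logging.ERROR
    )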