def test_010_sigmoid_cross_entropy_log_loss_2d(caplog):
    """
    Objective:
        Test case for sigmoid_cross_entropy_log_loss(X, T) =
        -( T * log(sigmoid(X)) + (1 - T) * log(1 - sigmoid(X)) )
        For the input X of shape (N,1) and T in index format of shape (N,1),
        calculate the sigmoid log loss and verify the values are as expected.

    Expected:
        For Z = sigmoid(X) = 1 / (1 + exp(-X)) and T=[[1]],
        -log(Z) should be almost the same as sigmoid_cross_entropy_log_loss(X, T).
        Almost, because finite float precision always has rounding errors.
    """
    # caplog.set_level(logging.DEBUG, logger=Logger.name)
    u = REFORMULA_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # [Test case 01]
    # X:(N,M)=(1, 1). X=(x0) where x0=0 by which sigmoid(X) generates 0.5.
    # Expected:
    #   sigmoid_cross_entropy_log_loss(X, T) == -log(0.5)
    # --------------------------------------------------------------------------------
    X = np.array([[TYPE_FLOAT(0.0)]])
    T = np.array([TYPE_LABEL(1)])
    X, T = transform_X_T(X, T)
    E = -logarithm(np.array([TYPE_FLOAT(0.5)]))
    J, P = sigmoid_cross_entropy_log_loss(X, T)
    assert E.shape == J.shape
    assert np.all(E == J), \
        "Expected (E==J) but \n%s\nE=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
        % (np.abs(E - J), E, T, X, J)
    assert P == 0.5

    # --------------------------------------------------------------------------------
    # [Test case 02]
    # For X:(N,1)
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        # X(N, M), and T(N,) in index label format with binary labels 0 or 1
        N = np.random.randint(1, NUM_MAX_BATCH_SIZE)
        M = 1   # always 1 for binary classification 0 or 1.

        X = np.random.randn(N, M).astype(TYPE_FLOAT)
        T = np.random.randint(0, 2, N).astype(TYPE_LABEL)
        X, T = transform_X_T(X, T)
        Logger.debug("T is %s\nX is \n%s\n", T, X)

        # ----------------------------------------------------------------------
        # Expected value EJ for J and Z for P
        # Note:
        #   To handle both index label format and OHE label format in the
        #   Loss layer(s), X and T are transformed into (N,1) shapes in
        #   transform_X_T(X, T) for logistic log loss.
        #   DO NOT squeeze Z nor P.
        # ----------------------------------------------------------------------
        Z = sigmoid(X)
        EJ = np.squeeze(
            -(T * logarithm(Z) + TYPE_FLOAT(1-T) * logarithm(TYPE_FLOAT(1-Z))),
            axis=-1
        )

        # **********************************************************************
        # Constraint: Actual J should be close to EJ.
        # **********************************************************************
        J, P = sigmoid_cross_entropy_log_loss(X, T)
        assert EJ.shape == J.shape
        assert np.all(np.abs(EJ-J) < u), \
            "Expected abs(EJ-J) < %s but \n%s\nEJ=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
            % (u, np.abs(EJ-J), EJ, T, X, J)

        # **********************************************************************
        # Constraint: Actual P should be close to Z.
        # **********************************************************************
        assert np.all(np.abs(Z-P) < u), \
            "Expected P close to Z but Z=\n%s\nP=\n%s\nZ-P=\n%s\n" % (Z, P, Z-P)

        # ----------------------------------------------------------------------
        # L = cross_entropy_log_loss(P, T) should be close to J
        # ----------------------------------------------------------------------
        L = cross_entropy_log_loss(P=Z, T=T, f=logistic_log_loss)
        assert L.shape == J.shape
        assert np.all(np.abs(L-J) < u), \
            "Expected abs(L-J) < %s but \n%s\nL=\n%s\nT=%s\nX=\n%s\nJ=\n%s\n" \
            % (u, np.abs(L-J), L, T, X, J)
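# --------------------------------------------------------------------------------
# Illustrative sketch (not part of the test above): the closed-form value checked
# in [Test case 01], computed with plain numpy instead of the project's
# sigmoid/logarithm helpers. For X=0 and T=1:
#   loss = -log(sigmoid(0)) = -log(0.5) = log(2) ~ 0.6931
# --------------------------------------------------------------------------------
def _sketch_sigmoid_log_loss_at_zero():
    x, t = 0.0, 1.0
    z = 1.0 / (1.0 + np.exp(-x))                    # sigmoid(0) == 0.5
    loss = -(t * np.log(z) + (1.0 - t) * np.log(1.0 - z))
    assert np.isclose(loss, np.log(2.0))            # ~0.6931
    return loss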
def test_030_objective_methods_1d_ohe(): """ Objective: Verify the forward path constraints: 1. Layer output L/loss is np.sum(cross_entropy_log_loss(softmax(X), T)) / N. 2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X). Verify the backward path constraints: 1. Analytical gradient G: gradient() == (P-1)/N 2. Analytical gradient G is close to GN: gradient_numerical(). Expected: Initialization detects the access to the non-initialized parameters and fails. For X.ndim > 0, the layer transform X into 2D so as to use the numpy tuple- like indexing: P[ (0,3), (2,4) ] Hence, the shape of GN, G are 2D. """ # -------------------------------------------------------------------------------- # Instantiate a CrossEntropyLogLoss layer # -------------------------------------------------------------------------------- name = "test_030_objective_methods_1d_ohe" N = 1 for _ in range(NUM_MAX_TEST_TIMES): M: int = np.random.randint(2, NUM_MAX_NODES) assert M >= 2, "Softmax is for multi label classification. "\ " Use Sigmoid for binary classification." _layer = layer.CrossEntropyLogLoss(name=name, num_nodes=M, log_level=logging.DEBUG) # ================================================================================ # Layer forward path # ================================================================================ X = np.random.randn(M).astype(TYPE_FLOAT) T = np.zeros_like(X, dtype=TYPE_LABEL) # OHE labels. T[np.random.randint(0, M)] = TYPE_LABEL(1) _layer.T = T P = softmax(X) EG = ((P - T) / N).reshape(1, -1).astype( TYPE_FLOAT) # Expected analytical gradient dL/dX = (P-T)/N Logger.debug("%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T, P, EG) # -------------------------------------------------------------------------------- # constraint: L/loss == np.sum(cross_entropy_log_loss(softmax(X), T)) / N. # -------------------------------------------------------------------------------- L = _layer.function(X) Z = np.array(np.sum(cross_entropy_log_loss(softmax(X), T)), dtype=TYPE_FLOAT) / TYPE_FLOAT(N) assert np.array_equal( L, Z), f"SoftmaxLogLoss output should be {L} but {Z}." # -------------------------------------------------------------------------------- # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X) # Use a dummy _layer for the objective function because using the "_layer" # updates the X, Y which can interfere the independence of the _layer. # -------------------------------------------------------------------------------- GN = _layer.gradient_numerical() # [dL/dX] from the _layer # -------------------------------------------------------------------------------- # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L. # because it causes applying transform_X_T multiple times. # Because internally transform_X_T(X, T) has transformed T into the index label # in 1D with with length 1 by "T = T.reshape(-1)". # Then providing X in 1D into "dummy.function(x)" re-run "transform_X_T(X, T)" # again. The (X.ndim == T.ndim ==1) as an input and T must be OHE label for such # combination and T.shape == P.shape must be true for OHE labels. # However, T has been converted into the index format already by transform_X_T # (applying transform_X_T multiple times) and (T.shape=(1,1), X.shape=(1, > 1) # that violates the (X.shape == T.shape) constraint. 
# -------------------------------------------------------------------------------- # dummy = CrossEntropyLogLoss( # name="dummy", # num_nodes=M, # log_level=logging.DEBUG # ) # dummy.T = T # dummy.objective = objective # dummy.function(X) # -------------------------------------------------------------------------------- # O = lambda x: dummy.objective(dummy.function(x)) # Objective function O = lambda x: np.sum(cross_entropy_log_loss(softmax(x), T), dtype=TYPE_FLOAT) / TYPE_FLOAT(N) # -------------------------------------------------------------------------------- EGN = numerical_jacobian(O, X).reshape(1, -1) # Expected numerical dL/dX assert np.array_equal(GN[0], EGN), \ f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}." # ================================================================================ # Layer backward path # ================================================================================ # -------------------------------------------------------------------------------- # constraint: Analytical gradient G: gradient() == (P-1)/N. # -------------------------------------------------------------------------------- dY = TYPE_FLOAT(1) G = _layer.gradient(dY) assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \ f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG} but G-EG \n{np.abs(G-EG)}\n" # -------------------------------------------------------------------------------- # constraint: Analytical gradient G is close to GN: gradient_numerical(). # -------------------------------------------------------------------------------- assert \ np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \ np.all(np.abs(G-GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \ f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"
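# --------------------------------------------------------------------------------
# Illustrative sketch (plain numpy, independent of the CrossEntropyLogLoss layer):
# for L(X) = np.sum(cross_entropy_log_loss(softmax(X), T)) / N with OHE labels T,
# the analytical gradient dL/dX = (P - T) / N agrees with a central-difference
# approximation, which is the relationship the test above asserts via
# numerical_jacobian().
# --------------------------------------------------------------------------------
def _sketch_softmax_cross_entropy_gradient(h=1e-5):
    rng = np.random.default_rng(0)
    n, m = 1, 4
    x = rng.standard_normal((n, m))
    t = np.zeros((n, m))
    t[0, rng.integers(m)] = 1.0

    def loss(x_):
        e_ = np.exp(x_ - x_.max(axis=-1, keepdims=True))
        p_ = e_ / e_.sum(axis=-1, keepdims=True)
        return -np.sum(t * np.log(p_)) / n

    e = np.exp(x - x.max(axis=-1, keepdims=True))
    p = e / e.sum(axis=-1, keepdims=True)
    analytical = (p - t) / n

    numerical = np.zeros_like(x)
    for i in range(m):                              # central difference per element
        d = np.zeros_like(x)
        d[0, i] = h
        numerical[0, i] = (loss(x + d) - loss(x - d)) / (2 * h)
    assert np.allclose(analytical, numerical, atol=1e-4)
    return analytical, numerical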
def disabled_test_030_objective_methods_2d_ohe(): """ TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating. Objective: Verify the forward path constraints: 1. Layer output L/loss is np.sum(cross_entropy_log_loss(softmax(X), T)) / N. 2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X). Verify the backward path constraints: 1. Analytical gradient G: gradient() == (P-1)/N 2. Analytical gradient G is close to GN: gradient_numerical(). Expected: Initialization detects the access to the non-initialized parameters and fails. """ def objective(X: np.ndarray) -> Union[float, np.ndarray]: """Dummy objective function to calculate the loss L""" assert X.ndim == 0, "The output of the log loss should be of shape ()" return X # -------------------------------------------------------------------------------- # Instantiate a CrossEntropyLogLoss layer # -------------------------------------------------------------------------------- name = "test_030_objective_methods_2d_ohe" for _ in range(NUM_MAX_TEST_TIMES): N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE) M: int = np.random.randint(2, NUM_MAX_NODES) assert M >= 2, "Softmax is for multi label classification. "\ " Use Sigmoid for binary classification." _layer = layer.CrossEntropyLogLoss(name=name, num_nodes=M, log_level=logging.DEBUG) _layer.objective = objective # ================================================================================ # Layer forward path # ================================================================================ X = np.random.randn(N, M).astype(TYPE_FLOAT) T = np.zeros_like(X, dtype=TYPE_LABEL) # OHE labels. T[np.arange(N), np.random.randint(0, M, N)] = TYPE_LABEL(1) _layer.T = T Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T) P = softmax(X) EG = (P - T) / N # Expected analytical gradient dL/dX = (P-T)/N # -------------------------------------------------------------------------------- # constraint: L/loss == np.sum(cross_entropy_log_loss(softmax(X), T)) / N. # -------------------------------------------------------------------------------- L = _layer.function(X) Z = np.array(np.sum(cross_entropy_log_loss(softmax(X), T))) / N assert np.array_equal( L, Z), f"SoftmaxLogLoss output should be {L} but {Z}." # -------------------------------------------------------------------------------- # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X) # -------------------------------------------------------------------------------- GN = _layer.gradient_numerical() # [dL/dX] from the _layer # -------------------------------------------------------------------------------- # DO not use CrossEntropyLogLoss.function() to simulate the objective function for # the expected GN. See the same part in test_030_objective_methods_1d_ohe(). 
# -------------------------------------------------------------------------------- # dummy= CrossEntropyLogLoss( # name=name, # num_nodes=M, # log_level=logging.DEBUG # ) # dummy.T = T # dummy.objective = objective # O = lambda x: dummy.objective(dummy.function(x)) # Objective function O = lambda x: np.sum(cross_entropy_log_loss(softmax(x), T)) / N # -------------------------------------------------------------------------------- EGN = numerical_jacobian(O, X) # Expected numerical dL/dX assert np.array_equal(GN[0], EGN), \ f"GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN) # ================================================================================ # Layer backward path # ================================================================================ # -------------------------------------------------------------------------------- # constraint: Analytical gradient G: gradient() == (P-1)/N. # -------------------------------------------------------------------------------- dY = float(1) G = _layer.gradient(dY) assert np.all(abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \ f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}." # -------------------------------------------------------------------------------- # constraint: Analytical gradient G is close to GN: gradient_numerical(). # -------------------------------------------------------------------------------- assert \ np.all(np.abs(G - GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \ np.all(np.abs(G - GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \ f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRatio * GN[0] is \n{GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0]}.\n"
# P:[0, 0, ..., 1, 0, ...] where Pi = 1
# T:[0, 0, ..., 1, 0, ...] is OHE label where Ti = 1
# sum(-t * log(p+k)) -> -log(1+k)
# d(-log(P+k))/dP -> -1 / (1+k)
# --------------------------------------------------------------------------------
M = np.random.randint(2, NUM_MAX_NODES)     # M > 1
index = np.random.randint(0, M)             # location of the truth
while not (x := TYPE_FLOAT(np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))):
    pass
p = softmax(x)
P3 = np.zeros(M, dtype=TYPE_FLOAT)
P3[index] = p
T3 = np.zeros(M).astype(TYPE_LABEL)         # OHE label
T3[index] = TYPE_LABEL(1)

# --------------------------------------------------------------------------------
# The Jacobian G shape is the same as P.shape.
# --------------------------------------------------------------------------------
EN3 = np.zeros_like(P3, dtype=TYPE_FLOAT)   # Expected numerical gradient
EN3[index] = TYPE_FLOAT(-1 * logarithm(p + h) + 1 * logarithm(p - h)) / TYPE_FLOAT(2 * h)
N3 = numerical_jacobian(partial(f, T=T3), P3)
assert N3.shape == EN3.shape
assert np.all(np.abs(N3-EN3) < u), \
    f"Delta expected to be < {u} but \n{np.abs(N3-EN3)}"

G3 = np.zeros_like(P3, dtype=TYPE_FLOAT)
G3[index] = -1 / p
check.equal(np.all(np.abs(G3 - N3) < u), True, "G3-N3 %s\n" % np.abs(G3 - N3))
def test_020_cross_entropy_log_loss_1d(caplog):
    """
    Objective:
        Test the categorical log loss values for P in 1 dimension.

    Constraints:
        1. The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h.
        2. The numerical gradient gn is within +/- u of the analytical g = -T/P.

    P: Probabilities from softmax of shape (M,)
    M: Number of nodes in the cross_entropy_log_loss layer.
    T: Labels

    Note:
        log(P=1) -> 0
        dlog(x)/dx = 1/x
    """
    def f(P: np.ndarray, T: np.ndarray):
        return np.sum(cross_entropy_log_loss(P, T))

    # caplog.set_level(logging.DEBUG, logger=Logger.name)

    h: TYPE_FLOAT = OFFSET_DELTA
    u: TYPE_FLOAT = GRADIENT_DIFF_ACCEPTANCE_VALUE

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = 1, OHE label T[index] = 1 where
    # P=[0,0,0,...,1,...0], T=[0,0,0,...,1,...0].
    #
    # Do not forget the Jacobian shape is (N,) and calculate each element.
    # 1. For T=1 and Pi=1, loss L = -log(Pi) = 0 and dL/dPi = -(1/Pi) = -1 is expected.
    # 2. For T=0, the loss contribution is 0, hence the gradient at those positions is 0.
    # --------------------------------------------------------------------------------
    M: TYPE_INT = np.random.randint(2, NUM_MAX_NODES)
    index: TYPE_INT = TYPE_INT(np.random.randint(0, M))     # Position of the true label in P
    P1 = np.zeros(M, dtype=TYPE_FLOAT)
    P1[index] = TYPE_FLOAT(1.0)
    T1 = np.zeros(M, dtype=TYPE_LABEL)
    T1[index] = TYPE_LABEL(1)

    # Analytically correct gradient for P=1, T=1
    AG = np.zeros_like(P1, dtype=TYPE_FLOAT)
    AG[index] = TYPE_FLOAT(-1)  # dL/dP = -1

    EGN1 = np.zeros_like(P1, dtype=TYPE_FLOAT)  # Expected numerical gradient
    EGN1[index] = (-1 * logarithm(TYPE_FLOAT(1.0 + h)) + TYPE_FLOAT(1) * logarithm(TYPE_FLOAT(1.0 - h))) / TYPE_FLOAT(2 * h)
    assert np.all(np.abs(EGN1-AG) < u), \
        "Expected abs(EGN1-AG) < %s but %s\nEGN1=\n%s" % (u, (EGN1-AG), EGN1)

    GN1 = numerical_jacobian(partial(f, T=T1), P1)
    assert np.all(np.abs(GN1-AG) < u), \
        "Expected abs(GN1-AG) < %s but %s\nGN1=\n%s" % (u, (GN1-AG), GN1)

    # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
    assert GN1.shape == EGN1.shape
    assert np.all(np.abs(EGN1-GN1) < u), \
        "Expected GN1==EGN1 but GN1-EGN1=\n%sP=\n%s\nT=%s\nEGN=\n%s\nGN=\n%s\n" \
        % (np.abs(GN1-EGN1), P1, T1, EGN1, GN1)

    # The numerical gradient gn is within +/- u of the analytical g = -T/P
    G1 = np.zeros_like(P1, dtype=TYPE_FLOAT)
    G1[T1 == 1] = -1 * (T1[index] / P1[index])
    # G1[T1 != 0] = 0
    check.equal(np.all(np.abs(G1 - GN1) < u), True, "G1-GN1 %s\n" % np.abs(G1 - GN1))

    # --------------------------------------------------------------------------------
    # For (P, T): P[index] = np.random.uniform(), index label T = index
    # --------------------------------------------------------------------------------
    for _ in range(NUM_MAX_TEST_TIMES):
        M = np.random.randint(2, NUM_MAX_NODES)     # M > 1
        T2 = TYPE_LABEL(np.random.randint(0, M))    # location of the truth
        P2 = np.zeros(M, dtype=TYPE_FLOAT)
        while not (x := TYPE_FLOAT(np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID))):
            pass
        p = softmax(x)
        P2[T2] = p

        # --------------------------------------------------------------------------------
        # The Jacobian G shape is the same as P.shape.
        # G:[0, 0, ..., g, 0, ...] where g is the numerical gradient, close to -1/(1+k) for offset k.
        # --------------------------------------------------------------------------------
        EN2 = np.zeros_like(P2, dtype=TYPE_FLOAT)   # Expected numerical gradient
        EN2[T2] = TYPE_FLOAT(-1) * (logarithm(p + h) - logarithm(p - h)) / TYPE_FLOAT(2 * h)
        N2 = numerical_jacobian(partial(f, T=T2), P2)

        # The numerical gradient gn = (-t * logarithm(p+h) + t * logarithm(p-h)) / 2h
        assert N2.shape == EN2.shape
        assert np.all(np.abs(N2-EN2) < u), \
            f"Delta expected to be < {u} but \n{np.abs(N2-EN2)}"

        G2 = np.zeros_like(P2, dtype=TYPE_FLOAT)
        G2[T2] = -1 / p

        # The numerical gradient gn is within +/- u of the analytical g = -T/P
        check.equal(np.all(np.abs(G2 - N2) < u), True, "G2-N2 %s\n" % np.abs(G2 - N2))
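# --------------------------------------------------------------------------------
# Illustrative sketch (plain numpy): the analytical gradient of the categorical
# log loss with respect to P is dL/dP = -T/P, i.e. d(-log p)/dp = -1/p, which a
# central difference reproduces at a non-degenerate p. This is the relationship
# tested above with numerical_jacobian().
# --------------------------------------------------------------------------------
def _sketch_log_loss_gradient_wrt_p(p=0.3, h=1e-6):
    numerical = (-np.log(p + h) + np.log(p - h)) / (2 * h)
    analytical = -1.0 / p
    assert np.isclose(numerical, analytical, rtol=1e-4)
    return numerical, analytical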
def test_010_base_instance_properties(): """ Objective: Verify the layer class validates the parameters have been initialized before accessed. Expected: Initialization detects the access to the non-initialized parameters and fails. """ msg = "Accessing uninitialized property of the _layer must fail." M: int = np.random.randint(1, NUM_MAX_NODES) name = "test_010_base" _layer = Layer(name=name, num_nodes=M, log_level=logging.DEBUG) # -------------------------------------------------------------------------------- # To pass # -------------------------------------------------------------------------------- try: if not _layer.name == name: raise RuntimeError("layer.name == name should be true") except AssertionError: raise RuntimeError( "Access to name should be allowed as already initialized.") try: if not _layer.M == M: raise RuntimeError("layer.M == M should be true") except AssertionError: raise RuntimeError( "Access to M should be allowed as already initialized.") try: if not isinstance(_layer.logger, logging.Logger): raise RuntimeError( "isinstance(layer.logger, logging.Logger) should be true") except AssertionError: raise RuntimeError( "Access to logger should be allowed as already initialized.") # -------------------------------------------------------------------------------- # To fail # -------------------------------------------------------------------------------- try: print(_layer.D) raise RuntimeError(msg) except AssertionError: pass try: print(_layer.X) raise RuntimeError(msg) except AssertionError: pass try: _layer.X = int(1) raise RuntimeError(msg) except AssertionError: pass try: print(_layer.dX) raise RuntimeError(msg) except AssertionError: pass try: print(_layer.Y) raise RuntimeError(msg) except AssertionError: pass try: _layer._Y = int(1) print(_layer.Y) raise RuntimeError(msg) except AssertionError: pass try: print(_layer.dY) raise RuntimeError(msg) except AssertionError: pass try: _layer._dY = int(1) print(_layer.dY) raise RuntimeError(msg) except AssertionError: pass try: print(_layer.T) raise RuntimeError(msg) except AssertionError: pass try: _layer.T = TYPE_LABEL(1) raise RuntimeError(msg) except AssertionError: pass try: # pylint: disable=not-callable _layer.objective(np.array(1.0, dtype=TYPE_FLOAT)) raise RuntimeError(msg) except AssertionError: pass try: print(_layer.N) raise RuntimeError(msg) except AssertionError: pass assert _layer.name == name assert _layer.num_nodes == M
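# --------------------------------------------------------------------------------
# Illustrative sketch of the guard pattern the test above relies on (the actual
# Layer base class is defined elsewhere in this repository): a property asserts
# that it has been initialized before it can be read, so any access before
# assignment raises AssertionError.
# --------------------------------------------------------------------------------
class _SketchGuardedLayer:
    """Hypothetical minimal layer showing uninitialized-property protection."""
    def __init__(self):
        self._X = None

    @property
    def X(self):
        assert self._X is not None, "X is not initialized"
        return self._X

    @X.setter
    def X(self, value):
        assert isinstance(value, np.ndarray), "X must be a numpy array"
        self._X = value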
def disabled_test_040_objective_methods_2d_ohe(caplog): """ TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating. Objective: Verify the forward path constraints: 1. Layer output L/loss is np.sum(sigmoid_cross_entropy_log_loss) / N. 2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X). Verify the backward path constraints: 1. Analytical gradient G: gradient() == (P-1)/N 2. Analytical gradient G is close to GN: gradient_numerical(). """ caplog.set_level(logging.DEBUG) # -------------------------------------------------------------------------------- # Instantiate a CrossEntropyLogLoss layer # -------------------------------------------------------------------------------- name = "test_040_objective_methods_2d_ohe" profiler = cProfile.Profile() profiler.enable() for _ in range(NUM_MAX_TEST_TIMES): N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE) M: int = 1 # node number is 1 for 0/1 binary classification. layer = CrossEntropyLogLoss( name=name, num_nodes=M, log_loss_function=sigmoid_cross_entropy_log_loss, log_level=logging.DEBUG) # ================================================================================ # Layer forward path # ================================================================================ X = np.random.randn(N, M).astype(TYPE_FLOAT) T = np.zeros_like(X, dtype=TYPE_LABEL) # OHE labels. T[np.arange(N), np.random.randint(0, M, N)] = TYPE_LABEL(1) # log_loss function require (X, T) in X(N, M), and T(N, M) in OHE label format. X, T = transform_X_T(X, T) layer.T = T Logger.debug("%s: X is \n%s\nT is \n%s", name, X, T) # -------------------------------------------------------------------------------- # Expected analytical gradient EG = (dX/dL) = (A-T)/N # -------------------------------------------------------------------------------- A = sigmoid(X) EG = ((A - T).astype(TYPE_FLOAT) / TYPE_FLOAT(N)) # -------------------------------------------------------------------------------- # Total loss Z = np.sum(J)/N # Expected loss EL = sum((1-T)X + np.log(1 + np.exp(-X))) # (J, P) = sigmoid_cross_entropy_log_loss(X, T) and J:shape(N,) where J:shape(N,) # is loss for each input and P is activation by sigmoid(X). # -------------------------------------------------------------------------------- L = layer.function(X) J, P = sigmoid_cross_entropy_log_loss(X, T) EL = np.array(np.sum((1 - T) * X + logarithm(1 + np.exp(-X))) / N, dtype=TYPE_FLOAT) # Constraint: A == P as they are sigmoid(X) assert np.all(np.abs(A-P) < ACTIVATION_DIFF_ACCEPTANCE_VALUE), \ f"Need A==P==sigmoid(X) but A=\n{A}\n P=\n{P}\n(A-P)=\n{(A-P)}\n" # Constraint: Log loss layer output L == sum(J) from the log loss function Z = np.array(np.sum(J) / N, dtype=TYPE_FLOAT) assert np.array_equal(L, Z), \ f"Need log loss layer output L == sum(J) but L=\n{L}\nZ=\n{Z}." # Constraint: L/loss is close to expected loss EL. assert np.all(np.abs(EL-L) < LOSS_DIFF_ACCEPTANCE_VALUE), \ "Need EL close to L but \nEL=\n{EL}\nL=\n{L}\n" # -------------------------------------------------------------------------------- # constraint: gradient_numerical() == numerical_jacobian(objective, X) # TODO: compare the diff to accommodate numerical errors. 
# -------------------------------------------------------------------------------- GN = layer.gradient_numerical() # [dL/dX] from the layer def objective(x): """Function to calculate the scalar loss L for cross entropy log loss""" j, p = sigmoid_cross_entropy_log_loss(x, T) return np.array(np.sum(j) / N, dtype=TYPE_FLOAT) EGN = numerical_jacobian(objective, X) # Expected numerical dL/dX assert np.array_equal(GN[0], EGN), \ f"GN[0]==EGN expected but GN[0] is \n%s\n EGN is \n%s\n" % (GN[0], EGN) # ================================================================================ # Layer backward path # ================================================================================ # constraint: Analytical gradient G: gradient() == (P-1)/N. dY = TYPE_FLOAT(1) G = layer.gradient(dY) assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \ f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}." # constraint: Analytical gradient G is close to GN: gradient_numerical(). assert \ np.allclose(GN[0], G, atol=GRADIENT_DIFF_ACCEPTANCE_VALUE, rtol=GRADIENT_DIFF_ACCEPTANCE_RATIO), \ f"dX is \n{G}\nGN[0] is \n{GN[0]}\nRDiff is \n{G-GN[0]}.\n" # constraint: Gradient g of the log loss layer needs -1 < g < 1 # abs(P-T) = abs(sigmoid(X)-T) cannot be > 1. assert np.all(np.abs(G) < 1), \ f"Log loss layer gradient cannot be < -1 nor > 1 but\n{G}" assert np.all(np.abs(GN[0]) < (1+GRADIENT_DIFF_ACCEPTANCE_RATIO)), \ f"Log loss layer gradient cannot be < -1 nor > 1 but\n{GN[0]}" profiler.disable() profiler.print_stats(sort="cumtime")
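# --------------------------------------------------------------------------------
# Illustrative sketch (plain numpy): the expected loss EL above uses the
# reformulated sigmoid cross entropy log loss
#   -(T*log(sigmoid(X)) + (1-T)*log(1-sigmoid(X))) == (1-T)*X + log(1 + exp(-X))
# which avoids evaluating log(sigmoid(X)) directly. The two forms agree for
# moderate X.
# --------------------------------------------------------------------------------
def _sketch_sigmoid_log_loss_reformulation():
    x = np.linspace(-5.0, 5.0, 11)
    t = (np.arange(11) % 2).astype(float)
    z = 1.0 / (1.0 + np.exp(-x))
    naive = -(t * np.log(z) + (1.0 - t) * np.log(1.0 - z))
    reformulated = (1.0 - t) * x + np.log(1.0 + np.exp(-x))
    assert np.allclose(naive, reformulated)
    return naive, reformulated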
def disabled_test_040_objective_methods_1d_ohe(): """ TODO: Disabled as need to redesign numerical_jacobian for 32 bit floating. Objective: Verify the forward path constraints: 1. Layer output L/loss is np.sum(cross_entropy_log_loss(sigmoid(X), T, f=logistic_log_loss))) / N. 2. gradient_numerical() == numerical Jacobian numerical_jacobian(O, X). Verify the backward path constraints: 1. Analytical gradient G: gradient() == (P-1)/N 2. Analytical gradient G is close to GN: gradient_numerical(). Expected: Initialization detects the access to the non-initialized parameters and fails. For X.ndim > 0, the layer transform X into 2D so as to use the numpy tuple- like indexing: P[ (0,3), (2,4) ] Hence, the shape of GN, G are 2D. """ # -------------------------------------------------------------------------------- # Instantiate a CrossEntropyLogLoss layer # -------------------------------------------------------------------------------- name = "test_040_objective_methods_1d_ohe" N = 1 for _ in range(NUM_MAX_TEST_TIMES): layer = CrossEntropyLogLoss( name=name, num_nodes=1, log_loss_function=sigmoid_cross_entropy_log_loss, log_level=logging.DEBUG) # ================================================================================ # Layer forward path # ================================================================================ X = TYPE_FLOAT( np.random.uniform(low=-BOUNDARY_SIGMOID, high=BOUNDARY_SIGMOID)) T = TYPE_LABEL(np.random.randint(0, 2)) # OHE labels. # log_loss function require (X, T) in X(N, M), and T(N, M) in OHE label format. X, T = transform_X_T(X, T) layer.T = T # Expected analytical gradient dL/dX = (P-T)/N of shape (N,M) A = sigmoid(X) EG = ((A - T) / N).reshape(1, -1).astype(TYPE_FLOAT) Logger.debug("%s: X is \n%s\nT is %s\nP is %s\nEG is %s\n", name, X, T, A, EG) # -------------------------------------------------------------------------------- # constraint: L/loss == np.sum(J) / N. # J, P = sigmoid_cross_entropy_log_loss(X, T) # -------------------------------------------------------------------------------- L = layer.function(X) # L is shape () J, P = sigmoid_cross_entropy_log_loss(X, T) Z = np.array(np.sum(J), dtype=TYPE_FLOAT) / TYPE_FLOAT(N) assert np.array_equal(L, Z), f"LogLoss output should be {L} but {Z}." # -------------------------------------------------------------------------------- # constraint: gradient_numerical() == numerical Jacobian numerical_jacobian(O, X) # Use a dummy layer for the objective function because using the "layer" # updates the X, Y which can interfere the independence of the layer. # -------------------------------------------------------------------------------- GN = layer.gradient_numerical() # [dL/dX] from the layer # -------------------------------------------------------------------------------- # Cannot use CrossEntropyLogLoss.function() to simulate the objective function L. # because it causes applying transform_X_T multiple times. # Because internally transform_X_T(X, T) has transformed T into the index label # in 1D with with length 1 by "T = T.reshape(-1)". # Then providing X in 1D into "dummy.function(x)" re-run "transform_X_T(X, T)" # again. The (X.ndim == T.ndim ==1) as an input and T must be OHE label for such # combination and T.shape == P.shape must be true for OHE labels. # However, T has been converted into the index format already by transform_X_T # (applying transform_X_T multiple times) and (T.shape=(1,1), X.shape=(1, > 1) # that violates the (X.shape == T.shape) constraint. 
# -------------------------------------------------------------------------------- # dummy = CrossEntropyLogLoss( # name="dummy", # num_nodes=M, # log_level=logging.DEBUG # ) # dummy.T = T # dummy.objective = objective # dummy.function(X) # -------------------------------------------------------------------------------- def objective(x): j, p = sigmoid_cross_entropy_log_loss(x, T) return np.array(np.sum(j) / N, dtype=TYPE_FLOAT) EGN = numerical_jacobian(objective, X).reshape(1, -1) # Expected numerical dL/dX assert np.array_equal(GN[0], EGN), \ f"Layer gradient_numerical GN \n{GN} \nneeds to be \n{EGN}." # ================================================================================ # Layer backward path # ================================================================================ # -------------------------------------------------------------------------------- # constraint: Analytical gradient G: gradient() == (P-1)/N. # -------------------------------------------------------------------------------- dY = TYPE_FLOAT(1) G = layer.gradient(dY) assert np.all(np.abs(G-EG) <= GRADIENT_DIFF_ACCEPTANCE_VALUE), \ f"Layer gradient dL/dX \n{G} \nneeds to be \n{EG}." # -------------------------------------------------------------------------------- # constraint: Analytical gradient G is close to GN: gradient_numerical(). # -------------------------------------------------------------------------------- assert \ np.all(np.abs(G-GN[0]) <= GRADIENT_DIFF_ACCEPTANCE_VALUE) or \ np.all(np.abs(G-GN[0]) <= np.abs(GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])), \ "dX is \n%s\nGN is \n%s\nG-GN is \n%s\n Ratio * GN[0] is \n%s.\n" \ % (G, GN[0], G-GN[0], GRADIENT_DIFF_ACCEPTANCE_RATIO * GN[0])
def test_020_adapt_embedding_loss_adapter_gradient_to_succeed(caplog): """ Objective: Verify the Adapter gradient method handles dY in shape (N, 1+SL) Adapter.function(Y) returns - For Y:(N, 1+SL), the return is in shape (N*(1+SL),1). Log loss T is set to the same shape Expected: """ caplog.set_level(logging.DEBUG) name = "test_020_adapt_embedding_logistic_loss_function_multi_lines" sentences = """ Verify the EventIndexing function can handle multi line sentences the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said """ dictionary: EventIndexing = _instantiate_event_indexing() profiler = cProfile.Profile() profiler.enable() for _ in range(NUM_MAX_TEST_TIMES): # First validate the correct configuration, then change parameter one by one. E = target_size = TYPE_INT(np.random.randint(1, 3)) C = context_size = TYPE_INT(2 * np.random.randint(1, 5)) SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5)) event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20)) W: TYPE_TENSOR = np.random.rand(dictionary.vocabulary_size, event_vector_size) loss, adapter, embedding, event_context = _instantiate( name=name, num_nodes=TYPE_INT(1), target_size=target_size, context_size=context_size, negative_sample_size=negative_sample_size, event_vector_size=event_vector_size, dictionary=dictionary, W=W, log_level=logging.DEBUG, ) # ================================================================================ # Forward path # ================================================================================ # -------------------------------------------------------------------------------- # Event indexing # -------------------------------------------------------------------------------- sequences = dictionary.function(sentences) # -------------------------------------------------------------------------------- # Event context pairs # -------------------------------------------------------------------------------- target_context_pairs = event_context.function(sequences) # -------------------------------------------------------------------------------- # Embedding # -------------------------------------------------------------------------------- Y = embedding.function(target_context_pairs) N, _ = embedding.tensor_shape(Y) batch_size = TYPE_FLOAT(N * (1 + SL)) # -------------------------------------------------------------------------------- # Adapter # -------------------------------------------------------------------------------- Z = adapter.function(Y) # -------------------------------------------------------------------------------- # Loss # -------------------------------------------------------------------------------- L = loss.function(Z) # ******************************************************************************** # Constraint: # loss.T is set to the T by adapter.function() # ******************************************************************************** T = np.zeros(shape=(N, (1 + SL)), dtype=TYPE_LABEL) T[::, 0] = TYPE_LABEL(1) assert embedding.all_equal(T.reshape(-1, 1), loss.T), \ "Expected T must equals loss.T. 
Expected\n%s\nLoss.T\n%s\n" % (T, loss.T) # ******************************************************************************** # Constraint: # Expected loss is sum(sigmoid_cross_entropy_log_loss(Y, T)) / (N*(1+SL)) # The batch size for the Log Loss is (N*(1+SL)) # ******************************************************************************** EJ, EP = sigmoid_cross_entropy_log_loss(X=Z, T=T.reshape(-1, 1)) EL = np.sum(EJ, dtype=TYPE_FLOAT) / batch_size assert embedding.all_close(EL, L), \ "Expected EL=L but EL=\n%s\nL=\n%s\nDiff=\n%s\n" % (EL, L, (EL-L)) # ================================================================================ # Backward path # ================================================================================ # ******************************************************************************** # Constraint: # Expected dL/dY from the Log Loss is (P-T)/N # ******************************************************************************** EDY = (sigmoid(Y) - T.astype(TYPE_FLOAT)) / batch_size assert EDY.shape == Y.shape dY = adapter.gradient(loss.gradient(TYPE_FLOAT(1))) assert dY.shape == Y.shape assert embedding.all_close(EDY, dY), \ "Expected EDY==dY. EDY=\n%s\nDiff\n%s\n" % (EDY, (EDY-dY)) profiler.disable() profiler.print_stats(sort="cumtime")
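# --------------------------------------------------------------------------------
# Illustrative sketch of the shape handling the adapter tests assume (the actual
# Adapter/Embedding classes are defined elsewhere): Y:(N, 1+SL) holds one positive
# score followed by SL negative-sample scores per row; the adapter flattens it to
# (N*(1+SL), 1) for the logistic log loss, and the matching labels T are 1 for the
# first column and 0 for the SL negative columns.
# --------------------------------------------------------------------------------
def _sketch_adapter_reshape(N=3, SL=2):
    Y = np.arange(N * (1 + SL), dtype=float).reshape(N, 1 + SL)
    Z = Y.reshape(N * (1 + SL), 1)          # adapter output fed into the loss layer
    T = np.zeros((N, 1 + SL), dtype=int)
    T[:, 0] = 1                             # label 1 for the positive-sample column
    T = T.reshape(-1, 1)                    # same shape as Z
    assert Z.shape == T.shape == (N * (1 + SL), 1)
    return Z, T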
def test_020_adapt_embedding_loss_adapter_function_Y_to_succeed(caplog): """ Objective: Verify the Adapter function handles Y in shape - Y:(N, 1+SL) - ys:(N,SL) - ye:(N,1) Expected: Adapter.function(Y) returns - For Y:(N, 1+SL), the return is in shape (N*(1+SL),1). Log loss T is set to the same shape - For Y:(N, SL), the return is in shape (N*SL,1). Log loss T is set to the same shape - For Y:(N,), the return is in shape (N,1). Log loss T is set to the same shape """ caplog.set_level(logging.DEBUG) name = "test_020_adapt_embedding_logistic_loss_function_multi_lines" sentences = """ Verify the EventIndexing function can handle multi line sentences the asbestos fiber <unk> is unusually <unk> once it enters the <unk> with even brief exposures to it causing symptoms that show up decades later researchers said """ dictionary: EventIndexing = _instantiate_event_indexing() profiler = cProfile.Profile() profiler.enable() for _ in range(NUM_MAX_TEST_TIMES): # First validate the correct configuration, then change parameter one by one. E = target_size = TYPE_INT(np.random.randint(1, 3)) C = context_size = TYPE_INT(2 * np.random.randint(1, 5)) SL = negative_sample_size = TYPE_INT(np.random.randint(1, 5)) event_vector_size: TYPE_INT = TYPE_INT(np.random.randint(5, 20)) W: TYPE_TENSOR = np.random.randn(dictionary.vocabulary_size, event_vector_size) loss, adapter, embedding, event_context = _instantiate( name=name, num_nodes=TYPE_INT(1), target_size=target_size, context_size=context_size, negative_sample_size=negative_sample_size, event_vector_size=event_vector_size, dictionary=dictionary, W=W, log_level=logging.DEBUG, ) sequences = dictionary.function(sentences) target_context_pairs = event_context.function(sequences) Y = embedding.function(target_context_pairs) N, _ = embedding.tensor_shape(Y) # ******************************************************************************** # Constraint: # - Adapter function returns (N*(SL+1),1) with the same values of Y # - Adapter function has set T:(N*(SL+1),1) in the loss layer # ******************************************************************************** msg = "Y must succeed" EZ = expected_Z = embedding.reshape(Y, shape=(N * (SL + 1), 1)) Z = _function_must_succeed(adapter=adapter, Y=Y, msg=msg) assert embedding.all_close( Z, EZ, "Z must close to EZ. Z:\n%s\nEZ\n%s\nDiff\n%s\n" % (Z, EZ, (EZ - Z))) T = np.zeros(shape=(N, (1 + SL)), dtype=TYPE_LABEL) T[::, 0] = TYPE_LABEL(1) T = embedding.reshape(T, shape=(-1, 1)) assert embedding.all_equal(T, loss.T), \ "Expected T must equals loss.T. Expected\n%s\nLoss.T\n%s\n" % (T, loss.T) profiler.disable() profiler.print_stats(sort="cumtime")