def train_binary_classifier(
        N: int,
        D: int,
        M: int,
        X: np.ndarray,
        T: np.ndarray,
        W: np.ndarray,
        log_loss_function: Callable,
        optimizer: Optimizer,
        num_epochs: int = 100,
        test_numerical_gradient: bool = False,
        log_level: int = logging.ERROR,
        callback: Callable = None
):
    """Test case for binary classification with matmul + log loss.

    Builds a Matmul layer and a CrossEntropyLogLoss layer, then for each epoch
    runs the forward path, compares the analytical gradients against the
    expected ones computed directly with numpy, and runs a gradient descent
    update on the Matmul weights.

    Args:
        N: Batch size
        D: Number of features
        M: Number of nodes. 1 for sigmoid and 2 or more for softmax
        X: train data of shape (N, D) and dtype TYPE_FLOAT
        T: integer labels of shape (N,)
        W: weight of shape (M, D+1) and dtype TYPE_FLOAT
        log_loss_function: cross entropy log loss function. Either
            sigmoid_cross_entropy_log_loss or softmax_cross_entropy_log_loss
        optimizer: Optimizer passed to the Matmul layer
        num_epochs: Number of epochs to run
        test_numerical_gradient: Flag if test the analytical gradient with the
            numerical one.
        log_level: logging level passed to the layers
        callback: callback function to invoke at the end of each epoch. It is
            called as callback(W=matmul.W[0]).

    Raises:
        AssertionError: if an argument does not satisfy the shape/dtype
            constraints, or if W is unchanged after a gradient descent step.
    """
    name = __name__
    assert (
        isinstance(T, np.ndarray)
        and np.issubdtype(T.dtype, np.integer)
        and T.ndim == 1
        and T.shape[0] == N
    )
    assert (
        isinstance(X, np.ndarray)
        and X.dtype == TYPE_FLOAT
        and X.ndim == 2
        and X.shape[0] == N
        and X.shape[1] == D
    )
    assert (
        isinstance(W, np.ndarray)
        and W.dtype == TYPE_FLOAT
        and W.ndim == 2
        and W.shape[0] == M
        and W.shape[1] == D + 1
    )
    assert num_epochs > 0 and N > 0 and D > 0
    assert (
        (log_loss_function == sigmoid_cross_entropy_log_loss and M == 1)
        or (log_loss_function == softmax_cross_entropy_log_loss and M >= 2)
    )

    # --------------------------------------------------------------------------------
    # Instantiate a CrossEntropyLogLoss layer
    # --------------------------------------------------------------------------------
    loss = CrossEntropyLogLoss(
        name="loss",
        num_nodes=M,
        log_loss_function=log_loss_function,
        log_level=log_level
    )

    # --------------------------------------------------------------------------------
    # Instantiate a Matmul layer
    # --------------------------------------------------------------------------------
    matmul = Matmul(
        name="matmul",
        num_nodes=M,
        W=W,
        optimizer=optimizer,
        log_level=log_level
    )
    matmul.objective = loss.function

    num_no_progress: int = 0     # how many successive epochs the loss L has not decreased.
    loss.T = T
    # Seed the history with the loss of the initial weights so that
    # history[-1] is always available inside the loop.
    history: List[np.ndarray] = [loss.function(matmul.function(X))]

    for i in range(num_epochs):
        # --------------------------------------------------------------------------------
        # Layer forward path
        # Calculate the matmul output Y=f(X), and get the loss L = objective(Y)
        # --------------------------------------------------------------------------------
        Y = matmul.function(X)
        L = loss.function(Y)

        if not (i % 50):
            print(f"iteration {i} Loss {L}")
        Logger.info("%s: iteration[%s]. Loss is [%s]", name, i, L)

        # --------------------------------------------------------------------------------
        # Constraint: 1. Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # Every non-improving epoch is counted. Only the warning is throttled
        # (to epochs where i % 20 == 1) to keep the log readable. Epoch 0 is
        # skipped because it re-evaluates the same weights as the seed loss.
        # --------------------------------------------------------------------------------
        if i > 0 and L >= history[-1]:
            num_no_progress += 1
            if (i % 20) == 1:
                Logger.warning(
                    "Iteration [%i]: Loss[%s] has not improved from the previous [%s].",
                    i, L, history[-1]
                )
            if num_no_progress > 20:
                Logger.error(
                    "The training has no progress more than %s times.",
                    num_no_progress
                )
                # Deliberately no break: the training continues to the last epoch.
        else:
            num_no_progress = 0

        history.append(L)

        # --------------------------------------------------------------------------------
        # Expected dL/dW.T = X.T @ dL/dY = X.T @ (P-T) / N, and dL/dX = dL/dY @ W
        # P = sigmoid(X) or softmax(X)
        # dL/dX = dL/dY * W is to use W BEFORE updating W.
        # --------------------------------------------------------------------------------
        P = None
        if log_loss_function == sigmoid_cross_entropy_log_loss:
            # P = sigmoid(np.matmul(X, W.T))
            P = sigmoid(np.matmul(matmul.X, matmul.W.T))
            P = P - T.reshape(-1, 1)    # T(N,) -> T(N,1) to align with P(N,1)
            assert P.shape == (N, 1), \
                "P.shape is %s T.shape is %s" % (P.shape, T.shape)

        elif log_loss_function == softmax_cross_entropy_log_loss:
            # matmul.X.shape is (N, D+1), matmul.W.T.shape is (D+1, M)
            P = softmax(np.matmul(matmul.X, matmul.W.T))    # (N, M)
            P[np.arange(N), T] -= 1

        EDX = np.matmul(P / N, matmul.W)        # (N,M) @ (M, D+1) -> (N, D+1)
        EDX = EDX[::, 1:]                       # Hide the bias -> (N, D)
        EDW = np.matmul(matmul.X.T, P / N).T    # ((D+1,N) @ (N, M)).T -> (M, D+1)

        # --------------------------------------------------------------------------------
        # Layer backward path
        # 1. Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dL/dY.
        # 2. Gradient descent to update the weights W.
        # --------------------------------------------------------------------------------
        before = copy.deepcopy(matmul.W)
        dY = loss.gradient(TYPE_FLOAT(1))
        dX = matmul.gradient(dY)

        # Gradient descent. The first element of the returned list is used as
        # dL/dW of shape (M, D+1); dL/dX of shape (N, D) comes from gradient().
        # NOTE(review): presumably update() returns [dL/dW] only, since
        # [dX] + dS is compared with the numerical [dL/dX, dL/dW] below - confirm.
        dS = matmul.update()
        dW = dS[0]

        # --------------------------------------------------------------------------------
        # Constraint 1. W in the matmul has been updated by the gradient descent.
        # --------------------------------------------------------------------------------
        Logger.debug("W after is \n%s", matmul.W)
        assert not np.array_equal(before, matmul.W), "W has not been updated."

        # A mismatch with the expected gradients is reported, not asserted.
        if not validate_against_expected_gradient(EDX, dX):
            Logger.warning("Expected dL/dX \n%s\nDiff\n%s", EDX, EDX - dX)

        if not validate_against_expected_gradient(EDW, dW):
            Logger.warning("Expected dL/dW \n%s\nDiff\n%s", EDW, EDW - dW)

        if test_numerical_gradient:
            # --------------------------------------------------------------------------------
            # Numerical gradients gn=[dL/dX, dL/dW]
            # dL/dX.shape = (N, D)
            # dL/dW.shape = (M, D+1)
            # --------------------------------------------------------------------------------
            gn = matmul.gradient_numerical()
            validate_against_numerical_gradient([dX] + dS, gn, Logger)

        if callback:
            # if W.shape[1] == 1 else callback(W=np.average(matmul.W, axis=0))
            callback(W=matmul.W[0])
def disabled_test_020_matmul_round_trip():
    """
    TODO:
        Disabled as need to re-design numerical_jacobian for 32 bit float e.g TF.

    Objective:
        Verify the forward and backward paths at matmul.

    Expected:
        Forward path:
        1. Matmul function(X) == X @ W.T
        2. Numerical gradient should be the same with numerical Jacobian

        Backward path:
        3. Analytical gradient dL/dX == dY @ W
        4. Analytical dL/dW == X.T @ dY
        5. Analytical gradients are similar to the numerical gradient ones

        Gradient descent
        6. W is updated via the gradient descent.
        7. Objective L is decreasing via the gradient descent.

    The whole run is profiled with cProfile and the statistics are printed
    when all the iterations have passed.
    """
    def objective(X: np.ndarray) -> Union[float, np.ndarray]:
        """Dummy objective function to calculate the loss L"""
        return np.sum(X)

    profiler = cProfile.Profile()
    profiler.enable()

    try:
        for _ in range(NUM_MAX_TEST_TIMES):
            # --------------------------------------------------------------------------------
            # Instantiate a Matmul layer
            # --------------------------------------------------------------------------------
            N: int = np.random.randint(1, NUM_MAX_BATCH_SIZE)
            M: int = np.random.randint(1, NUM_MAX_NODES)
            D: int = np.random.randint(1, NUM_MAX_FEATURES)
            W = weights.he(M, D + 1)
            name = "test_020_matmul_methods"

            # Test both static instantiation and build()
            if TYPE_FLOAT(np.random.uniform()) < 0.5:
                matmul = Matmul(name=name, num_nodes=M, W=W, log_level=logging.DEBUG)
            else:
                matmul_spec = {
                    _NAME: "test_020_matmul_builder_to_fail_matmul_spec",
                    _NUM_NODES: M,
                    _NUM_FEATURES: D,
                    _WEIGHTS: {
                        _SCHEME: "he",
                    },
                    _OPTIMIZER: {
                        _SCHEME: "sGd"
                    }
                }
                matmul = Matmul.build(matmul_spec)

            matmul.objective = objective

            # ================================================================================
            # Layer forward path
            # Calculate the layer output Y=f(X), and get the loss L = objective(Y)
            # Test the numerical gradient dL/dX=matmul.gradient_numerical().
            #
            # Note that bias columns are added inside the matmul layer instance, hence
            # matmul.X.shape is (N, 1+D), matmul.W.shape is (M, 1+D)
            # ================================================================================
            X = np.random.randn(N, D).astype(TYPE_FLOAT)
            Logger.debug("%s: X is \n%s", name, X)

            # pylint: disable=not-callable
            Y = matmul.function(X)
            # pylint: disable=not-callable
            L = matmul.objective(Y)

            # Constraint 1 : Matmul outputs Y should be X @ W.T
            assert np.array_equal(Y, np.matmul(matmul.X, matmul.W.T))

            # Constraint 2: Numerical gradient should be the same with numerical Jacobian
            GN = matmul.gradient_numerical()    # [dL/dX, dL/dW]

            # DO NOT use matmul.function() as the objective function for numerical_jacobian().
            # The state of the layer will be modified.
            # LX = lambda x: matmul.objective(matmul.function(x))
            def LX(x):
                y = np.matmul(x, matmul.W.T)
                # pylint: disable=not-callable
                return matmul.objective(y)

            EGNX = numerical_jacobian(LX, matmul.X)     # Numerical dL/dX including bias
            EGNX = EGNX[::, 1::]                        # Remove bias for dL/dX
            assert np.array_equal(GN[0], EGNX), \
                "GN[0]\n%s\nEGNX=\n%s\n" % (GN[0], EGNX)

            # DO NOT use matmul.function() as the objective function for numerical_jacobian().
            # The state of the layer will be modified.
            # LW = lambda w: matmul.objective(np.matmul(X, w.T))
            def LW(w):
                Y = np.matmul(matmul.X, w.T)
                # pylint: disable=not-callable
                return matmul.objective(Y)

            EGNW = numerical_jacobian(LW, matmul.W)     # Numerical dL/dW including bias
            assert np.array_equal(GN[1], EGNW)          # No need to remove bias

            # ================================================================================
            # Layer backward path
            # Calculate the analytical gradient dL/dX=matmul.gradient(dL/dY) with a dummy dL/dY.
            # ================================================================================
            dY = np.ones_like(Y)
            dX = matmul.gradient(dY)

            # Constraint 3: Matmul gradient dL/dX should be dL/dY @ W. Use a dummy dL/dY = 1.0.
            expected_dX = np.matmul(dY, matmul.W)
            expected_dX = expected_dX[::, 1::]          # Omit bias
            assert np.array_equal(dX, expected_dX)

            # Constraint 5: Analytical gradient dL/dX close to the numerical gradient GN.
            assert np.all(np.abs(dX - GN[0]) < GRADIENT_DIFF_ACCEPTANCE_VALUE), \
                "dX need close to GN[0]. dX:\n%s\ndiff \n%s\n" % (dX, dX-GN[0])

            # --------------------------------------------------------------------------------
            # Gradient update.
            # Run the gradient descent to update the weights W.
            # --------------------------------------------------------------------------------
            # Back up the weights the layer actually holds. When the layer is
            # created via Matmul.build(), the local W is not handed to the layer,
            # so comparing against W would make the update check vacuous.
            backup = copy.deepcopy(matmul.W)

            # Gradient descent. The first element of the returned list is used as dL/dW.
            dS = matmul.update()
            dW = dS[0]

            # Constraint 6.: W has been updated by the gradient descent.
            assert np.any(backup != matmul.W), "W has not been updated "

            # Constraint 5 (cont.): the numerical gradients (dL/dX, dL/dW) are close to
            # the analytical ones. Constraint 4 (dL/dW == X.T @ dY) is not asserted
            # directly; dL/dW is only compared with the numerical gradient here.
            assert validate_against_expected_gradient(GN[0], dX), \
                "dX=\n%s\nGN[0]=\n%sdiff=\n%s\n" % (dX, GN[0], (dX-GN[0]))
            assert validate_against_expected_gradient(GN[1], dW), \
                "dW=\n%s\nGN[1]=\n%sdiff=\n%s\n" % (dW, GN[1], (dW-GN[1]))

            # Constraint 7: gradient descent progressing with the new objective L(Yn+1) < L(Yn)
            # pylint: disable=not-callable
            assert np.all(objective(matmul.function(X)) < L)
    finally:
        # Always stop profiling, even when an assertion above has failed.
        profiler.disable()

    profiler.print_stats(sort="cumtime")
def validate_relu_neuron_training(
        matmul: Matmul,
        activation: ReLU,
        loss: CrossEntropyLogLoss,
        X: np.ndarray,
        T: np.ndarray,
        num_epochs: int = 100,
        test_numerical_gradient: bool = False,
        callback: Callable = None
):
    """Train a matmul -> ReLU -> loss stack and validate each round trip.

    The layers are wired as matmul.objective = loss(activation(.)) and
    activation.objective = loss(.). Each epoch evaluates the loss, validates
    the layer round trip against the expected gradient of the log loss, and
    runs a gradient descent update on the matmul layer.

    Args:
        matmul: Matmul layer to train
        activation: ReLU layer placed after the matmul
        loss: CrossEntropyLogLoss layer placed after the activation
        X: train data
        T: labels, set on the loss layer as loss.T
        num_epochs: maximum number of epochs to run
        test_numerical_gradient: accepted but not used in this function body
        callback: if given, called as callback(W=matmul.W) at each epoch end

    Returns:
        List of the loss values, one per completed epoch. The epoch that
        aborts the training for lack of progress is not recorded.
    """
    # Wire the objective functions from the loss layer backwards.
    activation.objective = loss.function
    matmul.objective = compose(activation.function, loss.function)
    objective = compose(matmul.function, matmul.objective)

    stalled_epochs: int = 0     # successive epochs where the loss has not decreased.
    losses: List[np.ndarray] = []
    loss.T = T

    for epoch in range(num_epochs):
        current_loss = objective(X)

        # Expected dL/dA from the softmax log loss on top of the ReLU output.
        probabilities = softmax(relu(np.matmul(matmul.X, matmul.W.T)))
        expected_dA = expected_gradient_from_log_loss(
            P=probabilities, T=T, N=X.shape[0]
        )

        # ********************************************************************************
        # Constraint: Expected gradients must match actual
        # ********************************************************************************
        validate_relu_neuron_round_trip(
            matmul=matmul,
            activation=activation,
            X=X,
            dA=expected_dA
        )

        # --------------------------------------------------------------------------------
        # Gradient descent on the matmul weights
        # --------------------------------------------------------------------------------
        W_before = copy.deepcopy(matmul.W)
        matmul.update()

        # ********************************************************************************
        # Constraint. W in the matmul has been updated by the gradient descent.
        # Reported as a warning only.
        # ********************************************************************************
        Logger.debug("W after is \n%s", matmul.W)
        if np.array_equal(W_before, matmul.W):
            Logger.warning("W has not been updated")

        # ********************************************************************************
        # Constraint: Objective/Loss L(Yn+1) after gradient descent < L(Yn)
        # ********************************************************************************
        no_improvement = epoch > 0 and current_loss >= losses[-1]
        if not no_improvement:
            stalled_epochs = 0
        else:
            stalled_epochs += 1
            Logger.warning(
                "Iteration [%i]: Loss[%s] has not improved from the previous [%s] for %s times.",
                epoch, current_loss, losses[-1], stalled_epochs
            )
            # --------------------------------------------------------------------------------
            # Reducing the learning rate can make the situation worse.
            # When the lr was reduced at every non-improvement, the non-improvements
            # became successive and eventually exceeded 50, ending in failure.
            # Hence the lr is decayed only after more than 5 successive stalls.
            # --------------------------------------------------------------------------------
            if stalled_epochs > 5:
                matmul.lr = matmul.lr * 0.95

            if stalled_epochs > 50:
                Logger.error(
                    "The training has no progress more than %s times.",
                    stalled_epochs
                )
                break

        losses.append(current_loss)
        if callback:
            callback(W=matmul.W)

    return losses