def check_backward():
    model_base.assign_weights(tf_weights)
    tester = Tester()
    optimizer.dump_gradients_code()

    tf_grads = tf_model.compute_gradients(tf_model_base, labels=y_train[:BATCH_SIZE], input=X_train[:BATCH_SIZE])
    np_grads = optimizer.compute_gradients(labels=y_train[:BATCH_SIZE], input=X_train[:BATCH_SIZE])
    for idx, (tf_grad, np_grad) in enumerate(zip(tf_grads, np_grads)):
        tester.check_tensors('grad_{}'.format(idx), tf_grad.numpy(), np_grad)
    tester.end()
def check_forward():
    model_base.assign_weights(tf_weights)
    tester = Tester()

    tf_logits = tf_model_base(X_train[:BATCH_SIZE]).numpy()
    nn_logits = model_base(X_train[:BATCH_SIZE], dump_code=True)
    tester.check_tensors('logits', tf_logits, nn_logits)

    tf_probs = tf_model_probs(X_train[:BATCH_SIZE]).numpy()
    nn_probs = model_probs(X_train[:BATCH_SIZE])
    tester.check_tensors('probs', tf_probs, nn_probs)

    tf_loss = tf_model.compute_loss(tf_model_base, labels=y_train[:BATCH_SIZE], input=X_train[:BATCH_SIZE]).numpy()
    nn_loss = optimizer.loss(labels=y_train[:BATCH_SIZE], input=X_train[:BATCH_SIZE])
    tester.check_tensors('loss', tf_loss, nn_loss)
    tester.end()
ts = Tester()

### Forward pass

def bias_add(lhs, rhs):
    return np.add(lhs, rhs)

X = np.random.randn(13, 7)
y = np.random.randn(7)
out_tf = tf.nn.bias_add(X, y).numpy()
out_np = bias_add(X, y)
ts.check_tensors('bias_add', out_tf, out_np)

### Backward pass

def tf_bias_add_grad(lhs, rhs, dout):
    v_lhs = tf.Variable(lhs)
    v_rhs = tf.Variable(rhs)
    with tf.GradientTape() as tape:
        out = tf.nn.bias_add(v_lhs, v_rhs)
        loss = tf.reduce_sum(out * dout)
    d_lhs, d_rhs = tape.gradient(loss, [v_lhs, v_rhs])
    return d_lhs.numpy(), d_rhs.numpy()
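### A hedged sketch (not in the original snippet) of the matching NumPy
### gradients: d_lhs is simply dout, d_rhs sums dout over the batch axis.
### It reuses X, y, tf_bias_add_grad and the Tester instance ts from above.
def bias_add_grad(lhs, rhs, dout):
    d_lhs = dout
    d_rhs = np.sum(dout, axis=0)
    return d_lhs, d_rhs

dout = np.random.randn(13, 7)
tf_d_lhs, tf_d_rhs = tf_bias_add_grad(X, y, dout)
np_d_lhs, np_d_rhs = bias_add_grad(X, y, dout)
ts.check_tensors('bias_add_grad_lhs', tf_d_lhs, np_d_lhs)
ts.check_tensors('bias_add_grad_rhs', tf_d_rhs, np_d_rhs)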
import os
import sys
sys.path.append(os.getcwd())

import numpy as np
import tensorflow as tf

from tester import Tester

ts = Tester()

X = 5 * np.random.randn(12, 6)

def softmax(x):
    e_x = np.exp(x - np.max(x, axis=1).reshape(-1, 1))
    return e_x / e_x.sum(axis=1).reshape(-1, 1)

res_tf = tf.nn.softmax(X, axis=1).numpy()
res_np = softmax(X)
ts.check_tensors('softmax', res_tf, res_np)

ts.end()
def log_softmax(x):
    x_max = np.max(x, axis=1).reshape(-1, 1)
    logsum = np.log(np.sum(np.exp(x - x_max), axis=1)).reshape(-1, 1)
    return x - x_max - logsum

def softmax_cross_entropy_with_logits(labels, logits):
    return -np.sum(labels * log_softmax(logits), axis=1)

X = 5 * np.random.randn(13, 7)
y = softmax(np.random.randn(13, 7))
res_tf = tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=X).numpy()
res_np = softmax_cross_entropy_with_logits(labels=y, logits=X)
ts.check_tensors('softmax_cross_entropy_with_logits', res_tf, res_np)

### Compute Softmax Cross Entropy grad
# Start from the log-softmax grad operation:
#   dX = dout - softmax(x) * sum(dout, axis=1).reshape(-1, 1)
#
# Combine with the rest of the cross-entropy loss to get the gradient:
#   t  = -dL.reshape(-1, 1) * y
#   dX = t - softmax(x) * sum(t, axis=1).reshape(-1, 1)
#      = -dL.reshape(-1, 1) * y + softmax(x) * sum(dL * y, axis=1).reshape(-1, 1)
# Since sum(y, axis=1) = 1, we have sum(dL * y, axis=1) = dL, so:
#   dX = softmax(x) * dL.reshape(-1, 1) - dL.reshape(-1, 1) * y
#      = dL.reshape(-1, 1) * (softmax(x) - y)
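### A minimal sketch (not in the original snippet) turning the derivation
### above into code and checking it against TensorFlow's autodiff. It reuses
### softmax, X, y and the Tester instance ts defined earlier.
def softmax_cross_entropy_with_logits_grad(labels, logits, dout):
    # dX = dL.reshape(-1, 1) * (softmax(x) - y)
    return dout.reshape(-1, 1) * (softmax(logits) - labels)

def tf_softmax_cross_entropy_with_logits_grad(labels, logits, dout):
    v_logits = tf.Variable(logits)
    with tf.GradientTape() as tape:
        out = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=v_logits)
        loss = tf.reduce_sum(out * dout)
    return tape.gradient(loss, v_logits).numpy()

dout = np.random.randn(13)
grad_tf = tf_softmax_cross_entropy_with_logits_grad(y, X, dout)
grad_np = softmax_cross_entropy_with_logits_grad(y, X, dout)
ts.check_tensors('softmax_cross_entropy_with_logits_grad', grad_tf, grad_np)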
ts = Tester()

### Forward pass

def matmul(lhs, rhs):
    return np.matmul(lhs, rhs)

X = np.random.randn(13, 7)
Y = np.random.randn(7, 29)
out_tf = tf.matmul(X, Y).numpy()
out_np = matmul(X, Y)
ts.check_tensors('matmul', out_tf, out_np)

### Backward pass

def tf_matmul_grad(lhs, rhs, dout):
    v_lhs = tf.Variable(lhs)
    v_rhs = tf.Variable(rhs)
    with tf.GradientTape() as tape:
        out = tf.matmul(v_lhs, v_rhs)
        loss = tf.reduce_sum(out * dout)
    d_lhs, d_rhs = tape.gradient(loss, [v_lhs, v_rhs])
    return d_lhs.numpy(), d_rhs.numpy()
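### A hedged sketch (not in the original snippet) of the matching NumPy
### matmul gradients: d_lhs = dout @ rhs.T and d_rhs = lhs.T @ dout. It reuses
### X, Y, tf_matmul_grad and the Tester instance ts from above.
def matmul_grad(lhs, rhs, dout):
    d_lhs = np.matmul(dout, rhs.T)
    d_rhs = np.matmul(lhs.T, dout)
    return d_lhs, d_rhs

dout = np.random.randn(13, 29)
tf_d_lhs, tf_d_rhs = tf_matmul_grad(X, Y, dout)
np_d_lhs, np_d_rhs = matmul_grad(X, Y, dout)
ts.check_tensors('matmul_grad_lhs', tf_d_lhs, np_d_lhs)
ts.check_tensors('matmul_grad_rhs', tf_d_rhs, np_d_rhs)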
### Compute LogSoftmax
# More info at https://stackoverflow.com/questions/61567597/how-is-log-softmax-implemented-to-compute-its-value-and-gradient-with-better

X = 5 * np.random.randn(13, 7)

def log_softmax(x):
    x_max = np.max(x, axis=1).reshape(-1, 1)
    logsum = np.log(np.sum(np.exp(x - x_max), axis=1)).reshape(-1, 1)
    return x - x_max - logsum

res_tf = tf.nn.log_softmax(X, axis=1).numpy()
res_np = log_softmax(X)
ts.check_tensors('log_softmax', res_tf, res_np)

### Compute LogSoftmax grad
# More info at https://stackoverflow.com/questions/35304393/trying-to-understand-code-that-computes-the-gradient-wrt-to-the-input-for-logsof
#   grad_input_i = grad_output_i - exp(output_i) * sum_j(grad_output_j)

def tf_log_softmax_grad(x, dy):
    vX = tf.Variable(x)
    with tf.GradientTape() as tape:
        y = tf.nn.log_softmax(vX, axis=1)
        loss = tf.reduce_sum(y * dy)
    return tape.gradient(loss, vX).numpy()
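### A minimal sketch (not in the original snippet) of the NumPy counterpart,
### applying the formula above row by row; exp(log_softmax(x)) is just the
### softmax of x. It reuses X, tf_log_softmax_grad and the Tester instance ts.
def log_softmax_grad(x, dout):
    # dX = dout - softmax(x) * sum(dout, axis=1)
    return dout - np.exp(log_softmax(x)) * np.sum(dout, axis=1).reshape(-1, 1)

dout = np.random.randn(13, 7)
grad_tf = tf_log_softmax_grad(X, dout)
grad_np = log_softmax_grad(X, dout)
ts.check_tensors('log_softmax_grad', grad_tf, grad_np)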
from tester import Tester

ts = Tester()

### Forward pass

def relu(x):
    return np.maximum(x, 0)

x = np.random.randn(7, 9, 3)
res_tf = tf.nn.relu(x).numpy()
res_np = relu(x)
ts.check_tensors('relu', res_tf, res_np)

### Backward pass

def tf_relu_grad(x, dout):
    v_x = tf.Variable(x)
    with tf.GradientTape() as tape:
        out = tf.nn.relu(v_x)
        loss = tf.reduce_sum(out * dout)
    return tape.gradient(loss, v_x).numpy()

def relu_prime(x):
    return np.where(x > 0, 1.0, 0.0)
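### A minimal usage sketch (not in the original snippet): the ReLU gradient is
### the upstream gradient masked by relu_prime, checked against the TensorFlow
### reference above using the same x and Tester instance ts.
dout = np.random.randn(7, 9, 3)
grad_tf = tf_relu_grad(x, dout)
grad_np = relu_prime(x) * dout
ts.check_tensors('relu_grad', grad_tf, grad_np)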
from tester import Tester

ts = Tester()

### Forward pass

def sum(x, axis):
    return np.sum(x, axis=axis)

X = np.random.randn(456)
res_tf = tf.reduce_sum(X, axis=0).numpy()
res_np = sum(X, axis=0)
ts.check_tensors('sum1', res_tf, res_np)

### Backward pass

def tf_sum_grad(x, axis, dout):
    vX = tf.Variable(x)
    with tf.GradientTape() as tape:
        out = tf.reduce_sum(vX, axis=axis)
        loss = tf.reduce_sum(out * dout)
    return tape.gradient(loss, vX).numpy()

def sum_grad(x, axis, dout):
    shape = list(x.shape)
    # The gradient of reduce_sum broadcasts dout back over the reduced axis.
    shape[axis] = 1
    return np.ones(x.shape) * np.reshape(dout, shape)
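### A minimal usage sketch (not in the original snippet) checking the NumPy
### sum gradient against the TensorFlow reference defined above.
dout = np.random.randn()
grad_tf = tf_sum_grad(X, 0, dout)
grad_np = sum_grad(X, 0, dout)
ts.check_tensors('sum1_grad', grad_tf, grad_np)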