import numpy as np

from backprop import Node


def dotND(a, b):
    val = a.val.dot(b.val)

    def jacob(grad):
        a_val = a.val
        b_val = b.val
        # promote 2-D operands so the batched indexing below is valid
        # (the original dropped the expand_dims result; assign it here)
        if len(a_val.shape) < 3:
            a_val = np.expand_dims(a_val, -1)
        if len(b_val.shape) < 3:
            b_val = np.expand_dims(b_val, -1)
        a_grad = np.zeros(a_val.shape)
        b_grad = np.zeros(b_val.shape)
        # loop over the leading (batch) indices, then accumulate row/column gradients
        for index in np.ndindex(*(a_grad.shape[:-2] + b_grad.shape[:-2])):
            for i in range(grad.shape[0]):
                a_grad[index, i, :] = np.sum(grad[index, (i, ), :] * b_val[index], axis=-1)
            for j in range(grad.shape[1]):
                b_grad[index, :, j] = np.sum(grad[index, :, (j, )] * a_val[index], axis=-2)
        return np.reshape(a_grad, a.val.shape), np.reshape(b_grad, b.val.shape)

    return Node(val, (a, b), jacob)

def softmax(a):
    def _softmax(x):
        e_x = np.exp(x - np.expand_dims(np.max(x, axis=-1), -1))
        return e_x / np.expand_dims(e_x.sum(axis=-1), axis=-1)

    val = _softmax(a.val)
    # grad not included: the derivative of the loss with respect to the logits
    # can be used directly (softmax combined with cross-entropy)
    return Node(val, (a, ), None)

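# Quick sanity check of the comment above, in plain NumPy and independent of the
# Node class (illustrative helper, not used by the library): for cross-entropy
# loss the gradient with respect to the logits is softmax(logits) - y_onehot,
# which is why softmax itself does not need a jacob.
def _check_softmax_grad():
    rng = np.random.default_rng(0)
    logits = rng.standard_normal((4, 3))
    y_onehot = np.eye(3)[rng.integers(0, 3, size=4)]

    def _sm(x):
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / e_x.sum(axis=-1, keepdims=True)

    def _ce(z):
        return -np.sum(y_onehot * np.log(_sm(z)))

    analytic = _sm(logits) - y_onehot
    eps = 1e-6
    numeric = np.zeros_like(logits)
    for idx in np.ndindex(*logits.shape):
        bumped = logits.copy()
        bumped[idx] += eps
        numeric[idx] = (_ce(bumped) - _ce(logits)) / eps
    assert np.allclose(analytic, numeric, atol=1e-4)
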
def concat(inputs, axis=0):
    val = np.concatenate([x.val for x in inputs], axis=axis)
    # split points are the cumulative sizes of all but the last input
    # (the original used inputs[1:], which only works when all inputs are the same size)
    sizes = np.cumsum([x.val.shape[axis] for x in inputs[:-1]])

    def jacob(grad):
        return tuple(np.split(grad, sizes, axis=axis))

    return Node(val, inputs, jacob)

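# Small illustration (plain NumPy, not used by the library) of the backward pass
# above: the upstream gradient is split back into one piece per input at the
# cumulative-size offsets.
def _check_concat_split():
    pieces = [np.ones((2, 3)), np.ones((2, 5)), np.ones((2, 1))]
    grad = np.arange(2 * 9, dtype=float).reshape(2, 9)     # pretend upstream gradient
    sizes = np.cumsum([p.shape[1] for p in pieces[:-1]])   # split points: [3, 8]
    grads = np.split(grad, sizes, axis=1)
    assert [g.shape for g in grads] == [p.shape for p in pieces]
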
def slice(a, slice):
    val = a.val[slice]

    def jacob(grad):
        # scatter the gradient back into a zero array shaped like the input
        a_grad = np.zeros(a.val.shape)
        a_grad[slice] = grad
        return (a_grad, )

    return Node(val, (a, ), jacob)

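# Small illustration (plain NumPy, not used by the library): positions that were
# not selected by the slice keep a zero gradient, only the sliced region receives
# the upstream gradient.
def _check_slice_grad():
    x = np.arange(20, dtype=float).reshape(5, 4)
    key = np.s_[1:3, :]                 # rows 1 and 2
    grad = np.ones((2, 4))              # upstream gradient for x[key]
    x_grad = np.zeros(x.shape)
    x_grad[key] = grad
    assert x_grad.sum() == grad.sum() and x_grad[0].sum() == 0
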
def dot(a, b):
    val = a.val.dot(b.val)

    def jacob(grad):
        a_grad = np.zeros(a.val.shape)
        b_grad = np.zeros(b.val.shape)
        # a_grad row i: grad row i times b, summed over the output axis (i.e. grad @ b.T)
        for i in range(grad.shape[0]):
            a_grad[i, :] = np.sum(grad[(i, ), :] * b.val, axis=1)
        # b_grad column j: grad column j times a, summed over the batch axis (i.e. a.T @ grad)
        for j in range(grad.shape[1]):
            b_grad[:, j] = np.sum(grad[:, (j, )] * a.val, axis=0)
        return (a_grad, b_grad)

    return Node(val, (a, b), jacob)

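# Finite-difference check (plain NumPy, not used by the library) of the formulas
# in jacob above; they are equivalent to a_grad = grad @ b.T and b_grad = a.T @ grad.
def _check_dot_grad():
    rng = np.random.default_rng(0)
    a = rng.standard_normal((4, 3))
    b = rng.standard_normal((3, 2))
    grad = rng.standard_normal((4, 2))   # arbitrary upstream gradient
    a_grad = grad.dot(b.T)

    eps = 1e-6
    numeric = np.zeros_like(a)
    for idx in np.ndindex(*a.shape):
        bumped = a.copy()
        bumped[idx] += eps
        numeric[idx] = np.sum(grad * (bumped.dot(b) - a.dot(b))) / eps
    assert np.allclose(a_grad, numeric, atol=1e-4)
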
def stop_gradient(a):
    # no jacob: gradients are not propagated past this node
    return Node(a.val, (a, ), None)

def mean(a, axis=None):
    val = a.val.mean(axis=axis)
    # number of elements averaged over
    size = a.val.size if axis is None else a.val.shape[axis]
    # each averaged element receives an equal share of the upstream gradient
    jacob = lambda grad: (np.full(a.val.shape, grad / size), )
    return Node(val, (a, ), jacob)

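# Tiny illustration (plain NumPy, not used by the library) of the axis=None case:
# nudging any single element by eps moves the mean by eps / size, so the upstream
# gradient is spread as grad / size over every element.
def _check_mean_grad():
    x = np.arange(6, dtype=float).reshape(2, 3)
    eps = 1e-6
    bumped = x.copy()
    bumped[0, 1] += eps
    assert np.isclose((bumped.mean() - x.mean()) / eps, 1 / x.size)
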
def mul(a, b):
    val = a.val * b.val
    # product rule: d(a*b)/da = b, d(a*b)/db = a
    jacob = lambda grad: (grad * b.val, grad * a.val)
    return Node(val, (a, b), jacob)

import numpy as np

from mnist.preprocessing import get_mnist
from backprop import Node
from ops import dot, softmax

n_classes = 5
x_train, y_train, x_test, y_test = get_mnist(n_classes)
x_train, x_test = (np.reshape(x, (x.shape[0], -1)) for x in (x_train, x_test))  # flatten
x_train, x_test = (np.concatenate([x, np.ones((x.shape[0], 1))], axis=1) for x in (x_train, x_test))  # add bias elt

layer_size = 500
w1 = Node(val=np.random.randn(x_train.shape[1], layer_size) / 10, changeable=True)
w2 = Node(val=np.random.randn(layer_size, n_classes) / 10, changeable=True)


def neural_net(x):
    h = x
    for w in (w1, w2):
        h = dot(h, w)
    return h, softmax(h)


batch_size = 32
lr = .0003
for epoch in range(1):
    for bi in range(0, x_train.shape[0], batch_size):
        end = min(x_train.shape[0], bi + batch_size)
        x = Node(val=x_train[bi:end])
        y = y_train[bi:end]
        logits, pred = neural_net(x)
        logits.backprop(lr * (y - pred.val))  # total rather than mean loss over batch
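
# Possible follow-up (not in the original script): report accuracy on the test
# split, assuming get_mnist returns y_test one-hot encoded like y_train.
_, test_pred = neural_net(Node(val=x_test))
test_acc = np.mean(np.argmax(test_pred.val, axis=1) == np.argmax(y_test, axis=1))
print(f"test accuracy: {test_acc:.3f}")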