def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass for spatial batch normalization.

    The channel axis is moved last and the input is flattened to a 2-D array
    with C columns (one row per (sample, row, column) location).  That array
    is handed to the parent class's ``forward`` (presumably the (N, D)
    batch-normalization forward -- confirm against the base class), and the
    result is restored to the original layout.

    Inputs:
    - x: Input data of shape (N, C, H, W)

    Returns:
    - out: Output data, of shape (N, C, H, W)
    """
    batch, channels, height, width = x.shape

    # (N, C, H, W) -> (N, H, W, C) -> (N*H*W, C)
    channels_last = x.transpose(0, 2, 3, 1)
    normalized = super().forward(channels_last.reshape(-1, channels))

    # Undo the flattening, then move channels back to axis 1.
    restored = normalized.reshape(batch, height, width, channels)
    return restored.transpose(0, 3, 1, 2)
def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass for spatial group normalization.

    Each sample's C * H * W values are split into ``self.G`` contiguous
    pieces; every piece is normalized independently to zero mean and unit
    variance (with ``self.eps`` added to the variance).  A per-channel scale
    and shift are then applied.

    Reads from self:
    - gamma: Scale parameter with C elements
    - beta: Shift parameter with C elements
    - G: Number of groups (the reshape requires N * G to divide x.size)
    - eps: Constant added to the variance for numeric stability

    Side effects:
    - self.gamma and self.beta are reshaped to (1, C, 1, 1) and stored back.
    - self.cache is set to (x_hat, std), where x_hat has shape (N, C, H, W)
      and std has one entry per (sample, group) pair.

    Inputs:
    - x: Input data of shape (N, C, H, W)

    Returns:
    - out: Output data, of shape (N, C, H, W)
    """
    N, C, H, W = x.shape

    # Store the parameters in a shape that broadcasts over (N, C, H, W).
    param_shape = (1, C, 1, 1)
    self.gamma = self.gamma.reshape(param_shape)
    self.beta = self.beta.reshape(param_shape)

    # One column per (sample, group); statistics are taken down each column.
    grouped = x.reshape(N * self.G, -1).T
    group_mean = grouped.mean(axis=0)
    group_std = np.sqrt(grouped.var(axis=0) + self.eps)

    x_hat = ((grouped - group_mean) / group_std).T.reshape(N, C, H, W)
    self.cache = (x_hat, group_std)
    return self.gamma * x_hat + self.beta
def loss(self, X: NPArray, y: NPIntArray) -> tuple[float, NPArray]:
    """
    Structured (multiclass hinge) SVM loss, vectorized, with margin delta = 1.

    Scores are computed as X.dot(self.W).  The returned loss is the SUM of
    the hinge terms over the minibatch; no averaging and no regularization
    are applied in this method.

    Inputs:
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c
      means that X[i] has label c, where 0 <= c < C.

    Returns a tuple of:
    - loss: summed hinge loss
    - dW: gradient of that loss with respect to self.W, same shape as self.W
    """
    rows = np.arange(X.shape[0])
    scores = X @ self.W

    # Margin of every class against the correct class (delta = 1).
    margins = scores - scores[rows, y][:, None] + 1
    margins[margins < 0] = 0
    margins[rows, y] = 0  # the correct class never contributes
    loss = margins.sum()

    # Reuse the buffer as d(loss)/d(scores): +1 for each violating class and
    # minus the number of violations at the correct class.
    margins[margins > 0] = 1
    margins[rows, y] = -margins.sum(axis=1)

    dW = X.T @ margins
    return loss, dW
def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
    """
    Backward pass for the temporal affine layer.

    Uses the input stored in self.cache together with self.w and self.b.
    The batch and time axes are merged so the gradients reduce to plain
    2-D matrix products.

    Inputs:
    - dout: Upstream gradients of shape (N, T, M)

    Returns a tuple of:
    - dx: Gradient of input, of shape (N, T, D)
    - dw: Gradient of weights, of shape (D, M)
    - db: Gradient of biases, of shape (M,)
    """
    (x,) = self.cache
    N, T, D = x.shape
    M = self.b.shape[0]

    # Collapse (N, T) into one axis of length N*T.
    dout_2d = dout.reshape(N * T, M)
    x_2d = x.reshape(N * T, D)

    dx = (dout_2d @ self.w.T).reshape(N, T, D)
    dw = (dout_2d.T @ x_2d).T
    db = dout.sum(axis=(0, 1))
    return dx, dw, db
def backward_naive(self, dout: NPArray) -> tuple[NPArray, ...]:
    """
    Backward pass for batch normalization, staged along the computation graph.

    Reads (xn, std) from self.cache, plus self.gamma and self.train_mode.
    In training mode the gradient flows through the batch mean and variance;
    otherwise the statistics are treated as constants.

    Inputs:
    - dout: Upstream derivatives, of shape (N, D)

    Returns a tuple of:
    - dx: Gradient with respect to inputs x, of shape (N, D)
    - dgamma: Gradient with respect to scale parameter gamma, of shape (D,)
    - dbeta: Gradient with respect to shift parameter beta, of shape (D,)
    """
    xn, std = self.cache

    # Parameter gradients and d(out)/d(xn) are the same in both modes.
    dbeta = dout.sum(axis=0)
    dgamma = np.sum(xn * dout, axis=0)
    dxn = self.gamma * dout

    if not self.train_mode:
        # Fixed statistics: normalization is a plain per-feature rescale.
        return dxn / std, dgamma, dbeta

    N = dout.shape[0]
    # xn = xc / std, with xc the centered input and std = sqrt(var + eps).
    dstd = -np.sum((dxn * xn) / std, axis=0)
    dvar = 0.5 * dstd / std
    # Direct path through the division plus the path through the variance
    # (xn * std recovers xc).
    dxc = dxn / std
    dxc += (2 / N) * (xn * std) * dvar
    # xc = x - mean(x): remove the contribution routed through the mean.
    dmu = np.sum(dxc, axis=0)
    dx = dxc - dmu / N
    return dx, dgamma, dbeta
def loss(self, X: NPArray, y: NPIntArray) -> tuple[float, NPArray]:
    """
    Softmax (cross-entropy) loss function, vectorized version.

    Returns the negative log-likelihood of the correct classes, summed over
    the minibatch, together with its gradient with respect to self.W.  The
    loss is non-negative and the gradient is the descent gradient, matching
    the sign convention of the other loss functions in this file.  No
    averaging and no regularization are applied here.

    Relies on the ``softmax`` helper defined elsewhere in this file
    (assumed to normalize each row of the score matrix -- confirm).

    Inputs:
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c
      means that X[i] has label c, where 0 <= c < C.

    Returns a tuple of:
    - loss: summed cross-entropy loss
    - dW: gradient of the loss with respect to self.W, same shape as self.W
    """
    rows = np.arange(X.shape[0])
    probs = softmax(X.dot(self.W))

    # Cross-entropy is the NEGATIVE log-probability of the true class.
    loss = -np.sum(np.log(probs[rows, y]))

    # d(loss)/d(scores) = probs - one_hot(y); copy so the helper's result is
    # not modified in place.
    dscores = probs.copy()
    dscores[rows, y] -= 1
    dW = X.T.dot(dscores)
    return loss, dW
def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
    """
    Backward pass for spatial batch normalization.

    The upstream gradient is rearranged to channels-last, flattened to a
    2-D array with C columns, and handed to the parent class's ``backward``
    (presumably the (N, D) batch-normalization backward -- confirm against
    the base class).  The input gradient is then restored to the original
    layout; the parameter gradients are passed through unchanged.

    Inputs:
    - dout: Upstream derivatives, of shape (N, C, H, W)

    Returns a tuple of:
    - dx: Gradient with respect to inputs, of shape (N, C, H, W)
    - dgamma: Gradient with respect to the scale parameter, as returned by
      the parent class
    - dbeta: Gradient with respect to the shift parameter, as returned by
      the parent class
    """
    batch, channels, height, width = dout.shape

    # (N, C, H, W) -> (N, H, W, C) -> (N*H*W, C)
    channels_last = dout.transpose(0, 2, 3, 1)
    dx_2d, dgamma, dbeta = super().backward(channels_last.reshape(-1, channels))

    # Undo the flattening, then move channels back to axis 1.
    dx = dx_2d.reshape(batch, height, width, channels).transpose(0, 3, 1, 2)
    return dx, dgamma, dbeta
def temporal_softmax_loss(x: NPArray, y: NPIntArray, mask: NPBoolArray) -> tuple[float, NPArray]:
    """
    A temporal version of softmax loss for use in RNNs.

    Predictions are made over a vocabulary of size V at each of T timesteps
    for a minibatch of N sequences.  A cross-entropy loss is taken at every
    timestep, summed over all timesteps selected by ``mask`` and divided by
    the minibatch size N.  Timesteps where ``mask`` is False (e.g. padding)
    contribute neither to the loss nor to the gradient.

    Log-probabilities are computed with the log-sum-exp formulation, so they
    stay finite even when a probability underflows to zero.  Taking
    ``log`` of an underflowed probability would give -inf, which turns the
    loss into NaN at masked positions (0 * -inf) and into inf at unmasked
    ones.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the
      range 0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or
      not the scores at x[i, t] should contribute to the loss.

    Returns a tuple of:
    - loss: Scalar giving loss
    - dx: Gradient of loss with respect to scores x, of shape (N, T, V)
    """
    N, T, V = x.shape
    rows = np.arange(N * T)

    scores = x.reshape(N * T, V)
    targets = y.reshape(N * T)
    keep = mask.reshape(N * T)

    # Shift by the row maximum so exp() cannot overflow; the row sum is then
    # at least 1, which keeps its logarithm finite.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    log_probs = shifted - log_norm

    loss = -np.sum(keep * log_probs[rows, targets]) / N

    # d(loss)/d(scores) = (softmax - one_hot) / N, zeroed where masked out.
    dx_flat = np.exp(log_probs)
    dx_flat[rows, targets] -= 1
    dx_flat /= N
    dx_flat *= keep[:, None]

    return loss, dx_flat.reshape(N, T, V)
def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass for an affine (fully-connected) layer.

    Every sample x[i] is flattened to a row vector, multiplied by self.w and
    shifted by self.b.  The unflattened input is stored in self.cache as the
    one-element tuple (x,).

    Inputs:
    - x: Input data, of shape (N, d_1, ..., d_k)

    Reads from self:
    - w: Weights, of shape (D, M) where D = prod_i d_i
    - b: Biases, of shape (M,)

    Returns:
    - out: output, of shape (N, M)
    """
    self.cache = (x,)
    # Keep the batch axis, merge all remaining axes into one of length D.
    rows = x.reshape(x.shape[0], -1)
    return rows @ self.w + self.b
def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
    """
    Backward pass for an affine (fully-connected) layer.

    Uses the input x stored in self.cache and the weights self.w.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d_1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    (x,) = self.cache
    batch = x.shape[0]

    # Back through the matrix product, then restore the input's shape.
    dx = (dout @ self.w.T).reshape(x.shape)
    # The weights see the flattened (N, D) view of the input.
    dw = x.reshape(batch, -1).T @ dout
    db = dout.sum(axis=0)
    return dx, dw, db
def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass for a temporal affine layer.

    The input is a minibatch of N timeseries, each of length T, of
    D-dimensional vectors.  The same affine map (self.w, self.b) is applied
    to every vector, producing M-dimensional outputs.  The input is stored
    in self.cache as the one-element tuple (x,).

    Inputs:
    - x: Input data of shape (N, T, D)

    Reads from self:
    - w: Weights of shape (D, M)
    - b: Biases of shape (M,)

    Returns:
    - out: Output data of shape (N, T, M)
    """
    N, T, D = x.shape
    M = self.b.shape[0]
    self.cache = (x,)

    # Merge batch and time so one matrix product covers every timestep.
    projected = x.reshape(N * T, D) @ self.w
    return projected.reshape(N, T, M) + self.b
def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass for batch normalization.

    Training mode (self.train_mode is true): each feature is normalized with
    the minibatch mean and the uncorrected minibatch variance, and the
    running statistics are updated with an exponential decay:

        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var

    Test mode: the stored running mean and running variance are used in
    place of the minibatch statistics, and they are not modified.

    In both modes the data is divided by the standard deviation
    sqrt(var + eps), not by the variance, and the normalized data is then
    scaled by self.gamma and shifted by self.beta.  self.cache is set to
    (xn, std) for the backward pass.

    Inputs:
    - x: Data of shape (N, D)

    Reads from self:
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - eps: Constant for numeric stability
    - momentum: Decay constant for the running mean / variance
    - running_mean, running_var: Running statistics (updated in training mode)

    Returns:
    - out: of shape (N, D)
    """
    if self.train_mode:
        batch_mean = x.mean(axis=0)
        centered = x - batch_mean
        batch_var = np.mean(centered**2, axis=0)
        std = np.sqrt(batch_var + self.eps)
        xn = centered / std

        # Exponentially decaying averages, updated through augmented
        # assignment on the attributes themselves.
        decay = self.momentum
        self.running_mean *= decay
        self.running_mean += (1 - decay) * batch_mean
        self.running_var *= decay
        self.running_var += (1 - decay) * batch_var
    else:
        # Normalize with the statistics accumulated during training.
        std = np.sqrt(self.running_var + self.eps)
        xn = (x - self.running_mean) / std

    self.cache = (xn, std)
    return self.gamma * xn + self.beta