def forward(self, x: NPArray) -> NPArray:
    """
    Computes the forward pass for spatial group normalization.

    In contrast to layer normalization, group normalization splits the
    channels of each sample into G contiguous groups and normalizes every
    group independently over its channels and spatial positions. A
    per-channel scale and shift are then applied to the data, in a manner
    identical to that of batch normalization and layer normalization.

    Inputs:
    - x: Input data of shape (N, C, H, W)

    Attributes read from self:
    - gamma: Scale parameter with C elements
    - beta: Shift parameter with C elements
    - G: Integer number of groups to split into; must be a positive
      divisor of C
    - eps: Constant for numeric stability

    Side effects:
    - self.gamma and self.beta are re-bound to views of shape (1, C, 1, 1)
    - self.cache is set to (x_hat, v), where x_hat is the normalized input
      of shape (N, C, H, W) and v holds the per-group standard deviations,
      of shape (N * G,)

    Returns:
    - out: Output data, of shape (N, C, H, W)

    Raises:
    - ValueError: if G is not a positive divisor of C
    """
    N, C, H, W = x.shape

    # Without this check a G that divides C * H * W but not C reshapes
    # without error and yields groups that straddle channel boundaries.
    if self.G <= 0 or C % self.G != 0:
        raise ValueError(
            f"number of groups G={self.G} must be a positive divisor of "
            f"the channel count C={C}"
        )

    # Broadcastable layout so the affine step below applies per channel.
    self.gamma = self.gamma.reshape((1, C, 1, 1))
    self.beta = self.beta.reshape((1, C, 1, 1))

    # One row per (sample, group) pair; statistics are taken along each row.
    grouped = x.reshape(N * self.G, -1)
    group_mean = np.mean(grouped, axis=1)
    group_var = np.var(grouped, axis=1)
    v = np.sqrt(group_var + self.eps)

    x_hat = (grouped - group_mean[:, None]) / v[:, None]
    x_hat = x_hat.reshape(N, C, H, W)

    out = self.gamma * x_hat + self.beta
    self.cache = (x_hat, v)
    return out
def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
    """
    Backward pass for the temporal affine layer.

    Every (sample, timestep) pair is treated as one row of a 2-D problem:
    the upstream gradient and the cached input are flattened over their
    first two axes before the matrix products are taken.

    Input:
    - dout: Upstream gradients of shape (N, T, M)

    Attributes read from self:
    - cache: 1-tuple holding the forward input x, of shape (N, T, D)
    - w: Weights of shape (D, M)
    - b: Biases of shape (M,)

    Returns a tuple of:
    - dx: Gradient of input, of shape (N, T, D)
    - dw: Gradient of weights, of shape (D, M)
    - db: Gradient of biases, of shape (M,)
    """
    (inputs,) = self.cache
    N, T, D = inputs.shape
    M = self.b.shape[0]
    rows = N * T

    grad_2d = dout.reshape(rows, M)
    inputs_2d = inputs.reshape(rows, D)

    # Gradient w.r.t. the input: push the upstream gradient back through w.
    dx = grad_2d.dot(self.w.T).reshape(N, T, D)

    # Gradient w.r.t. the weights, accumulated over all N * T rows.
    dw = grad_2d.T.dot(inputs_2d).T

    # The bias is shared by every sample and timestep, so sum over both.
    db = dout.sum(axis=(0, 1))

    return dx, dw, db
def temporal_softmax_loss(x: NPArray, y: NPIntArray, mask: NPBoolArray) -> tuple[float, NPArray]:
    """
    A temporal version of softmax loss for use in RNNs.

    We assume that we are making predictions over a vocabulary of size V for
    each timestep of a timeseries of length T, over a minibatch of size N.
    The input x gives scores for all vocabulary elements at all timesteps,
    and y gives the indices of the ground-truth element at each timestep. We
    use a cross-entropy loss at each timestep, summing the loss over all
    timesteps and averaging across the minibatch.

    As an additional complication, we may want to ignore the model output at
    some timesteps, since sequences of different length may have been
    combined into a minibatch and padded with NULL tokens. The mask argument
    tells us which elements should contribute to the loss.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the
      range 0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or
      not the scores at x[i, t] should contribute to the loss.

    Returns a tuple of:
    - loss: Scalar giving loss
    - dx: Gradient of loss with respect to scores x, of shape (N, T, V).
    """
    N, T, V = x.shape
    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)
    rows = np.arange(N * T)

    # Work with log-probabilities via log-sum-exp instead of log(softmax):
    # a target probability that underflows to 0 would give log(0) = -inf,
    # and 0 * -inf at a masked-out timestep would turn the loss into NaN.
    # The normalizer is always >= 1 because the row maximum contributes
    # exp(0), so its log is finite.
    shifted = x_flat - np.max(x_flat, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    normalizer = np.sum(exp_shifted, axis=1, keepdims=True)
    log_probs = shifted - np.log(normalizer)

    loss = -np.sum(mask_flat * log_probs[rows, y_flat]) / N

    # Softmax gradient: probabilities minus the one-hot target, averaged
    # over the minibatch and zeroed wherever the mask is off.
    dx_flat = exp_shifted / normalizer
    dx_flat[rows, y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]

    dx = dx_flat.reshape(N, T, V)
    return loss, dx
def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass of an affine (fully-connected) layer.

    Each of the N examples in x is flattened into a single row vector, which
    is then multiplied by the layer's weight matrix and offset by its bias.

    Inputs:
    - x: Input data, of shape (N, d_1, ..., d_k)

    Attributes read from self:
    - w: Weights, of shape (D, M) where D = prod_i d_i
    - b: Biases, of shape (M,)

    Side effects:
    - self.cache is set to the 1-tuple (x,)

    Returns:
    - out: Output, of shape (N, M)
    """
    self.cache = (x,)

    # Collapse all trailing axes so every example becomes one row.
    num_examples = x.shape[0]
    flattened = x.reshape(num_examples, -1)

    projected = flattened.dot(self.w)
    return projected + self.b
def forward(self, x: NPArray) -> NPArray:
    """
    Forward pass for a temporal affine layer.

    The input is a minibatch of N timeseries, each consisting of T vectors
    of dimension D. The same affine map is applied to every one of those
    vectors, producing a vector of dimension M per timestep.

    Inputs:
    - x: Input data of shape (N, T, D)

    Attributes read from self:
    - w: Weights of shape (D, M)
    - b: Biases of shape (M,)

    Side effects:
    - self.cache is set to the 1-tuple (x,)

    Returns:
    - out: Output data of shape (N, T, M)
    """
    N, T, D = x.shape
    M = self.b.shape[0]
    self.cache = (x,)

    # Fold the batch and time axes together so one matrix product covers
    # every timestep, then restore the (N, T, ...) layout.
    stacked = x.reshape(N * T, D)
    projected = stacked.dot(self.w)
    out = projected.reshape(N, T, M)

    return out + self.b