def forward_faster(self, X: np.ndarray) -> np.ndarray:
    """Forward pass for convolutional layer.

    This layer convolves the input `X` with a filter of weights, adds a
    bias term, and applies an activation function to compute the output.
    This layer also supports padding and integer strides. Intermediates
    necessary for the backward pass are stored in the cache.

    This implementation uses `im2col`, which allows us to use fast general
    matrix multiply (GEMM) routines implemented by numpy. This is still
    rather slow compared to GPU acceleration, but LEAGUES faster than the
    nested loop in the naive implementation.

    DO NOT ALTER THIS METHOD. You will write your naive implementation in
    forward(). We will use forward_faster() to check your method.

    Parameters
    ----------
    X
        input with shape (batch_size, in_rows, in_cols, in_channels)

    Returns
    -------
    output feature maps with shape (batch_size, out_rows, out_cols, out_channels)
    """
    if self.n_in is None:
        self._init_parameters(X.shape)

    W = self.parameters["W"]
    b = self.parameters["b"]

    kernel_height, kernel_width, in_channels, out_channels = W.shape
    n_examples, in_rows, in_cols, in_channels = X.shape
    kernel_shape = (kernel_height, kernel_width)

    X_col, p = im2col(X, kernel_shape, self.stride, self.pad)

    out_rows = int((in_rows + p[0] + p[1] - kernel_height) / self.stride + 1)
    out_cols = int((in_cols + p[2] + p[3] - kernel_width) / self.stride + 1)

    W_col = W.transpose(3, 2, 0, 1).reshape(out_channels, -1)

    Z = (
        (W_col @ X_col)
        .reshape(out_channels, out_rows, out_cols, n_examples)
        .transpose(3, 1, 2, 0)
    )
    Z += b

    out = self.activation(Z)

    self.cache["Z"] = Z
    self.cache["X"] = X

    return out
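
# The following is an illustrative sketch, not part of the layer: it shows
# the im2col idea used by forward_faster() in its simplest form. The helper
# `_im2col_sketch` is hypothetical and assumes stride 1, no padding, and the
# NHWC layout above; the real `im2col` this class calls also handles strides
# and padding, and its internals may differ.
def _im2col_sketch(X: np.ndarray, kh: int, kw: int) -> np.ndarray:
    n, h, w, c = X.shape
    out_h, out_w = h - kh + 1, w - kw + 1
    # One column per receptive field, flattened channel-major to match the
    # rows of W_col; columns are ordered (row, col, example) to match the
    # reshape in forward_faster().
    cols = np.empty((kh * kw * c, out_h * out_w * n))
    idx = 0
    for i in range(out_h):
        for j in range(out_w):
            for e in range(n):
                cols[:, idx] = X[e, i:i + kh, j:j + kw, :].transpose(2, 0, 1).ravel()
                idx += 1
    return cols

# With the patches laid out as columns, the whole convolution is one GEMM:
#   X_demo = np.random.rand(1, 5, 5, 1)
#   W_demo = np.random.rand(3, 3, 1, 2)
#   W_col = W_demo.transpose(3, 2, 0, 1).reshape(2, -1)            # (2, 9)
#   Z_col = W_col @ _im2col_sketch(X_demo, 3, 3)                   # (2, 9)
#   Z = Z_col.reshape(2, 3, 3, 1).transpose(3, 1, 2, 0)       # (1, 3, 3, 2)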
def backward_faster(self, dLdY: np.ndarray) -> np.ndarray:
    """Backward pass for conv layer. Computes the gradients of the loss
    with respect to the input feature maps as well as the filter weights
    and biases.

    This uses im2col, so it is considerably faster than the naive
    implementation even on a CPU.

    DO NOT ALTER THIS METHOD. You will write your naive implementation in
    backward(). We will use backward_faster() to check your method.

    Parameters
    ----------
    dLdY
        derivative of loss with respect to output of this layer
        shape (batch_size, out_rows, out_cols, out_channels)

    Returns
    -------
    derivative of the loss with respect to the input of this layer
    shape (batch_size, in_rows, in_cols, in_channels)
    """
    W = self.parameters["W"]
    b = self.parameters["b"]
    Z = self.cache["Z"]
    X = self.cache["X"]

    kernel_height, kernel_width, in_channels, out_channels = W.shape
    n_examples, in_rows, in_cols, in_channels = X.shape
    kernel_shape = (kernel_height, kernel_width)

    dZ = self.activation.backward(Z, dLdY)

    dZ_col = dZ.transpose(3, 1, 2, 0).reshape(dLdY.shape[-1], -1)
    X_col, p = im2col(X, kernel_shape, self.stride, self.pad)
    W_col = W.transpose(3, 2, 0, 1).reshape(out_channels, -1).T

    dW = (
        (dZ_col @ X_col.T)
        .reshape(out_channels, in_channels, kernel_height, kernel_width)
        .transpose(2, 3, 1, 0)
    )
    dB = dZ_col.sum(axis=1).reshape(1, -1)

    dX_col = W_col @ dZ_col
    dX = col2im(dX_col, X, W.shape, self.stride, p).transpose(0, 2, 3, 1)

    self.gradients["W"] = dW
    self.gradients["b"] = dB

    return dX
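
# An illustrative sketch, not part of the layer: one way to sanity-check
# backward_faster() is a centered finite-difference estimate of dL/dW. The
# layer instance `conv` and the callable `loss_fn` (assumed to return a
# (loss, dLdY) pair) are hypothetical stand-ins; only the gradient-check
# recipe itself is being shown.
def _grad_check_sketch(conv, X: np.ndarray, loss_fn, eps: float = 1e-6) -> float:
    out = conv.forward_faster(X)
    _, dLdY = loss_fn(out)
    conv.backward_faster(dLdY)                # fills conv.gradients["W"]
    analytic = conv.gradients["W"].copy()
    W = conv.parameters["W"]
    numeric = np.zeros_like(W)
    it = np.nditer(W, flags=["multi_index"], op_flags=["readwrite"])
    while not it.finished:
        ix = it.multi_index
        orig = W[ix]
        W[ix] = orig + eps
        loss_plus, _ = loss_fn(conv.forward_faster(X))
        W[ix] = orig - eps
        loss_minus, _ = loss_fn(conv.forward_faster(X))
        W[ix] = orig                          # restore before moving on
        numeric[ix] = (loss_plus - loss_minus) / (2 * eps)
        it.iternext()
    # A maximum discrepancy orders of magnitude below the gradient scale
    # suggests the analytic gradients agree with the numerical estimate.
    return np.max(np.abs(analytic - numeric))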
def backward(self, dLdY: np.ndarray) -> np.ndarray:
    """Backward pass for conv layer. Computes the gradients of the loss
    with respect to the input feature maps as well as the filter weights
    and biases.

    Parameters
    ----------
    dLdY
        derivative of loss with respect to output of this layer
        shape (batch_size, out_rows, out_cols, out_channels)

    Returns
    -------
    derivative of the loss with respect to the input of this layer
    shape (batch_size, in_rows, in_cols, in_channels)
    """
    ### BEGIN YOUR CODE ###
    W = self.parameters["W"]
    b = self.parameters["b"]
    Z = self.cache["Z"]
    X = self.cache["X"]

    kernel_height, kernel_width, in_channels, out_channels = W.shape
    n_examples, in_rows, in_cols, in_channels = X.shape
    kernel_shape = (kernel_height, kernel_width)

    # perform a backward pass
    dZ = self.activation.backward(Z, dLdY)

    dZ_col = dZ.transpose(3, 1, 2, 0).reshape(dLdY.shape[-1], -1)
    X_col, p = im2col(X, kernel_shape, self.stride, self.pad)
    W_col = W.transpose(3, 2, 0, 1).reshape(out_channels, -1).T

    dW = (
        (dZ_col @ X_col.T)
        .reshape(out_channels, in_channels, kernel_height, kernel_width)
        .transpose(2, 3, 1, 0)
    )
    dB = dZ_col.sum(axis=1).reshape(1, -1)

    dX_col = W_col @ dZ_col
    dX = col2im(dX_col, X, W.shape, self.stride, p).transpose(0, 2, 3, 1)

    self.gradients["W"] = dW
    self.gradients["b"] = dB

    ### END YOUR CODE ###

    return dX
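
# An illustrative sketch, not part of the layer: the naive backward pass
# referenced by the docstrings above replaces im2col/col2im with explicit
# loops. The helper below is hypothetical; it assumes the NHWC layout used
# throughout, and that the padded input `X_pad` and the pre-activation
# gradient `dZ` have already been computed. The returned dX_pad must still
# be cropped back to the unpadded input shape.
def _naive_backward_sketch(X_pad, dZ, W, stride):
    kh, kw, in_channels, out_channels = W.shape
    n_examples, out_rows, out_cols, _ = dZ.shape
    dW = np.zeros_like(W)
    dX_pad = np.zeros_like(X_pad)
    for e in range(n_examples):
        for r in range(out_rows):
            for c in range(out_cols):
                r0, c0 = r * stride, c * stride
                patch = X_pad[e, r0:r0 + kh, c0:c0 + kw, :]
                for f in range(out_channels):
                    # Each output pixel scatters its upstream gradient onto
                    # its filter (for dW) and its receptive field (for dX).
                    dW[..., f] += patch * dZ[e, r, c, f]
                    dX_pad[e, r0:r0 + kh, c0:c0 + kw, :] += W[..., f] * dZ[e, r, c, f]
    dB = dZ.sum(axis=(0, 1, 2)).reshape(1, -1)
    return dW, dB, dX_pad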