Example 1
    def forward(self, x: NPArray) -> NPArray:
        """
        Computes the forward pass for spatial group normalization.
        In contrast to layer normalization, group normalization splits each entry
        in the data into G contiguous pieces, which it then normalizes independently.
        Per-feature shifting and scaling are then applied to the data, in a manner
        identical to that of batch normalization and layer normalization.

        Inputs:
        - x: Input data of shape (N, C, H, W)

        Uses the following attributes of the layer:
        - gamma: Scale parameter, of shape (C,)
        - beta: Shift parameter, of shape (C,)
        - G: Integer number of groups to split into, should be a divisor of C
        - eps: Constant for numeric stability

        Returns:
        - out: Output data, of shape (N, C, H, W)
        Values needed for the backward pass are stored in self.cache.
        """
        N, C, H, W = x.shape
        self.gamma = self.gamma.reshape((1, C, 1, 1))
        self.beta = self.beta.reshape((1, C, 1, 1))

        # Reshape so that, after the transpose, each column holds all
        # (C // G) * H * W entries of one group for one sample; each group is
        # then normalized independently, layer-norm style.
        x = x.reshape(N * self.G, -1).T
        sample_mean = np.mean(x, axis=0)
        sample_var = np.var(x, axis=0)
        v = np.sqrt(sample_var + self.eps)
        x_hat = (x - sample_mean) / v
        x_hat = x_hat.T.reshape(N, C, H, W)
        out = self.gamma * x_hat + self.beta

        self.cache = (x_hat, v)

        return out
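
As a sanity check, here is a minimal standalone sketch (independent of the class above; the shapes, eps value, and variable names are made up) that applies the same reshape-and-normalize trick and verifies that every (sample, group) slice ends up standardized:

import numpy as np

N, C, H, W, G = 2, 6, 4, 4, 3
eps = 1e-5
x = np.random.randn(N, C, H, W)

# Each row of x_grouped holds the (C // G) * H * W entries of one group of one sample.
x_grouped = x.reshape(N * G, -1)
mu = x_grouped.mean(axis=1, keepdims=True)
var = x_grouped.var(axis=1, keepdims=True)
x_hat = ((x_grouped - mu) / np.sqrt(var + eps)).reshape(N, C, H, W)

# Every (sample, group) slice should now have (approximately) zero mean and unit std.
per_group = x_hat.reshape(N, G, -1)
print(np.allclose(per_group.mean(axis=2), 0.0, atol=1e-6))  # True
print(np.allclose(per_group.std(axis=2), 1.0, atol=1e-3))   # True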
Example 2
    def backward(self, dout: NPArray) -> tuple[NPArray, ...]:
        """
        Backward pass for temporal affine layer.

        Inputs:
        - dout: Upstream gradients of shape (N, T, M)
        Values from the forward pass are read from self.cache.

        Returns a tuple of:
        - dx: Gradient of input, of shape (N, T, D)
        - dw: Gradient of weights, of shape (D, M)
        - db: Gradient of biases, of shape (M,)
        """
        (x, ) = self.cache
        N, T, D = x.shape
        M = self.b.shape[0]

        # Collapse the N and T dimensions, backprop through the affine map,
        # then restore the original shapes.
        dx = dout.reshape(N * T, M).dot(self.w.T).reshape(N, T, D)
        dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T
        db = dout.sum(axis=(0, 1))

        return dx, dw, db
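
The double transpose in dw is easy to get wrong, so here is a minimal standalone sketch (independent of the class above; shapes and variable names are made up) that checks the same formula against a centered finite-difference estimate:

import numpy as np

N, T, D, M = 2, 3, 4, 5
x = np.random.randn(N, T, D)
w = np.random.randn(D, M)
b = np.random.randn(M)
dout = np.random.randn(N, T, M)

def f(w_):
    # Forward pass followed by a dot with dout: its gradient w.r.t. w is dw.
    out = x.reshape(N * T, D).dot(w_).reshape(N, T, M) + b
    return np.sum(out * dout)

dw = dout.reshape(N * T, M).T.dot(x.reshape(N * T, D)).T

h = 1e-6
dw_num = np.zeros_like(w)
for i in range(D):
    for j in range(M):
        wp, wm = w.copy(), w.copy()
        wp[i, j] += h
        wm[i, j] -= h
        dw_num[i, j] = (f(wp) - f(wm)) / (2 * h)

print(np.max(np.abs(dw - dw_num)))  # should be on the order of 1e-8 or smaller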
Example 3
def temporal_softmax_loss(x: NPArray, y: NPIntArray,
                          mask: NPBoolArray) -> tuple[float, NPArray]:
    """
    A temporal version of softmax loss for use in RNNs. We assume that we are
    making predictions over a vocabulary of size V for each timestep of a
    timeseries of length T, over a minibatch of size N. The input x gives scores
    for all vocabulary elements at all timesteps, and y gives the indices of the
    ground-truth element at each timestep. We use a cross-entropy loss at each
    timestep, summing the loss over all timesteps and averaging across the
    minibatch.

    As an additional complication, we may want to ignore the model output at some
    timesteps, since sequences of different lengths may have been combined into a
    minibatch and padded with NULL tokens. The mask argument tells us which
    elements should contribute to the loss.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the range
         0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
      the scores at x[i, t] should contribute to the loss.

    Returns a tuple of:
    - loss: Scalar giving loss
    - dx: Gradient of loss with respect to scores x.
    """
    N, T, V = x.shape

    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    # Numerically stable softmax: subtract the per-row max before exponentiating.
    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    # Gradient of the softmax cross-entropy; masked timesteps contribute nothing.
    dx_flat = probs.copy()
    dx_flat[np.arange(N * T), y_flat] -= 1
    dx_flat /= N
    dx_flat *= mask_flat[:, None]

    dx = dx_flat.reshape(N, T, V)
    return loss, dx
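
A quick usage sketch (assuming temporal_softmax_loss and numpy are in scope; the shapes and padding pattern are made up): timesteps where mask is False contribute neither to the loss nor to the gradient.

N, T, V = 4, 5, 7
x = 0.01 * np.random.randn(N, T, V)
y = np.random.randint(V, size=(N, T))
mask = np.ones((N, T), dtype=bool)
mask[:, 3:] = False  # pretend the last two timesteps of every sequence are padding

loss, dx = temporal_softmax_loss(x, y, mask)
print(loss)                        # roughly 3 * log(V) for near-uniform scores
print(np.allclose(dx[:, 3:], 0))   # True: masked timesteps receive zero gradient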
Example 4
    def forward(self, x: NPArray) -> NPArray:
        """
        Computes the forward pass for an affine (fully-connected) layer.

        The input x has shape (N, d_1, ..., d_k) where x[i] is the ith input.
        We multiply this against a weight matrix of shape (D, M) where
        D = prod_i d_i

        Inputs:
        - x: Input data, of shape (N, d_1, ..., d_k)

        Uses the following attributes of the layer:
        - w: Weights, of shape (D, M)
        - b: Biases, of shape (M,)

        Returns:
        - out: Output, of shape (N, M)
        The input x is stored in self.cache for the backward pass.
        """
        self.cache = (x, )
        # Flatten all dimensions except the batch dimension before the matmul.
        return x.reshape(x.shape[0], -1).dot(self.w) + self.b
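
To make the flattening step concrete, here is a tiny standalone sketch (shapes and names are made up) showing an input with two trailing dimensions being collapsed to (N, D) with D = d_1 * d_2 before the matrix multiply:

import numpy as np

N, d1, d2, M = 3, 4, 5, 6
x = np.random.randn(N, d1, d2)
w = np.random.randn(d1 * d2, M)
b = np.random.randn(M)

# Flatten everything except the batch dimension, then apply the affine map.
out = x.reshape(N, -1).dot(w) + b
print(out.shape)  # (3, 6)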
Example 5
    def forward(self, x: NPArray) -> NPArray:
        """
        Forward pass for a temporal affine layer. The input is a set of D-dimensional
        vectors arranged into a minibatch of N timeseries, each of length T. We use
        an affine function to transform each of those vectors into a new vector of
        dimension M.

        Inputs:
        - x: Input data of shape (N, T, D)

        Uses the following attributes of the layer:
        - w: Weights of shape (D, M)
        - b: Biases of shape (M,)

        Returns:
        - out: Output data of shape (N, T, M)
        Values needed for the backward pass are stored in self.cache.
        """
        N, T, D = x.shape
        M = self.b.shape[0]
        self.cache = (x, )
        # Collapse the N and T dimensions, apply the affine map, then restore them.
        return x.reshape(N * T, D).dot(self.w).reshape(N, T, M) + self.b
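
The reshape trick is equivalent to applying the same affine transform to every timestep separately; here is a minimal standalone sketch (shapes and names are made up) verifying that:

import numpy as np

N, T, D, M = 2, 3, 4, 5
x = np.random.randn(N, T, D)
w = np.random.randn(D, M)
b = np.random.randn(M)

# Collapse N and T, transform, and restore, versus an explicit per-timestep loop.
out_reshaped = x.reshape(N * T, D).dot(w).reshape(N, T, M) + b
out_loop = np.stack([x[:, t, :].dot(w) + b for t in range(T)], axis=1)
print(np.allclose(out_reshaped, out_loop))  # True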