def backward(self, grad_out):
    # diag
    # D(softmax_i) / dz_i
    # = D(exp(z_i) * sum(exp(z))^-1) / dz_i
    # = D(exp(z_i)) / dz_i * sum(exp(z))^-1 + exp(z_i) * D(sum(exp(z))^-1) / dz_i
    # = exp(z_i) * sum(exp(z))^-1 + exp(z_i) * -1 * sum(exp(z))^-2 * exp(z_i)
    # = exp(z_i) / sum(exp(z)) - exp(z_i)^2 / sum(exp(z))^2
    #
    # off diag (i != j)
    # D(softmax_i) / dz_j
    # = D(exp(z_i) * sum(exp(z))^-1) / dz_j
    # = exp(z_i) * D(sum(exp(z))^-1) / dz_j
    # = exp(z_i) * -1 * sum(exp(z))^-2 * exp(z_j)
    # = - exp(z_i) * exp(z_j) / sum(exp(z))^2

    # Shift by the row-wise max for numerical stability; softmax is
    # invariant to adding a constant to every logit.
    z_max = xp.max(self.z, axis=1)
    z = self.z - z_max[:, xp.newaxis]
    exp = xp.exp(z)
    sum_exp = xp.sum(exp, axis=1)

    # Off-diagonal terms: -exp(z_i) * exp(z_j) / sum(exp(z))^2
    outer_mat = -xp.einsum("ij,ik->ijk", exp, exp)
    outer_mat /= sum_exp[:, xp.newaxis, xp.newaxis]**2

    # Add the extra exp(z_i) / sum(exp(z)) term on the diagonal.
    diag = exp / sum_exp[:, xp.newaxis]
    diag_idx = xp.arange(diag.shape[1])
    outer_mat[:, diag_idx, diag_idx] += diag

    # Vector-Jacobian product per sample.
    grad_in = xp.einsum("ik,ikj->ij", grad_out, outer_mat)
    # Directly use matmul() instead of einsum:
    # grad_in = xp.squeeze(xp.matmul(grad_out[:, xp.newaxis, :], outer_mat))
    return grad_in
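# Sanity check for the Jacobian derived above: a minimal standalone sketch
# (assuming xp is NumPy; softmax() and softmax_jacobian() are illustrative
# helpers, not part of the layer) comparing the analytic diag/off-diag
# formulas against central finite differences.
import numpy as xp

def softmax(z):
    # Row-wise softmax with the usual max-shift for stability.
    e = xp.exp(z - xp.max(z, axis=1, keepdims=True))
    return e / xp.sum(e, axis=1, keepdims=True)

def softmax_jacobian(z):
    # Per-sample Jacobian: J[i, j] = s_i * (delta_ij - s_j).
    s = softmax(z)
    jac = -xp.einsum("ij,ik->ijk", s, s)
    idx = xp.arange(z.shape[1])
    jac[:, idx, idx] += s
    return jac

z = xp.random.randn(2, 5)
jac = softmax_jacobian(z)

# Numerical d softmax_i / d z_j via central differences.
eps = 1e-6
num = xp.zeros_like(jac)
for j in range(z.shape[1]):
    z_plus, z_minus = z.copy(), z.copy()
    z_plus[:, j] += eps
    z_minus[:, j] -= eps
    num[:, :, j] = (softmax(z_plus) - softmax(z_minus)) / (2 * eps)

assert xp.allclose(jac, num, atol=1e-5)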
def forward(self, z):
    self.z = z
    # Shift by the row-wise max for numerical stability.
    z_max = xp.max(z, axis=1)
    z = z - z_max[:, xp.newaxis]
    exp = xp.exp(z)
    sum_exp = xp.sum(exp, axis=1)[:, xp.newaxis]
    outputs = exp / sum_exp
    return outputs
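# Quick usage check for the forward pass (again assuming xp is NumPy):
# every output row sums to 1, and softmax is invariant to adding a constant
# to all logits, which is why the max can be subtracted for stability.
import numpy as xp

def softmax(z):
    e = xp.exp(z - xp.max(z, axis=1, keepdims=True))
    return e / xp.sum(e, axis=1, keepdims=True)

z = xp.random.randn(4, 10)
out = softmax(z)
assert xp.allclose(xp.sum(out, axis=1), 1.0)
assert xp.allclose(out, softmax(z + 100.0))  # shift-invariance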
def backward(self, grad_out):
    grad_out = grad_out.reshape(-1, self.num_rows, self.num_filters)

    # input_rows shape: (num_samples, num_rows, num_filter_inputs)
    # grad_out shape:   (num_samples, num_rows, num_filters)
    self.grad_W = xp.einsum("ijk,ijl->kl", self.input_rows, grad_out)
    self.grad_b = xp.sum(grad_out, axis=(0, 1))

    # grad_out shape:  (num_samples, num_rows, num_filters)
    # W shape:         (num_filter_inputs, num_filters)
    # grad_rows shape: (num_samples, num_rows, num_filter_inputs)
    grad_rows = xp.dot(grad_out, xp.transpose(self.W))
    # Equivalent: xp.einsum("ijl,kl->ijk", grad_out, self.W)

    grad_in = common.row2im(grad_rows, self.row_indices, self.input_dim,
                            self.filter_size, self.stride, self.pad, xp)
    assert grad_in.shape[1:] == self.input_dim
    return grad_in
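# The "ijk,ijl->kl" contraction above accumulates an outer product over both
# the sample and row axes, and dot() against W.T matches the einsum noted in
# the comment. A minimal standalone sketch (assuming xp is NumPy; the shapes
# below are made up for illustration):
import numpy as xp

num_samples, num_rows, num_filter_inputs, num_filters = 2, 6, 27, 8
input_rows = xp.random.randn(num_samples, num_rows, num_filter_inputs)
grad_out = xp.random.randn(num_samples, num_rows, num_filters)
W = xp.random.randn(num_filter_inputs, num_filters)

grad_W = xp.einsum("ijk,ijl->kl", input_rows, grad_out)

# Same result as explicitly summing per-row outer products.
ref = xp.zeros((num_filter_inputs, num_filters))
for i in range(num_samples):
    for j in range(num_rows):
        ref += xp.outer(input_rows[i, j], grad_out[i, j])
assert xp.allclose(grad_W, ref)

assert xp.allclose(xp.dot(grad_out, xp.transpose(W)),
                   xp.einsum("ijl,kl->ijk", grad_out, W))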
def backward(self, grad_out):
    num_samples = grad_out.shape[0]
    grad_out = grad_out.reshape((num_samples, -1))

    # This is basically a cross join between the last axes of x and grad_out,
    # summed over samples:
    #   grad_W_i = xp.einsum("ij,ik->ijk", self.x, grad_out)
    #   grad_b_i = grad_out
    #   self.grad_W = xp.sum(grad_W_i, axis=0)
    #   self.grad_b = xp.sum(grad_b_i, axis=0)
    #
    # x shape:        (num_samples, num_inputs)
    # grad_out shape: (num_samples, num_outputs)
    self.grad_W = xp.einsum("ij,ik->jk", self.x, grad_out)
    self.grad_b = xp.sum(grad_out, axis=0)

    # Equivalent: xp.einsum("ik,jk->ij", grad_out, self.W)
    grad_in = xp.dot(grad_out, xp.transpose(self.W))
    return grad_in.reshape((-1, *self.input_dim))
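# The fully connected case follows the same pattern: "ij,ik->jk" is just
# x.T @ grad_out, and dot(grad_out, W.T) matches the commented einsum.
# A minimal standalone sketch (assuming xp is NumPy):
import numpy as xp

num_samples, num_inputs, num_outputs = 4, 7, 3
x = xp.random.randn(num_samples, num_inputs)
W = xp.random.randn(num_inputs, num_outputs)
grad_out = xp.random.randn(num_samples, num_outputs)

grad_W = xp.einsum("ij,ik->jk", x, grad_out)
assert xp.allclose(grad_W, x.T @ grad_out)

grad_in = xp.dot(grad_out, xp.transpose(W))
assert xp.allclose(grad_in, xp.einsum("ik,jk->ij", grad_out, W))

grad_b = xp.sum(grad_out, axis=0)   # just the batch sum of upstream grads
assert grad_b.shape == (num_outputs,)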
def forward(self, yhat, y):
    self.yhat, self.y = yhat, y
    loss = -xp.sum(y * xp.log(yhat + 1e-6))
    return loss
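# For one-hot targets the loss reduces to the negative log of the probability
# assigned to the correct class, summed over samples; the 1e-6 term only
# guards against log(0). A minimal usage sketch (assuming xp is NumPy):
import numpy as xp

yhat = xp.array([[0.7, 0.2, 0.1],
                 [0.1, 0.8, 0.1]])
y = xp.array([[1.0, 0.0, 0.0],     # one-hot targets
              [0.0, 1.0, 0.0]])

loss = -xp.sum(y * xp.log(yhat + 1e-6))
assert xp.isclose(loss, -(xp.log(0.7 + 1e-6) + xp.log(0.8 + 1e-6)))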