def backward_2d(self, dy_2d):
    """Back-propagate ``dy_2d`` through a normalize-then-scale/shift step.

    Reads the cached arrays ``self.xn``, ``self.xc`` and ``self.std`` and the
    scale ``self.gamma.d``; all reductions run over axis 0 of ``dy_2d``.
    NOTE(review): presumably ``xc`` is the mean-centred input, ``std`` its
    per-column standard deviation and ``xn = xc / std`` -- the forward pass
    is not visible here, confirm against it.

    Side effects:
        ``self.beta.g``  <- column sums of ``dy_2d``.
        ``self.gamma.g`` <- column sums of ``self.xn * dy_2d``.

    Returns:
        The gradient with respect to the un-normalized 2-D input.
    """
    n_rows = dy_2d.shape[0]
    std = self.std

    # Gradients of the shift and scale parameters.
    self.beta.g = dy_2d.sum(axis=0)
    self.gamma.g = np.sum(self.xn * dy_2d, axis=0)

    # Gradient flowing into the normalized values.
    grad_xn = self.gamma.d * dy_2d

    # Path through the denominator (std, then variance).
    grad_std = -np.sum((grad_xn * self.xc) / (std * std), axis=0)
    grad_var = 0.5 * grad_std / std

    # Direct path through the numerator, plus the variance path, accumulated
    # in place on the freshly created array.
    grad_xc = grad_xn / std
    grad_xc += (2.0 / n_rows) * self.xc * grad_var

    # Centring subtracted the column mean, so remove the mean gradient.
    grad_mu = np.sum(grad_xc, axis=0)
    return grad_xc - grad_mu / n_rows
def backward(self):
    """Back-propagate through time for a gated recurrent layer with a cell state.

    Walks the time axis (axis 1 of ``self.x.d``) from the last step to the
    first.  At each step the cached activations ``self.A[:, t]`` are split
    column-wise into four blocks of width ``M`` (second entry of
    ``self.shape``), used in the order f, g, i, o; ``g`` is differentiated as
    ``1 - g**2`` and the other three as ``s * (1 - s)``.
    NOTE(review): presumably f/i/o are sigmoid gates and g a tanh candidate --
    the forward pass is not visible here, confirm against it.

    Side effects:
        ``self.x.g`` receives an array shaped like ``self.x.d``.
        ``self.h.g`` is reset to 0 and then carries the hidden-state gradient
        from step ``t`` to step ``t - 1``.
        ``self.b.g``, ``self.W.g`` and ``self.U.g`` are assigned once per time
        step.  NOTE(review): whether those assignments accumulate or overwrite
        depends on how ``.g`` is implemented on these objects, which is not
        visible here -- confirm.
    """
    _, M = self.shape
    N, T, L = self.x.d.shape
    grad_x = np.empty((N, T, L), dtype=self.dtype)

    self.h.g = 0
    grad_cell = 0  # cell-state gradient arriving from step t + 1

    for t in range(T - 1, -1, -1):
        # Output gradient at this step plus what flowed back from step t + 1.
        grad_h = self.y.g[:, t] + self.h.g
        tanh_cell = np.tanh(self.c[:, t])

        gates = self.A[:, t]
        f = gates[:, :M]
        g = gates[:, M:2 * M]
        i = gates[:, 2 * M:3 * M]
        o = gates[:, 3 * M:]

        # Total gradient reaching the cell state at step t.
        grad_state = grad_cell + (grad_h * o) * (1 - tanh_cell**2)

        # The first step uses the stored initial cell state.
        cell_before = self.c_prev if t == 0 else self.c[:, t - 1]
        grad_cell = grad_state * f

        # Gradients w.r.t. the gate outputs, then through each non-linearity
        # (applied in place on the freshly created arrays).
        grad_f = grad_state * cell_before
        grad_f *= f * (1 - f)
        grad_g = grad_state * i
        grad_g *= 1 - g**2
        grad_i = grad_state * g
        grad_i *= i * (1 - i)
        grad_o = grad_h * tanh_cell
        grad_o *= o * (1 - o)

        # Same column order as the split above: f, g, i, o.
        grad_a = np.hstack((grad_f, grad_g, grad_i, grad_o))

        self.b.g = np.sum(grad_a, axis=0)
        self.W.g = self.x.d[:, t].T @ grad_a
        # The first step uses the stored initial hidden state.
        hidden_before = self.h_prev if t == 0 else self.y.d[:, t - 1]
        self.U.g = hidden_before.T @ grad_a
        self.h.g = grad_a @ self.U.d.T
        grad_x[:, t] = grad_a @ self.W.d.T

    self.x.g = grad_x
def __init__(self, corpus, power=0.75):
    """Build a smoothed unigram distribution from ``corpus``.

    Args:
        corpus: Iterable of hashable items.  The counts are looked up for the
            integers ``0 .. vocab_size - 1``, so the items are expected to be
            contiguous integer ids; an id in that range that never occurs
            gets a count of 0.
        power: Exponent applied to the raw counts before normalizing.

    Attributes set:
        vocab_size: Number of distinct items in ``corpus``.
        probability: float64 array of length ``vocab_size`` that sums to 1.
    """
    frequency = collections.Counter(corpus)
    self.vocab_size = len(frequency)

    # A Counter returns 0 for missing keys, so gaps in the ids are harmless.
    raw_counts = np.array(
        [frequency[word_id] for word_id in range(self.vocab_size)],
        dtype=np.float64,
    )

    weighted = np.power(raw_counts, power)
    weighted /= np.sum(weighted)
    self.probability = weighted
def ppmi(C, verbose=False, eps=1e-8):
    """Compute the positive pointwise mutual information of a count matrix.

    Each entry is ``max(0, log2(C[i, j] * N / (S[i] * S[j]) + eps))`` where
    ``N`` is the sum of all entries and ``S`` holds the column sums of ``C``.
    ``S`` is indexed by both the row and the column index, so ``C`` is
    expected to be square (and is typically symmetric).

    Args:
        C: 2-D array of co-occurrence counts.
        verbose: When true, print a progress line roughly every 1% of cells.
        eps: Small constant added inside the logarithm to avoid ``log2(0)``.

    Returns:
        A float32 array with the same shape as ``C``.
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    n_rows, n_cols = C.shape[0], C.shape[1]
    total = n_rows * n_cols

    # With fewer than 100 cells ``total // 100`` is 0, and using it as a
    # modulus raised ZeroDivisionError in verbose mode; report at least
    # every cell instead.
    report_every = max(1, total // 100)

    cnt = 0
    for i in range(n_rows):
        for j in range(n_cols):
            pmi = np.log2(C[i, j] * N / (S[j] * S[i]) + eps)
            M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % report_every == 0:
                    print("%.1f%% done" % (100 * cnt / total))
    return M
def backward(self):
    """Back-propagate through a convolution computed as a 2-D matrix product.

    Reads the cached ``self.x_2d`` and ``self.W_2d`` together with the
    upstream gradient ``self.y.g``.
    NOTE(review): ``self.x_2d`` / ``self.W_2d`` are presumably the
    im2col-expanded input and the flattened filters saved by the forward
    pass -- not visible here, confirm.

    Side effects:
        ``self.b.g`` <- sum of the upstream gradient over every axis except
            the filter axis (axis 1 of ``self.y.g``).
        ``self.W.g`` <- gradient reshaped to ``self.W.shape``.
        ``self.x.g`` <- result of ``col2im`` for the shape of ``self.x.d``.
    """
    n_filters, n_channels, filter_h, filter_w = self.W.shape

    # Move the filter axis last and flatten everything else into rows.
    grad_out_2d = self.y.g.transpose(0, 2, 3, 1).reshape(-1, n_filters)

    self.b.g = np.sum(grad_out_2d, axis=0)

    grad_w_2d = self.x_2d.T @ grad_out_2d
    self.W.g = grad_w_2d.transpose(1, 0).reshape(
        n_filters, n_channels, filter_h, filter_w
    )

    grad_x_2d = grad_out_2d @ self.W_2d.T
    self.x.g = col2im(
        grad_x_2d,
        self.x.d.shape,
        filter_h,
        filter_w,
        self.stride.d,
        self.padding.d,
    )
def forward(self):
    """Apply softmax over the last axis of ``self.x.d`` and compute the mean
    cross-entropy against the integer class indices in ``self.t.d``.

    Side effects:
        ``self.y.d``    <- softmax probabilities, same shape as ``self.x.d``.
        ``self.y_2d``   <- ``self.y.d`` reshaped to ``(-1, n_classes)``.
        ``self.t_1d``   <- ``self.t.d`` flattened.
        ``self.size``   <- number of rows in ``self.y_2d``.
        ``self.loss.d`` <- mean negative log-probability of the target class.
    """
    logits = self.x.d
    # Shifting by the row maximum makes every exponent <= 0, so np.exp
    # cannot overflow; the softmax result is unchanged by the shift.
    probs = np.exp(logits - logits.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)
    self.y.d = probs

    n_classes = self.y.d.shape[-1]
    self.y_2d = self.y.d.reshape(-1, n_classes)
    self.t_1d = self.t.d.reshape(-1)
    self.size = self.y_2d.shape[0]

    # Probability assigned to the target class of each row.
    picked = self.y_2d[np.arange(self.size), self.t_1d]
    # 1e-7 guards against log(0).
    self.loss.d = -np.sum(np.log(picked + 1e-7)) / self.size
def clip_grads(self, max_grad):
    """Rescale all weight gradients in place so that their combined L2 norm
    does not exceed ``max_grad``.

    The norm is taken over the ``grad`` of every entry in
    ``self.weight_variables`` jointly.  Gradients are left untouched when the
    norm is already within the limit.

    Args:
        max_grad: Upper bound for the global gradient norm.
    """
    grads = [variable.grad for variable in self.weight_variables]

    squared_sum = sum((np.sum(g**2) for g in grads), 0.0)  # type: ignore
    global_norm = np.sqrt(squared_sum)

    # 1e-6 keeps the division finite when every gradient is zero.
    scale = max_grad / (global_norm + 1e-6)
    if scale < 1:
        for g in grads:
            g *= scale  # type: ignore
def backward(self):
    """Back-propagate through time for a simple recurrent layer.

    Walks the time axis (axis 1 of ``self.x.d``) from the last step to the
    first.  The upstream gradient at each step is multiplied by
    ``1 - y**2``, i.e. the tanh derivative if ``self.y.d`` holds tanh
    outputs (NOTE(review): forward pass not visible here -- confirm).

    Side effects:
        ``self.x.g`` receives an array shaped like ``self.x.d``.
        ``self.h.g`` is reset to 0 and then carries the hidden-state gradient
        from step ``t`` to step ``t - 1``.
        ``self.b.g``, ``self.W.g`` and ``self.U.g`` are assigned once per time
        step.  NOTE(review): whether those assignments accumulate or overwrite
        depends on how ``.g`` is implemented on these objects, which is not
        visible here -- confirm.
    """
    # NOTE(review): neither value is used below (L is rebound on the next
    # line); kept so a malformed ``self.shape`` still fails here.
    L, M = self.shape
    N, T, L = self.x.d.shape
    grad_x = np.empty((N, T, L), dtype=self.dtype)

    self.h.g = 0
    for t in range(T - 1, -1, -1):
        # Output gradient at this step plus what flowed back from step t + 1.
        upstream = self.y.g[:, t] + self.h.g
        grad_pre = upstream * (1 - self.y.d[:, t]**2)

        self.b.g = np.sum(grad_pre, axis=0)
        self.W.g = self.x.d[:, t].T @ grad_pre
        # The first step uses the stored initial hidden state.
        hidden_before = self.h_prev if t == 0 else self.y.d[:, t - 1]
        self.U.g = hidden_before.T @ grad_pre
        self.h.g = grad_pre @ self.U.d.T
        grad_x[:, t] = grad_pre @ self.W.d.T

    self.x.g = grad_x
def forward(self):
    """Apply the logistic sigmoid to ``self.x.d`` and compute the mean binary
    cross-entropy against the integer labels in ``self.t.d``.

    Side effects:
        ``self.y.d``    <- element-wise sigmoid of ``self.x.d``.
        ``self.size``   <- number of elements in ``self.y.d``.
        ``self.loss.d`` <- mean negative log-probability of the labelled class.
    """
    self.y.d = 1 / (1 + np.exp(-self.x.d))

    prob_one = self.y.d.reshape(-1)
    self.size = prob_one.shape[0]
    labels = self.t.d.reshape(-1)

    # Column 0 holds P(label = 0), column 1 holds P(label = 1); each label
    # selects its own column.
    both = np.c_[1 - prob_one, prob_one]
    chosen = both[np.arange(self.size), labels]

    # 1e-7 guards against log(0).
    self.loss.d = -np.sum(np.log(chosen + 1e-7)) / self.size
def forward(self):
    """Dot each row of ``self.x.d`` with the row of ``self.W.d`` selected by
    the matching index in ``self.t.d``.

    Side effects:
        ``self.t_W`` <- the selected rows ``self.W.d[self.t.d]`` (kept on the
            instance; NOTE(review): presumably for reuse in the backward pass).
        ``self.y.d`` <- sum over axis 1 of the element-wise product.
    """
    selected_rows = self.W.d[self.t.d]
    self.t_W = selected_rows

    products = self.x.d * self.t_W
    self.y.d = np.sum(products, axis=1)
def backward(self):
    """Back-propagate through a matrix product whose results were summed over
    axis 1 and divided by ``self.shape[0]``.

    Every row of the per-sample input gradient is repeated ``self.shape[0]``
    times and reshaped to the shape of ``self.x.d``, so each position along
    the summed axis receives the same gradient.  The reshape only succeeds
    when ``self.shape[0]`` matches the length of that axis.

    Side effects:
        ``self.x.g`` <- gradient shaped like ``self.x.d``.
        ``self.W.g`` <- ``self.x.d.T @ self.y.g`` summed over axis 1 and
            divided by ``self.shape[0]``.
    """
    n_averaged = self.shape[0]
    grad_out = self.y.g

    grad_per_row = (grad_out @ self.W.d.T) / n_averaged
    repeated = np.repeat(grad_per_row, n_averaged, axis=0)
    self.x.g = repeated.reshape(*self.x.d.shape)

    grad_w_stack = self.x.d.T @ grad_out
    self.W.g = np.sum(grad_w_stack, axis=1) / n_averaged
def forward(self):
    """Multiply ``self.x.d`` by ``self.W.d``, sum the products over axis 1 and
    divide by ``self.shape[0]``.

    NOTE(review): this is an average over axis 1 only if ``self.shape[0]``
    equals the length of that axis -- confirm against the constructor.

    Side effects:
        ``self.y.d`` <- the scaled sum.
    """
    projected = self.x.d @ self.W.d
    summed = np.sum(projected, axis=1)
    self.y.d = summed / self.shape[0]
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of ``x`` and ``y``.

    Args:
        x: First vector.
        y: Second vector.
        eps: Small constant added to each norm so that an all-zero vector
            yields 0 instead of a division by zero.

    Returns:
        The dot product of the two vectors after each has been divided by
        its L2 norm plus ``eps``.
    """
    x_norm = np.sqrt(np.sum(x ** 2))
    y_norm = np.sqrt(np.sum(y ** 2))

    x_unit = x / (x_norm + eps)
    y_unit = y / (y_norm + eps)
    return np.dot(x_unit, y_unit)