def update(self, params, grads): if not self.m: self.m, self.v = [], [] for param in params: self.m.append(np.zeros_like(param)) self.v.append(np.zeros_like(param)) self.iter += 1 lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter) for i in range(len(params)): self.m[i] += (1 - self.beta1) * (grads[i] - self.m[i]) self.v[i] += (1 - self.beta2) * (grads[i] ** 2 - self.v[i]) params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)
def __init__(self, vocab_size, wordvec_size, hidden_size): V, D, H = vocab_size, wordvec_size, hidden_size rn = np.random.randn embed_W = (rn(V, D) / 100).astype('f') lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f') lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f') lstm_b = np.zeros(4 * H).astype('f') self.embed = TimeEmbedding(embed_W) self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False) self.params = self.embed.params + self.lstm.params self.grads = self.embed.grads + self.lstm.grads self.hs = None
def __init__(self, vocab_size, wordvec_size, hidden_size): V, D, H = vocab_size, wordvec_size, hidden_size rn = np.random.randn embed_W = (rn(V, D) / 100).astype('f') lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f') lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f') lstm_b = np.zeros(4 * H).astype('f') affine_W = (rn(H, V) / np.sqrt(H)).astype('f') affine_b = np.zeros(V).astype('f') self.embed = TimeEmbedding(embed_W) self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True) self.affine = TimeAffine(affine_W, affine_b) self.params, self.grads = [], [] for layer in (self.embed, self.lstm, self.affine): self.params += layer.params self.grads += layer.grads
def clip_grads(grads, max_norm): total_norm = 0 for grad in grads: total_norm += np.sum(grad**2) total_norm = np.sqrt(total_norm) rate = max_norm / (total_norm + 1e-6) if rate < 1: for grad in grads: grad *= rate
def __init__(self, vocab_size, wordvec_size, hidden_size): V, D, H = vocab_size, wordvec_size, hidden_size H = H * 2 # 順方向と逆方向が足し合わされた行列を受け取るため rn = np.random.randn embed_W = (rn(V, D) / 100).astype('f') lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f') lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f') lstm_b = np.zeros(4 * H).astype('f') affine_W = (rn(2 * H, V) / np.sqrt(H)).astype('f') affine_b = np.zeros(V).astype('f') self.embed = TimeEmbedding(embed_W) self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True) self.attention = TimeAttention() self.affine = TimeAffine(affine_W, affine_b) layers = [self.embed, self.lstm, self.attention, self.affine] self.params, self.grads = [], [] for layer in layers: self.params += layer.params self.grads += layer.grads