class Decoder:
    """Sequence decoder built from three layers: embedding -> LSTM -> affine.

    The LSTM is created with ``stateful=True`` and its state is overwritten
    with the supplied ``h`` at the start of every ``forward``/``generate``
    call. ``params`` and ``grads`` are flat lists gathered from the three
    layers in the order embedding, LSTM, affine.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers with freshly drawn float32 weights.

        Args:
            vocab_size: number of rows of the embedding matrix and number of
                columns of the affine matrix.
            wordvec_size: width of an embedding vector.
            hidden_size: LSTM hidden width (gate matrices are 4x this wide).
        """
        randn = np.random.randn

        # Draws happen in a fixed order (embedding, LSTM input, LSTM
        # recurrent, affine) so a seeded RNG always yields the same weights.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (
            randn(wordvec_size, 4 * hidden_size) / np.sqrt(wordvec_size)
        ).astype('f')
        lstm_Wh = (
            randn(hidden_size, 4 * hidden_size) / np.sqrt(hidden_size)
        ).astype('f')
        lstm_b = np.zeros(4 * hidden_size).astype('f')
        affine_W = (
            randn(hidden_size, vocab_size) / np.sqrt(hidden_size)
        ).astype('f')
        affine_b = np.zeros(vocab_size).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.affine):
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

    def forward(self, xs, h):
        """Return the affine layer's scores for ``xs`` given initial state ``h``.

        ``h`` is installed as the LSTM state before the input is pushed
        through embedding, LSTM and affine in that order.
        """
        self.lstm.set_state(h)
        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Back-propagate ``dscore`` through affine, LSTM and embedding.

        Returns:
            The LSTM layer's ``dh`` attribute, read after all three layers
            have run their backward pass.
        """
        dhidden = self.affine.backward(dscore)
        dembedded = self.lstm.backward(dhidden)
        self.embed.backward(dembedded)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily produce ``sample_size`` ids, starting from ``start_id``.

        Each step feeds the previous id as a (1, 1) array and picks the
        arg-max of the flattened score as the next id.

        Returns:
            A list of Python ints; ``start_id`` itself is not included.
        """
        self.lstm.set_state(h)

        current_id = start_id
        sampled = []
        for _ in range(sample_size):
            x = np.array(current_id).reshape((1, 1))
            hidden = self.lstm.forward(self.embed.forward(x))
            score = self.affine.forward(hidden)

            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled
class Encoder:
    """Sequence encoder: an embedding layer followed by an LSTM.

    ``forward`` returns only the LSTM output at the last time step; the full
    output is kept in ``self.hs`` so ``backward`` can build a gradient array
    of the same shape. The LSTM is created with ``stateful=False``.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the two layers with freshly drawn float32 weights.

        Args:
            vocab_size: number of rows of the embedding matrix.
            wordvec_size: width of an embedding vector.
            hidden_size: LSTM hidden width (gate matrices are 4x this wide).
        """
        randn = np.random.randn

        # Draw order (embedding, LSTM input, LSTM recurrent) is fixed so a
        # seeded RNG always yields the same weights.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (
            randn(wordvec_size, 4 * hidden_size) / np.sqrt(wordvec_size)
        ).astype('f')
        lstm_Wh = (
            randn(hidden_size, 4 * hidden_size) / np.sqrt(hidden_size)
        ).astype('f')
        lstm_b = np.zeros(4 * hidden_size).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads

        # Full LSTM output of the most recent forward pass.
        self.hs = None

    def forward(self, xs):
        """Encode ``xs`` and return the LSTM output at the final time step."""
        embedded = self.embed.forward(xs)
        self.hs = self.lstm.forward(embedded)
        return self.hs[:, -1, :]

    def backward(self, dh):
        """Back-propagate ``dh``, the gradient of the final-step output.

        Only the last time step contributed to ``forward``'s return value, so
        every other step of the gradient handed to the LSTM is zero.

        Returns:
            Whatever the embedding layer's ``backward`` returns.
        """
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        dembedded = self.lstm.backward(dhs)
        return self.embed.backward(dembedded)
class PeekyDecoder:
    """Decoder that lets the LSTM and affine layers "peek" at the vector ``h``.

    ``h`` is used as the initial LSTM state and is also concatenated (along
    the last axis) in front of the embedding output before the LSTM, and in
    front of the LSTM output before the affine layer. The LSTM input is
    therefore ``H + D`` wide and the affine input ``H + H`` wide.
    """

    @staticmethod
    def _init_weights(V, D, H):
        """Draw the initial float32 weights for the three layers.

        Each matrix other than the embedding is scaled by ``1/sqrt(fan_in)``,
        where fan-in is the matrix's own row count. Deriving the scale from
        the shape keeps the two in sync: the peeky concatenation widens the
        LSTM input to ``H + D`` and the affine input to ``H + H``, and the
        scale must follow.

        Returns:
            ``(embed_W, lstm_Wx, lstm_Wh, lstm_b, affine_W, affine_b)``.
        """
        rn = np.random.randn

        def scaled(fan_in, fan_out):
            return (rn(fan_in, fan_out) / np.sqrt(fan_in)).astype('f')

        # Draw order is fixed so a seeded RNG always yields the same weights.
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = scaled(H + D, 4 * H)
        lstm_Wh = scaled(H, 4 * H)
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = scaled(H + H, V)
        affine_b = np.zeros(V).astype('f')
        return embed_W, lstm_Wx, lstm_Wh, lstm_b, affine_W, affine_b

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the embedding, LSTM (``stateful=True``) and affine layers.

        Args:
            vocab_size: number of rows of the embedding matrix and number of
                columns of the affine matrix.
            wordvec_size: width of an embedding vector.
            hidden_size: LSTM hidden width; also the width of the peeked ``h``.
        """
        embed_W, lstm_Wx, lstm_Wh, lstm_b, affine_W, affine_b = (
            self._init_weights(vocab_size, wordvec_size, hidden_size)
        )

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

        # Hidden width seen by the last forward(); backward() uses it to
        # split gradients back into "peeked h" and "real input" parts.
        self.cache = None

    def forward(self, xs, h):
        """Return the affine layer's scores for ``xs`` while peeking at ``h``.

        Args:
            xs: 2-D array of ids, unpacked as ``(N, T)``.
            h: 2-D array, unpacked as ``(N, H)``.
        """
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Copy each row of h across all T time steps -> (N, T, H).
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)

        out = np.concatenate((hs, out), axis=2)
        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the total gradient for ``h``.

        ``h`` reached the output along three paths (initial LSTM state, LSTM
        input, affine input); their gradients are summed. The two peeked
        paths are first summed over the time axis because ``h`` was repeated
        across it in ``forward``.
        """
        H = self.cache

        dout = self.affine.backward(dscore)
        # First H channels belong to the peeked h, the rest to the LSTM output.
        dhs0, dout = dout[:, :, :H], dout[:, :, H:]

        dout = self.lstm.backward(dout)
        # First H channels belong to the peeked h, the rest to the embedding.
        dhs1, dout = dout[:, :, :H], dout[:, :, H:]
        self.embed.backward(dout)

        dhs = dhs0 + dhs1
        return self.lstm.dh + np.sum(dhs, axis=1)

    def generate(self, h, start_id, sample_size):
        """Greedily produce ``sample_size`` ids, starting from ``start_id``.

        ``h`` is reshaped to ``(1, 1, H)``, so it must hold exactly one row.
        Each step feeds the previous id as a (1, 1) array and takes the
        arg-max of the flattened score as the next id.

        Returns:
            A list of Python ints; ``start_id`` itself is not included.
        """
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            char_id = np.argmax(score.flatten())
            sampled.append(int(char_id))

        return sampled