class Decoder:
    """Sequence decoder built from TimeEmbedding, a stateful TimeLSTM and TimeAffine."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Draw the initial weights and wire up the three layers.

        vocab_size: number of rows of the embedding matrix and columns of the
            affine weight.
        wordvec_size: width of one embedding vector.
        hidden_size: LSTM hidden width (the gate matrices are 4x this wide).
        """
        randn = np.random.randn
        f32 = np.float32
        gates = 4 * hidden_size

        # The random draws keep their original order so a seeded run yields
        # exactly the same weights.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype(f32)
        lstm_Wx = (randn(wordvec_size, gates) / np.sqrt(wordvec_size)).astype(f32)
        lstm_Wh = (randn(hidden_size, gates) / np.sqrt(hidden_size)).astype(f32)
        lstm_b = np.zeros(gates, dtype=f32)
        affine_W = (randn(hidden_size, vocab_size) / np.sqrt(hidden_size)).astype(f32)
        affine_b = np.zeros(vocab_size, dtype=f32)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        # Flat parameter / gradient lists in layer order.
        stack = (self.embed, self.lstm, self.affine)
        self.params = [p for layer in stack for p in layer.params]
        self.grads = [g for layer in stack for g in layer.grads]

    def forward(self, xs, h):
        """Set the LSTM state to ``h`` and return the affine scores for ``xs``."""
        self.lstm.set_state(h)
        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore):
        """Backpropagate ``dscore`` through affine, LSTM and embedding.

        Returns ``self.lstm.dh`` as left behind by the LSTM backward pass.
        """
        grad = dscore
        for layer in (self.affine, self.lstm, self.embed):
            grad = layer.backward(grad)
        return self.lstm.dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids.

        h: hidden state received from the Encoder
        start_id: first character id fed to the decoder
        sample_size: number of characters to generate
        """
        self.lstm.set_state(h)
        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array(current_id).reshape(1, 1)
            score = self.affine.forward(self.lstm.forward(self.embed.forward(x)))
            # Greedy choice: the highest-scoring id becomes the next input.
            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))
        return sampled
def __init__(self, vocab_size: int = 10000, wordvec_size: int = 100, hidden_size: int = 100) -> None:
    """Build an Embedding -> LSTM -> Affine language model with a loss layer.

    Args:
        vocab_size: rows of the embedding matrix and columns of the affine weight.
        wordvec_size: width of one embedding vector.
        hidden_size: LSTM hidden width; the gate matrices are 4x this wide.
    """
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    # Weight initialisation.
    embed_W = (rn(V, D) / 100).astype('f')
    # The cast has to wrap the whole quotient: casting only the sqrt scalar
    # leaves the weight matrix as float64.
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    # NOTE(review): sized (H, 4H) on the assumption that TimeLSTM multiplies
    # this matrix with the H-wide hidden state; the previous (D, 4H) shape
    # only lined up when wordvec_size == hidden_size.
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    # Layer stack, in forward order.
    self.layers = [
        TimeEmbedding(embed_W),
        TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
        TimeAffine(affine_W, affine_b),
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layer = self.layers[1]

    # Collect every layer's weights and gradients into flat lists.
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
def __init__(self, vocabulary_size, wordvec_size, hidden_size):
    """Assemble a TimeEmbedding -> TimeRNN -> TimeAffine model plus its loss layer.

    vocabulary_size: rows of the embedding matrix and columns of the affine weight.
    wordvec_size: width of one embedding vector.
    hidden_size: width of the recurrent hidden state.
    """
    randn = np.random.randn
    f32 = np.float32

    # Initial weights; the draw order (embedding, RNN input, RNN hidden,
    # affine) is kept so seeded runs reproduce the same values.
    embed_W = (randn(vocabulary_size, wordvec_size) / 100).astype(f32)
    rnn_Wx = (randn(wordvec_size, hidden_size) / np.sqrt(wordvec_size)).astype(f32)
    rnn_Wh = (randn(hidden_size, hidden_size) / np.sqrt(hidden_size)).astype(f32)
    rnn_b = np.zeros(hidden_size, dtype=f32)
    affine_W = (randn(hidden_size, vocabulary_size) / np.sqrt(hidden_size)).astype(f32)
    affine_b = np.zeros(vocabulary_size, dtype=f32)

    # Layers, in forward order.
    embedding = TimeEmbedding(embed_W)
    recurrent = TimeRNN(rnn_Wx, rnn_Wh, rnn_b, stateful=True)
    projection = TimeAffine(affine_W, affine_b)
    self.layers = [embedding, recurrent, projection]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.rnn_layer = recurrent

    # Flat lists of all weights and gradients.
    self.params = [param for layer in self.layers for param in layer.params]
    self.grads = [grad for layer in self.layers for grad in layer.grads]
def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Create the weights and layers of a simple RNN language model.

    vocab_size: rows of the embedding matrix and columns of the affine weight.
    wordvec_size: width of one embedding vector.
    hidden_size: width of the recurrent hidden state.
    """
    draw = np.random.randn
    scale_in = np.sqrt(wordvec_size)
    scale_hidden = np.sqrt(hidden_size)

    # Weight initialisation (random draws stay in the original order).
    embed_W = (draw(vocab_size, wordvec_size) / 100).astype('f')
    rnn_Wx = (draw(wordvec_size, hidden_size) / scale_in).astype('f')
    rnn_Wh = (draw(hidden_size, hidden_size) / scale_hidden).astype('f')
    rnn_b = np.zeros(hidden_size).astype('f')
    affine_W = (draw(hidden_size, vocab_size) / scale_hidden).astype('f')
    affine_b = np.zeros(vocab_size).astype('f')

    # Layer creation.
    self.layers = [
        TimeEmbedding(embed_W),
        TimeRNN(rnn_Wx, rnn_Wh, rnn_b, stateful=True),
        TimeAffine(affine_W, affine_b),
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.rnn_layer = self.layers[1]

    # Gather all weights and gradients into lists.
    self.params = []
    self.grads = []
    for layer in self.layers:
        self.params.extend(layer.params)
        self.grads.extend(layer.grads)
def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
    """Build a TimeEmbedding -> TimeLSTM -> TimeAffine language model.

    vocab_size: rows of the embedding matrix and columns of the affine weight.
    wordvec_size: width of one embedding vector.
    hidden_size: LSTM hidden width; the gate matrices are 4x this wide.
    """

    def scaled_normal(rows, cols):
        # Standard-normal draw divided by sqrt(rows), stored as float32.
        return (np.random.randn(rows, cols) / np.sqrt(rows)).astype('f')

    gate_cols = 4 * hidden_size

    # Weights; the random draws happen in the same order as before.
    embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype('f')
    lstm_Wx = scaled_normal(wordvec_size, gate_cols)
    lstm_Wh = scaled_normal(hidden_size, gate_cols)
    lstm_b = np.zeros(gate_cols, dtype='f')
    affine_W = scaled_normal(hidden_size, vocab_size)
    affine_b = np.zeros(vocab_size, dtype='f')

    # Layers.
    lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.layers = [TimeEmbedding(embed_W), lstm, TimeAffine(affine_W, affine_b)]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layer = lstm

    # All weights and gradients, flattened in layer order.
    self.params = [w for layer in self.layers for w in layer.params]
    self.grads = [g for layer in self.layers for g in layer.grads]
def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Set up the weights, layers and loss layer of a simple RNN language model.

    vocab_size: rows of the embedding matrix and columns of the affine weight.
    wordvec_size: width of one embedding vector.
    hidden_size: width of the recurrent hidden state.
    """
    n_vocab, n_vec, n_hidden = vocab_size, wordvec_size, hidden_size
    normal = np.random.randn
    dtype = np.float32

    # Weight initialisation; statement order fixes the random-draw order.
    embed_W = normal(n_vocab, n_vec) / 100
    rnn_Wx = normal(n_vec, n_hidden) / np.sqrt(n_vec)
    rnn_Wh = normal(n_hidden, n_hidden) / np.sqrt(n_hidden)
    affine_W = normal(n_hidden, n_vocab) / np.sqrt(n_hidden)
    embed_W, rnn_Wx, rnn_Wh, affine_W = (
        w.astype(dtype) for w in (embed_W, rnn_Wx, rnn_Wh, affine_W)
    )
    rnn_b = np.zeros(n_hidden, dtype=dtype)
    affine_b = np.zeros(n_vocab, dtype=dtype)

    # Layers.
    self.layers = [
        TimeEmbedding(embed_W),
        TimeRNN(rnn_Wx, rnn_Wh, rnn_b, stateful=True),
        TimeAffine(affine_W, affine_b),
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    _, self.rnn_layer, _ = self.layers

    # Collect all weights and gradients.
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params.extend(layer.params)
        self.grads.extend(layer.grads)
def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Create the embedding, stateful LSTM and affine layers of the decoder.

    vocab_size: rows of the embedding matrix and columns of the affine weight.
    wordvec_size: width of one embedding vector.
    hidden_size: LSTM hidden width; the gate matrices are 4x this wide.
    """
    randn = np.random.randn
    gate_cols = 4 * hidden_size
    root_hidden = np.sqrt(hidden_size)

    # Random draws stay in their original order: embedding, Wx, Wh, affine.
    embed_W = (randn(vocab_size, wordvec_size) / 100).astype(np.float32)
    lstm_Wx = (randn(wordvec_size, gate_cols) / np.sqrt(wordvec_size)).astype(np.float32)
    lstm_Wh = (randn(hidden_size, gate_cols) / root_hidden).astype(np.float32)
    lstm_b = np.zeros(gate_cols, dtype=np.float32)
    affine_W = (randn(hidden_size, vocab_size) / root_hidden).astype(np.float32)
    affine_b = np.zeros(vocab_size, dtype=np.float32)

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.affine = TimeAffine(affine_W, affine_b)

    # Flat weight / gradient lists in layer order.
    self.params = []
    self.grads = []
    for layer in [self.embed, self.lstm, self.affine]:
        self.params.extend(layer.params)
        self.grads.extend(layer.grads)
def __init__(self, vocab_size, wordvec_size, hidden_size):
    """Create the embedding, LSTM, attention and affine layers.

    vocab_size: rows of the embedding matrix and columns of the affine weight.
    wordvec_size: width of one embedding vector.
    hidden_size: LSTM hidden width; the affine weight has ``2 * hidden_size`` rows.
    """
    rn = np.random.randn
    gate_cols = 4 * hidden_size
    affine_rows = 2 * hidden_size

    # Random draws stay in their original order.
    embed_W = (rn(vocab_size, wordvec_size) / 100).astype('f')
    lstm_Wx = (rn(wordvec_size, gate_cols) / np.sqrt(wordvec_size)).astype('f')
    lstm_Wh = (rn(hidden_size, gate_cols) / np.sqrt(hidden_size)).astype('f')
    lstm_b = np.zeros(gate_cols, dtype='f')
    affine_W = (rn(affine_rows, vocab_size) / np.sqrt(affine_rows)).astype('f')
    affine_b = np.zeros(vocab_size, dtype='f')

    self.embed = TimeEmbedding(embed_W)
    # NOTE(review): keyword is spelled `statefull` here; confirm it matches
    # the TimeLSTM signature used by this project.
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, statefull=True)
    self.attention = TimeAttention()
    self.affine = TimeAffine(affine_W, affine_b)

    # Flat weight / gradient lists in layer order.
    stack = (self.embed, self.lstm, self.attention, self.affine)
    self.params = [p for layer in stack for p in layer.params]
    self.grads = [g for layer in stack for g in layer.grads]
def __init__(self, vocab_size=10000, word_vec=650, hidden_size=650, dropout_ratio=0.5):
    """Improved RNN language model.

    Two stacked LSTM layers, dropout applied in the depth direction, and one
    weight matrix shared between the Embedding and Affine layers.

    vocab_size: rows of the embedding matrix; also the length of the affine bias.
    word_vec: width of one embedding vector.
    hidden_size: LSTM hidden width. Must be an integer: the previous default
        of 0.5 made ``4 * hidden_size`` a float, which ``np.random.randn``
        rejects with a TypeError.
    dropout_ratio: ratio passed to every TimeDropout layer.
    """
    V, D, H = vocab_size, word_vec, hidden_size
    rn = np.random.randn

    # Weight initialisation.
    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b1 = np.zeros(4 * H).astype('f')
    lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b2 = np.zeros(4 * H).astype('f')
    affine_b = np.zeros(V).astype('f')

    # The three improvements: stacked LSTMs, dropout, weight sharing.
    # NOTE(review): the affine layer reuses embed_W.T, which is (D, V); this
    # presumably requires word_vec == hidden_size - confirm against TimeAffine.
    self.layers = [
        TimeEmbedding(embed_W),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
        TimeDropout(dropout_ratio),
        TimeAffine(embed_W.T, affine_b),  # weight sharing
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layers = [self.layers[2], self.layers[4]]
    self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

    # Gather all weights and gradients into lists.
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
    """Build a two-layer LSTM language model with dropout and weight sharing.

    vocab_size: rows of the embedding matrix; also the length of the affine bias.
    wordvec_size: width of one embedding vector.
    hidden_size: LSTM hidden width; the gate matrices are 4x this wide.
    dropout_ratio: ratio passed to every TimeDropout layer.
    """
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype(np.float32)
    lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype(np.float32)
    lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype(np.float32)
    lstm_b1 = np.zeros(4 * H).astype(np.float32)
    # The second LSTM is fed by the first LSTM (through dropout), so its
    # input weight is sized and scaled by H rather than D.
    # NOTE(review): assumes the first TimeLSTM emits H-wide vectors.
    lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype(np.float32)
    lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype(np.float32)
    lstm_b2 = np.zeros(4 * H).astype(np.float32)
    affine_b = np.zeros(V).astype(np.float32)

    # Three improvements:
    #   1) stack LSTM layers
    #   2) add Dropout layers (in the depth direction, between the LSTM layers)
    #   3) share one weight W (V, D) between TimeEmbedding and TimeAffine
    # NOTE(review): `statefull` spelling kept as-is; confirm it matches the
    # TimeLSTM signature used by this project.
    self.layers = [
        TimeEmbedding(embed_W),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, statefull=True),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, statefull=True),
        TimeDropout(dropout_ratio),
        TimeAffine(embed_W.T, affine_b),  # embed_W (V, D) is shared as embed_W.T (D, V)
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layers = [self.layers[2], self.layers[4]]
    self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

    # Gather the weights and gradients.
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
    """Build a two-layer LSTM language model with dropout between layers.

    The affine layer is given the transpose of the embedding matrix.

    vocab_size: rows of the embedding matrix; also the length of the affine bias.
    wordvec_size: width of one embedding vector.
    hidden_size: LSTM hidden width; the gate matrices are 4x this wide.
    dropout_ratio: ratio passed to every TimeDropout layer.
    """
    randn = np.random.randn
    f32 = np.float32
    gate_cols = 4 * hidden_size
    root_hidden = np.sqrt(hidden_size)

    # Weights; random draws stay in their original order.
    embed_W = (randn(vocab_size, wordvec_size) / 100).astype(f32)
    lstm_Wx1 = (randn(wordvec_size, gate_cols) / np.sqrt(wordvec_size)).astype(f32)
    lstm_Wh1 = (randn(hidden_size, gate_cols) / root_hidden).astype(f32)
    lstm_b1 = np.zeros(gate_cols, dtype=f32)
    lstm_Wx2 = (randn(hidden_size, gate_cols) / root_hidden).astype(f32)
    lstm_Wh2 = (randn(hidden_size, gate_cols) / root_hidden).astype(f32)
    lstm_b2 = np.zeros(gate_cols, dtype=f32)
    affine_b = np.zeros(vocab_size, dtype=f32)

    # Layers, constructed in forward order.
    embedding = TimeEmbedding(embed_W)
    drop_in = TimeDropout(dropout_ratio)
    lstm_low = TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True)
    drop_mid = TimeDropout(dropout_ratio)
    lstm_high = TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True)
    drop_out = TimeDropout(dropout_ratio)
    projection = TimeAffine(embed_W.T, affine_b)

    self.layers = [embedding, drop_in, lstm_low, drop_mid, lstm_high, drop_out, projection]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layers = [lstm_low, lstm_high]
    self.drop_layers = [drop_in, drop_mid, drop_out]

    # Flat lists of every layer's weights and gradients.
    self.params = [p for layer in self.layers for p in layer.params]
    self.grads = [g for layer in self.layers for g in layer.grads]
class AttentionDecoder:
    """Decoder that concatenates an attention context with the LSTM output before scoring."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the embedding, stateful LSTM, attention and affine layers.

        vocab_size: rows of the embedding matrix and columns of the affine weight.
        wordvec_size: width of one embedding vector.
        hidden_size: LSTM hidden width; the affine weight has ``2 * hidden_size`` rows.
        """
        randn = np.random.randn
        f32 = np.float32
        gate_cols = 4 * hidden_size
        affine_rows = 2 * hidden_size

        # Random draws stay in their original order.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype(f32)
        lstm_Wx = (randn(wordvec_size, gate_cols) / np.sqrt(wordvec_size)).astype(f32)
        lstm_Wh = (randn(hidden_size, gate_cols) / np.sqrt(hidden_size)).astype(f32)
        lstm_b = np.zeros(gate_cols, dtype=f32)
        affine_W = (randn(affine_rows, vocab_size) / np.sqrt(affine_rows)).astype(f32)
        affine_b = np.zeros(vocab_size, dtype=f32)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)

        # Flat weight / gradient lists in layer order.
        stack = (self.embed, self.lstm, self.attention, self.affine)
        self.params = [p for layer in stack for p in layer.params]
        self.grads = [g for layer in stack for g in layer.grads]

    def forward(self, xs, enc_hs):
        """Return scores for ``xs``, attending over ``enc_hs``.

        The LSTM state is set to ``enc_hs[:, -1]`` before running.
        """
        self.lstm.set_state(enc_hs[:, -1])
        embedded = self.embed.forward(xs)
        dec_hs = self.lstm.forward(embedded)
        context = self.attention.forward(enc_hs, dec_hs)
        # Context goes first, LSTM output second, along the last axis.
        joined = np.concatenate((context, dec_hs), axis=2)
        return self.affine.forward(joined)

    def backward(self, dscore):
        """Backpropagate ``dscore`` and return the gradient for ``enc_hs``."""
        daffine = self.affine.backward(dscore)
        _, _, joined_width = daffine.shape
        half = joined_width // 2
        # Undo the concatenation: first half is the context, second the LSTM output.
        dcontext = daffine[:, :, :half]
        ddec_direct = daffine[:, :, half:]
        denc_hs, ddec_attention = self.attention.backward(dcontext)
        dembedded = self.lstm.backward(ddec_direct + ddec_attention)
        # The last slice of enc_hs seeded the LSTM state, so it also gets dh.
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(dembedded)
        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting from ``start_id``."""
        self.lstm.set_state(enc_hs[:, -1])
        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array([current_id]).reshape(1, 1)
            dec_hs = self.lstm.forward(self.embed.forward(x))
            context = self.attention.forward(enc_hs, dec_hs)
            score = self.affine.forward(np.concatenate((context, dec_hs), axis=2))
            current_id = np.argmax(score.flatten())
            sampled.append(current_id)
        return sampled
class PeekyDecoder:
    """Decoder that feeds the encoder state ``h`` to the LSTM and Affine layers at every step."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the embedding, stateful LSTM and affine layers.

        vocab_size: rows of the embedding matrix and columns of the affine weight.
        wordvec_size: width of one embedding vector.
        hidden_size: LSTM hidden width. The LSTM input weight has
            ``hidden_size + wordvec_size`` rows and the affine weight has
            ``2 * hidden_size`` rows, to make room for the concatenated ``h``.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(H + D, 4 * H) / np.sqrt(H + D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        # Flat weight / gradient lists in layer order.
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        # Holds H between forward() and backward().
        self.cache = None

    def forward(self, xs, h):
        """Return scores for ``xs`` (N, T), concatenating ``h`` (N, H) at every step."""
        N, T = xs.shape
        N, H = h.shape

        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Copy h along the time axis so it can be joined to every step.
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        # Widen the LSTM input: h first, embedding second.
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        # Widen the Affine input the same way.
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore):
        """Backpropagate ``dscore`` and return the gradient for ``h``."""
        H = self.cache

        dout = self.affine.backward(dscore)
        # Undo the second concatenation: the first H columns belong to h.
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        # Undo the first concatenation.
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        # h was copied to every time step, so its gradient is summed over
        # time and added to the gradient of the initial LSTM state.
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids.

        h: hidden state received from the Encoder (reshaped to (1, 1, H), so
            a single sequence is assumed)
        start_id: first character id fed to the decoder
        sample_size: number of characters to generate
        """
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        # The LSTM and Affine weights are sized for inputs with h
        # concatenated in front, exactly as forward() does; without this the
        # layer inputs have the wrong width.
        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)

        for _ in range(sample_size):
            x = np.array(sample_id).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled
class PeekyDecoder:
    """Decoder that concatenates the encoder state onto the LSTM and Affine inputs at every step."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the embedding, LSTM and affine layers.

        vocab_size: rows of the embedding matrix and columns of the affine weight.
        wordvec_size: width of one embedding vector.
        hidden_size: LSTM hidden width. The LSTM input weight has
            ``hidden_size + wordvec_size`` rows and the affine weight has
            ``2 * hidden_size`` rows.
        """
        randn = np.random.randn
        gate_cols = 4 * hidden_size
        lstm_rows = hidden_size + wordvec_size
        affine_rows = hidden_size + hidden_size

        # Random draws stay in their original order.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (randn(lstm_rows, gate_cols) / np.sqrt(lstm_rows)).astype('f')
        lstm_Wh = (randn(hidden_size, gate_cols) / np.sqrt(hidden_size)).astype('f')
        lstm_b = np.zeros(gate_cols, dtype='f')
        affine_W = (randn(affine_rows, vocab_size) / np.sqrt(affine_rows)).astype('f')
        affine_b = np.zeros(vocab_size, dtype='f')

        self.embed = TimeEmbedding(embed_W)
        # NOTE(review): keyword is spelled `statefull` here; confirm it
        # matches the TimeLSTM signature used by this project.
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, statefull=True)
        self.affine = TimeAffine(affine_W, affine_b)

        # Flat weight / gradient lists in layer order.
        stack = (self.embed, self.lstm, self.affine)
        self.params = [p for layer in stack for p in layer.params]
        self.grads = [g for layer in stack for g in layer.grads]
        # Holds H between forward() and backward().
        self.cache = None

    def forward(self, xs, h):
        """Return scores for ``xs`` (N, T) with ``h`` (N, H) joined in at every step."""
        _, steps = xs.shape
        batch, width = h.shape

        self.lstm.set_state(h)
        embedded = self.embed.forward(xs)

        # Widen the TimeLSTM input: a per-step copy of h goes in front.
        hs = np.repeat(h, steps, axis=0).reshape(batch, steps, width)
        lstm_in = np.concatenate((hs, embedded), axis=2)  # (N, T, H + D)

        # Widen the TimeAffine input the same way.
        lstm_out = self.lstm.forward(lstm_in)
        affine_in = np.concatenate((hs, lstm_out), axis=2)

        score = self.affine.forward(affine_in)
        self.cache = width
        return score

    def backward(self, dscore):
        """Backpropagate ``dscore`` and return the gradient for ``h``."""
        width = self.cache

        daffine = self.affine.backward(dscore)
        dhs_affine = daffine[:, :, :width]
        dlstm_in = self.lstm.backward(daffine[:, :, width:])

        dhs_lstm = dlstm_in[:, :, :width]
        self.embed.backward(dlstm_in[:, :, width:])

        dhs = dhs_affine + dhs_lstm
        # Aggregate over the time direction and add the LSTM state gradient.
        return self.lstm.dh + np.sum(dhs, axis=1)

    def generate(self, h, start_id, sample_size):
        """Greedily generate ``sample_size`` ids starting from ``start_id``.

        ``h`` is reshaped to (1, 1, H) and joined to the input of the LSTM
        and Affine layers at every step.
        """
        self.lstm.set_state(h)
        peeky_h = h.reshape(1, 1, h.shape[1])

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array([current_id]).reshape(1, 1)
            lstm_in = np.concatenate((peeky_h, self.embed.forward(x)), axis=2)
            affine_in = np.concatenate((peeky_h, self.lstm.forward(lstm_in)), axis=2)
            score = self.affine.forward(affine_in)
            current_id = np.argmax(score.flatten())
            sampled.append(current_id)
        return sampled