def __init__(self, vocab_size, wordvec_size, hidden_size, dropout_ratio=0.5):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    lstm_Wx_1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh_1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b1 = (rn(4 * H)).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.lstm_1 = TimeLSTM(lstm_Wx_1, lstm_Wh_1, lstm_b1, stateful=True)
    self.attention = TimeAttention()
    self.affine = TimeAffine(affine_W, affine_b)
    self.dropout = TimeDropout(dropout_ratio)
    self.dropout_1 = TimeDropout(dropout_ratio)
    self.dropout_2 = TimeDropout(dropout_ratio)

    layers = [self.embed, self.lstm, self.lstm_1, self.attention,
              self.affine, self.dropout, self.dropout_1, self.dropout_2]

    self.params, self.grads = [], []
    for layer in layers:
        self.params += layer.params
        self.grads += layer.grads
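The / np.sqrt(D) and / np.sqrt(H) factors above are Xavier-style initialization: each weight matrix is scaled by the square root of its fan-in. A minimal standalone sketch of the same scheme (the helper name is hypothetical):

import numpy as np

def init_lstm_params(in_dim, hidden_dim, dtype='f'):
    # Xavier-style init for one TimeLSTM: Wx is (in_dim, 4H), Wh is (H, 4H), b is (4H,)
    rn = np.random.randn
    Wx = (rn(in_dim, 4 * hidden_dim) / np.sqrt(in_dim)).astype(dtype)
    Wh = (rn(hidden_dim, 4 * hidden_dim) / np.sqrt(hidden_dim)).astype(dtype)
    b = np.zeros(4 * hidden_dim).astype(dtype)
    return Wx, Wh, b

Wx, Wh, b = init_lstm_params(in_dim=16, hidden_dim=128)
print(Wx.shape, Wh.shape, b.shape)  # (16, 512) (128, 512) (512,)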
def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
    affine_b = np.zeros(V).astype('f')

    self.embed = TimeEmbedding(embed_W)
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.attention = TimeAttention()
    self.affine = TimeAffine(affine_W, affine_b)
    layers = [self.embed, self.lstm, self.attention, self.affine]

    self.attention_weights_at_generate = None

    self.params, self.grads = [], []
    for layer in layers:
        self.params += layer.params
        self.grads += layer.grads
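This variant adds an attention_weights_at_generate slot, but the snippet does not show how it is filled. A hedged sketch of a generate() that records the weights, assuming (as in the book's TimeAttention) that each forward call repopulates self.attention.attention_weights with one array per decoder time step:

def generate(self, enc_hs, start_id, sample_size):
    sampled = []
    sample_id = start_id
    h = enc_hs[:, -1]
    self.lstm.set_state(h)
    self.attention_weights_at_generate = []

    for _ in range(sample_size):
        x = np.array(sample_id).reshape((1, 1))
        out = self.embed.forward(x)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        # T=1 here, so the list holds exactly one (N, T_enc) weight array
        self.attention_weights_at_generate.append(self.attention.attention_weights[-1])
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)
        sample_id = np.argmax(score.flatten())
        sampled.append(int(sample_id))

    return sampled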
class AttentionDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, enc_hs):
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        dh = self.lstm.dh
        denc_hs[:, -1] += dh
        self.embed.backward(dout)
        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array(sample_id).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled
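A quick smoke test of the class above with dummy data; the import paths follow the book's repository layout and are an assumption about your setup, and the sizes are only examples:

import numpy as np
from common.time_layers import TimeEmbedding, TimeLSTM, TimeAffine  # assumed layout
from ch08.attention_layer import TimeAttention                      # assumed layout

V, D, H = 59, 16, 256        # example sizes
N, T_enc, T_dec = 2, 29, 10  # batch, encoder length, decoder length

decoder = AttentionDecoder(V, D, H)
enc_hs = np.random.randn(N, T_enc, H).astype('f')  # stand-in for the Encoder output
xs = np.random.randint(0, V, (N, T_dec))

score = decoder.forward(xs, enc_hs)
print(score.shape)    # (2, 10, 59)

denc_hs = decoder.backward(np.ones_like(score))
print(denc_hs.shape)  # (2, 29, 256)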
class AttentionDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        # TimeAttention layer added -- the difference from ch07/seq2seq.py
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        # the TimeAttention layer sits between the TimeLSTM and TimeAffine layers
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, enc_hs):
        # take the last row of hs
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)  # see Figure 8-21
        # The Affine layer takes the outputs of the two layers below,
        # so the matrices are joined with concatenate:
        #   c: output of TimeAttention
        #   dec_hs: output of TimeLSTM
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        # dout: Affine -> TimeAttention and TimeLSTM
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        # dc: TimeAffine -> TimeAttention
        # ddec_hs0: TimeAffine -> TimeLSTM
        # the inverse of np.concatenate((c, dec_hs), axis=2)
        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        # denc_hs: TimeAttention -> Encoder
        # ddec_hs1: TimeAttention -> TimeLSTM
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        # TimeAffine and TimeAttention -> TimeLSTM
        # It may be clearer not to expand this:
        #   dout = self.lstm.backward(ddec_hs0 + ddec_hs1)
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        # backprop for the last row of hs; the actual operation is addition
        dh = self.lstm.dh
        denc_hs[:, -1] += dh
        # TimeLSTM -> TimeEmbedding
        self.embed.backward(dout)
        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        '''
        Sequence generation. The only difference from ch07/seq2seq.generate()
        is the added TimeAttention layer.
        '''
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)  # line added for attention
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(sample_id)

        return sampled
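The split in backward() is the exact inverse of the forward np.concatenate along axis 2, which a standalone check makes explicit (array names here are just for illustration):

import numpy as np

N, T, H = 2, 3, 4
c = np.random.randn(N, T, H)       # stand-in for the TimeAttention output
dec_hs = np.random.randn(N, T, H)  # stand-in for the TimeLSTM output

out = np.concatenate((c, dec_hs), axis=2)   # forward: (N, T, 2H)
dc, ddec_hs = out[:, :, :H], out[:, :, H:]  # backward: undo the concat

assert np.allclose(dc, c) and np.allclose(ddec_hs, dec_hs)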
class AttentionDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size, dropout_ratio=0.35):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        lstm_Wx_1 = (rn(H + D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh_1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = (rn(4 * H)).astype('f')

        lstm_Wx_2 = (rn(H + D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh_2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')  # np.sqrt(H): the fan-in of Wh is H
        lstm_b2 = (rn(4 * H)).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.lstm_1 = TimeLSTM(lstm_Wx_1, lstm_Wh_1, lstm_b1, stateful=True)
        self.lstm_2 = TimeLSTM(lstm_Wx_2, lstm_Wh_2, lstm_b2, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        self.dropout = TimeDropout(dropout_ratio)
        self.dropout_1 = TimeDropout(dropout_ratio)
        self.dropout_2 = TimeDropout(dropout_ratio)
        self.dropout_3 = TimeDropout(dropout_ratio)

        layers = [
            self.embed, self.lstm, self.lstm_1, self.lstm_2,
            self.attention, self.affine,
            self.dropout, self.dropout_1, self.dropout_2, self.dropout_3
        ]

        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, enc_hs):
        h = enc_hs[:, -1]
        self.lstm.set_state(h)
        self.lstm_1.set_state(h)
        self.lstm_2.set_state(h)

        # enable dropout during training
        self.dropout.train_flg = True
        self.dropout_1.train_flg = True
        self.dropout_2.train_flg = True
        self.dropout_3.train_flg = True

        out = self.embed.forward(xs)
        out_save = out  # kept for the skip connection into the Affine layer
        out = self.dropout.forward(out)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)

        dec_hs = self.dropout_1.forward(dec_hs)
        rec_hs = np.concatenate((c, dec_hs), axis=2)
        # rec_hs = (dec_hs + c) / 2
        rec_hs = self.lstm_1.forward(rec_hs)

        dec_hs = self.dropout_2.forward(rec_hs)
        rec_hs_1 = np.concatenate((c, dec_hs), axis=2)
        rec_hs = self.lstm_2.forward(rec_hs_1)
        dec_hs = self.dropout_3.forward(rec_hs)

        # skip connection: add the duplicated embedding output to the Affine input;
        # this requires D == H so that (N, T, 2D) matches (N, T, 2H)
        out_skip = np.concatenate((out_save, out_save), axis=2)
        out = np.concatenate((c, dec_hs), axis=2)
        out = out + out_skip
        score = self.affine.forward(out)

        return score

    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2
        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]

        dout = self.dropout_3.backward(ddec_hs0)
        dout = self.lstm_2.backward(dout)
        N, T, H2 = dout.shape
        H = H2 // 2
        dc2, ddec_hs02 = dout[:, :, :H], dout[:, :, H:]

        dout = self.dropout_2.backward(ddec_hs02)
        dout = self.lstm_1.backward(dout)
        N, T, H2 = dout.shape
        H = H2 // 2
        dc1, ddec_hs01 = dout[:, :, :H], dout[:, :, H:]

        # the context vector c fans out into all three concatenations,
        # so its gradients are summed before entering the attention layer
        dout2 = dc + dc1 + dc2
        dout1 = self.dropout_1.backward(ddec_hs01)
        denc_hs, ddec_hs1 = self.attention.backward(dout2)
        dout = dout1 + ddec_hs1
        dout = self.lstm.backward(dout)
        dout = self.dropout.backward(dout)

        # all three LSTMs were seeded with enc_hs[:, -1], so every dh accumulates there
        denc_hs[:, -1] += self.lstm.dh + self.lstm_1.dh + self.lstm_2.dh

        # skip-connection gradient: both halves of out_skip flow back to the
        # embedding output (dc for the first half, ddec_hs0 for the second)
        dout += dc + ddec_hs0
        self.embed.backward(dout)
        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)
        self.lstm_1.set_state(h)
        self.lstm_2.set_state(h)

        # disable dropout at inference time
        self.dropout.train_flg = False
        self.dropout_1.train_flg = False
        self.dropout_2.train_flg = False
        self.dropout_3.train_flg = False

        for _ in range(sample_size):
            x = np.array(sample_id, dtype=np.int32).reshape((1, 1))

            out = self.embed.forward(x)
            out_save = out
            out = self.dropout.forward(out)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)

            dec_hs = self.dropout_1.forward(dec_hs)
            rec_hs = np.concatenate((c, dec_hs), axis=2)
            rec_hs = self.lstm_1.forward(rec_hs)

            dec_hs = self.dropout_2.forward(rec_hs)
            rec_hs_1 = np.concatenate((c, dec_hs), axis=2)
            rec_hs = self.lstm_2.forward(rec_hs_1)
            dec_hs = self.dropout_3.forward(rec_hs)

            out = np.concatenate((c, dec_hs), axis=2)
            out_skip = np.concatenate((out_save, out_save), axis=2)
            out = out + out_skip
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))

        return sampled
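Because the skip connection adds the (N, T, 2D) duplicated embedding onto the (N, T, 2H) Affine input, this variant only runs when wordvec_size equals hidden_size. A minimal sketch under that constraint (sizes are examples, same assumed imports as the smoke test above):

V, D, H = 59, 256, 256  # the out_skip addition requires D == H
assert D == H

decoder = AttentionDecoder(V, D, H, dropout_ratio=0.35)
enc_hs = np.random.randn(2, 29, H).astype('f')
xs = np.random.randint(0, V, (2, 10))
score = decoder.forward(xs, enc_hs)
print(score.shape)  # (2, 10, 59)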