from typing import List

import numpy as np

# TimeEmbedding, TimeLSTM, TimeAttention and TimeAffine are assumed to be importable
# time-distributed layers (e.g. the common layers accompanying "Deep Learning from
# Scratch 2"); they are not defined in this snippet.


class AttentionDecoder:
    def __init__(self, vocab_size: int, wordvec_size: int, hidden_size: int) -> None:
        embed_W = (np.random.randn(vocab_size, wordvec_size) / 100).astype(float)
        lstm_Wx = (np.random.randn(wordvec_size, 4 * hidden_size) / np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size) / np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)
        affine_W = (np.random.randn(2 * hidden_size, vocab_size) / np.sqrt(2 * hidden_size)).astype(float)
        affine_b = np.zeros(vocab_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params = []
        self.grads = []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs: np.ndarray, enc_hs: np.ndarray) -> np.ndarray:
        # Use the encoder's last hidden state to initialize the decoder LSTM.
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        # Context vectors from attention are concatenated with the LSTM outputs.
        c = self.attention.forward(enc_hs, dec_hs)
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        # Split the gradient of the concatenated (context, hidden) output.
        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(dout)
        return denc_hs

    def generate(self, enc_hs: np.ndarray, start_id: int, sample_size: int) -> List[int]:
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            # Greedy decoding: pick the highest-scoring word id at each step.
            sample_id = np.argmax(score.flatten())
            sampled.append(sample_id)

        return sampled
class AttentionDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, enc_hs):
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        _, _, H2 = dout.shape
        H = H2 // 2

        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        dh = self.lstm.dh
        denc_hs[:, -1] += dh
        self.embed.backward(dout)
        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            sampled.append(sample_id)

        return sampled
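# Usage sketch (not part of the original code): feeds dummy data through the decoder
# above to show the expected tensor shapes. All sizes below (vocab_size=59, N=4, etc.)
# are arbitrary, and the snippet assumes the Time* layer classes used by the decoder
# are importable in the current namespace (e.g. the book's common time layers).
import numpy as np

vocab_size, wordvec_size, hidden_size = 59, 16, 256
decoder = AttentionDecoder(vocab_size, wordvec_size, hidden_size)

N, T_enc, T_dec = 4, 29, 10                                   # batch size, encoder/decoder time steps
enc_hs = np.random.randn(N, T_enc, hidden_size).astype('f')   # stands in for the encoder's hidden states
xs = np.random.randint(0, vocab_size, (N, T_dec))             # decoder input word ids

score = decoder.forward(xs, enc_hs)    # -> (N, T_dec, vocab_size), unnormalized scores
dscore = np.ones_like(score) / score.size
denc_hs = decoder.backward(dscore)     # -> (N, T_enc, hidden_size), gradient flowing back to the encoder
sampled = decoder.generate(enc_hs[:1], start_id=0, sample_size=T_dec)  # greedy decoding for one sample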
class AttentionDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        # Initialize the parameters of the decoder's internal layers.
        # _init_parameter_attention is assumed to be defined elsewhere and to return
        # the same weight initialization as in the versions above.
        embed_W, lstm_Wx, lstm_Wh, lstm_b, affine_W, affine_b = _init_parameter_attention(
            V, D, H)

        # Define the layers
        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        # Collect the parameters and gradients
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, hs_enc):
        h = hs_enc[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        hs_dec = self.lstm.forward(out)
        cs = self.attention.forward(hs_enc, hs_dec)
        out = np.concatenate((cs, hs_dec), axis=2)
        score = self.affine.forward(out)
        return score

    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        dcs, dhs_dec0 = dout[:, :, :H], dout[:, :, H:]
        dhs_enc, dhs_dec1 = self.attention.backward(dcs)
        dhs_dec = dhs_dec0 + dhs_dec1
        dout = self.lstm.backward(dhs_dec)
        dh = self.lstm.dh
        dhs_enc[:, -1] += dh
        self.embed.backward(dout)
        return dhs_enc

    def generate(self, hs_enc, start_id, sample_size):
        N = hs_enc.shape[0]
        h = hs_enc[:, -1]
        self.lstm.set_state(h)

        sampled = []
        char_id = np.array(start_id).reshape(1, 1).repeat(N, axis=0)
        for _ in range(sample_size):
            x = char_id
            out = self.embed.forward(x)
            hs_dec = self.lstm.forward(out)
            cs = self.attention.forward(hs_enc, hs_dec)
            out = np.concatenate((cs, hs_dec), axis=2)
            score = self.affine.forward(out)

            char_id = score.argmax(axis=2)
            sampled.append(char_id.flatten())

        # np.int was removed from NumPy; use an explicit integer dtype instead.
        return np.array(sampled, dtype=np.int64).T

    def generate_with_cf(self, hs_enc, start_id, sample_size):
        N = hs_enc.shape[0]
        h = hs_enc[:, -1]
        self.lstm.set_state(h)

        sampled = []
        char_id = np.array(start_id).reshape(1, 1).repeat(N, axis=0)

        ### For collecting confidence scores ###
        sum_cf = np.zeros(N)
        counts = np.zeros(N, dtype=np.int64)
        # TimeSoftmax is assumed to be a time-distributed softmax layer defined elsewhere.
        softmax = TimeSoftmax()
        ##########

        for _ in range(sample_size):
            x = char_id
            out = self.embed.forward(x)
            hs_dec = self.lstm.forward(out)
            cs = self.attention.forward(hs_enc, hs_dec)
            out = np.concatenate((cs, hs_dec), axis=2)
            score = self.affine.forward(out)

            ### Convert scores to probabilities (confidence) ###
            score = softmax.forward(score)
            ##########

            char_id = score.argmax(axis=2)
            sampled.append(char_id.flatten())

            ### Accumulate confidence, skipping steps whose output id is 0 ###
            mask = (char_id.flatten() != 0)
            sum_cf[mask] += score.max(axis=2).flatten()[mask]
            counts += mask
            ##########

        cf = sum_cf / counts  # mean confidence per sample
        return np.array(sampled, dtype=np.int64).T, cf
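# Minimal sketch (not part of the original code) of the confidence aggregation that
# generate_with_cf performs: at every decoding step it takes the probability of the
# chosen token (the per-step softmax maximum), skips steps whose output id is 0
# (presumably a padding/blank character), and returns the per-sample mean over the
# remaining steps. The dummy probabilities and the zero-count guard are assumptions
# of this sketch, not taken from the original.
import numpy as np

N, vocab_size, sample_size = 2, 5, 3
rng = np.random.default_rng(0)

sum_cf = np.zeros(N)
counts = np.zeros(N, dtype=np.int64)

for _ in range(sample_size):
    # Dummy per-step softmax output with shape (N, 1, vocab_size); in the decoder this
    # would come from softmax.forward(self.affine.forward(out)).
    probs = rng.random((N, 1, vocab_size))
    probs /= probs.sum(axis=2, keepdims=True)

    char_id = probs.argmax(axis=2)                 # greedy choice, shape (N, 1)
    mask = (char_id.flatten() != 0)                # ignore steps that produced id 0
    sum_cf[mask] += probs.max(axis=2).flatten()[mask]
    counts += mask

cf = sum_cf / np.maximum(counts, 1)                # mean confidence per sample (guard against zero counts)
print(cf)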