def __init__(self, vocab_size, wordvec_size, hidden_size, dropout_ratio=0.5):
        """Create the decoder's layers and collect their parameters.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine output weights.
            wordvec_size: Width of one embedding vector.
            hidden_size: LSTM hidden size; every LSTM weight has
                ``4 * hidden_size`` columns.
            dropout_ratio: Ratio handed to each of the three TimeDropout
                layers.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        gate_cols = 4 * H

        def scaled_randn(rows, cols, denom):
            # Standard-normal matrix divided by `denom`, stored as float32.
            return (np.random.randn(rows, cols) / denom).astype('f')

        # Random draws are made in a fixed order so that seeded runs stay
        # reproducible; the zero-filled biases consume no random numbers.
        embed_W = scaled_randn(V, D, 100)
        lstm_Wx = scaled_randn(D, gate_cols, np.sqrt(D))
        lstm_Wh = scaled_randn(H, gate_cols, np.sqrt(H))
        affine_W = scaled_randn(2 * H, V, np.sqrt(2 * H))
        lstm_Wx_1 = scaled_randn(D, gate_cols, np.sqrt(D))
        lstm_Wh_1 = scaled_randn(H, gate_cols, np.sqrt(H))
        # The second LSTM's bias is random (unscaled), unlike the first one.
        lstm_b1 = np.random.randn(gate_cols).astype('f')

        lstm_b = np.zeros(gate_cols).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.lstm_1 = TimeLSTM(lstm_Wx_1, lstm_Wh_1, lstm_b1, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        self.dropout = TimeDropout(dropout_ratio)
        self.dropout_1 = TimeDropout(dropout_ratio)
        self.dropout_2 = TimeDropout(dropout_ratio)

        # Flatten every layer's params/grads into two lists, in layer order.
        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.lstm_1, self.attention,
                      self.affine, self.dropout, self.dropout_1,
                      self.dropout_2):
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)
Exemple #2
0
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the embedding, LSTM, attention and affine layers.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine output weights.
            wordvec_size: Width of one embedding vector.
            hidden_size: LSTM hidden size; the affine layer takes
                ``2 * hidden_size`` input features.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        gate_cols = 4 * H

        def scaled_randn(rows, cols, denom):
            # Standard-normal matrix divided by `denom`, stored as float32.
            return (np.random.randn(rows, cols) / denom).astype('f')

        # Random draws are made in a fixed order so that seeded runs stay
        # reproducible; the zero-filled biases consume no random numbers.
        embed_W = scaled_randn(V, D, 100)
        lstm_Wx = scaled_randn(D, gate_cols, np.sqrt(D))
        lstm_Wh = scaled_randn(H, gate_cols, np.sqrt(H))
        affine_W = scaled_randn(2 * H, V, np.sqrt(2 * H))

        lstm_b = np.zeros(gate_cols).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        # Initialised to None here; nothing in this constructor fills it.
        self.attention_weights_at_generate = None

        # Flatten every layer's params/grads into two lists, in layer order.
        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.attention, self.affine):
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)
class AttentionDecoder:
    """Decoder made of Embedding -> LSTM -> Attention -> Affine layers.

    The affine layer receives the attention output concatenated with the
    LSTM output along the last axis.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and gather their parameters and gradients.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine output weights.
            wordvec_size: Width of one embedding vector.
            hidden_size: LSTM hidden size; the affine layer takes
                ``2 * hidden_size`` input features.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        gate_cols = 4 * H

        def scaled_randn(rows, cols, denom):
            # Standard-normal matrix divided by `denom`, stored as float32.
            return (np.random.randn(rows, cols) / denom).astype('f')

        # Random draws are made in a fixed order so that seeded runs stay
        # reproducible; the zero-filled biases consume no random numbers.
        embed_W = scaled_randn(V, D, 100)
        lstm_Wx = scaled_randn(D, gate_cols, np.sqrt(D))
        lstm_Wh = scaled_randn(H, gate_cols, np.sqrt(H))
        affine_W = scaled_randn(2 * H, V, np.sqrt(2 * H))

        lstm_b = np.zeros(gate_cols).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)

        # Flatten every layer's params/grads into two lists, in layer order.
        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.attention, self.affine):
            self.params.extend(layer.params)
            self.grads.extend(layer.grads)

    def forward(self, xs, enc_hs):
        """Run the decoder over ``xs`` and return the affine layer's output.

        Args:
            xs: Input passed to the embedding layer.
            enc_hs: Encoder outputs; ``enc_hs[:, -1]`` seeds the LSTM state
                and the full array is handed to the attention layer.
        """
        self.lstm.set_state(enc_hs[:, -1])

        embedded = self.embed.forward(xs)
        dec_hs = self.lstm.forward(embedded)
        context = self.attention.forward(enc_hs, dec_hs)
        # Attention output first, LSTM output second, along the last axis.
        merged = np.concatenate((context, dec_hs), axis=2)
        return self.affine.forward(merged)

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient for ``enc_hs``."""
        daffine = self.affine.backward(dscore)
        _, _, width = daffine.shape
        half = width // 2

        # Undo the concatenation done in forward(): first half belongs to the
        # attention output, second half to the LSTM output.
        dcontext = daffine[:, :, :half]
        ddec_direct = daffine[:, :, half:]

        denc_hs, ddec_attn = self.attention.backward(dcontext)
        # The LSTM output fed both the affine and the attention layer.
        dembedded = self.lstm.backward(ddec_direct + ddec_attn)

        # The LSTM's initial state was enc_hs[:, -1], so its gradient is
        # added onto that slice.
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(dembedded)

        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        """Greedily pick ``sample_size`` ids, starting from ``start_id``.

        Returns:
            A list of Python ints, one per generated step.
        """
        self.lstm.set_state(enc_hs[:, -1])

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            x = np.array(current_id).reshape((1, 1))

            embedded = self.embed.forward(x)
            dec_hs = self.lstm.forward(embedded)
            context = self.attention.forward(enc_hs, dec_hs)
            merged = np.concatenate((context, dec_hs), axis=2)
            score = self.affine.forward(merged)

            # Greedy choice: the id with the highest score feeds the next step.
            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled
Exemple #4
0
class AttentionDecoder:
    """Decoder made of Embedding -> LSTM -> Attention -> Affine layers.

    The affine layer receives the attention output concatenated with the
    LSTM output along the last axis.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create the layers and gather their parameters and gradients.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine output weights.
            wordvec_size: Width of one embedding vector.
            hidden_size: LSTM hidden size; the affine layer takes
                ``2 * hidden_size`` input features.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        # The TimeAttention layer is the addition compared with
        # ch07/seq2seq.py.
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        # TimeAttention sits between the TimeLSTM and the TimeAffine layers.
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs, enc_hs):
        """Run the decoder over ``xs`` and return the affine layer's output.

        Args:
            xs: Input passed to the embedding layer.
            enc_hs: Encoder outputs; ``enc_hs[:, -1]`` seeds the LSTM state
                and the full array is handed to the attention layer.
        """
        # Take the last row of the encoder outputs as the initial LSTM state.
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        # See Figure 8-21.
        # The Affine layer's input is the result of the two layers below, so
        # they are joined with concatenate:
        #   c:      output of TimeAttention
        #   dec_hs: output of TimeLSTM
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)

        return score

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient for ``enc_hs``."""
        # dout: TimeAffine -> TimeAttention and TimeLSTM
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        # dc:       TimeAffine -> TimeAttention
        # ddec_hs0: TimeAffine -> TimeLSTM
        # Inverse of np.concatenate((c, dec_hs), axis=2).
        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        # denc_hs:  TimeAttention -> Encoder
        # ddec_hs1: TimeAttention -> TimeLSTM
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        # TimeAffine and TimeAttention -> TimeLSTM.
        # Equivalent to self.lstm.backward(ddec_hs0 + ddec_hs1); kept as two
        # steps because that reads more clearly.
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)

        # Back-propagation for the last row of the encoder outputs; the
        # operation itself is an addition.
        dh = self.lstm.dh
        denc_hs[:, -1] += dh

        # TimeLSTM -> TimeEmbedding
        self.embed.backward(dout)

        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        """Greedily generate ``sample_size`` ids, starting from ``start_id``.

        The only difference from ch07/seq2seq.generate() is the added
        TimeAttention layer.

        Returns:
            A list of Python ints, one per generated step.
        """
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)  # line added for Attention
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            # Store plain ints rather than NumPy scalars so the result is an
            # ordinary list of ints.
            sampled.append(int(sample_id))

        return sampled
Exemple #5
0
class AttentionDecoder:
    """Attention decoder with three stacked LSTMs, dropout and a skip path.

    Pipeline (see ``_decode``): Embedding -> Dropout -> LSTM -> Attention,
    then two further LSTMs that each consume ``concat(context, hidden)``,
    and finally an Affine layer applied to
    ``concat(context, hidden) + concat(embedding, embedding)``.

    Because the 2H-wide concatenation is added element-wise to the 2D-wide
    doubled embedding, ``wordvec_size`` must equal ``hidden_size``.
    """

    def __init__(self,
                 vocab_size,
                 wordvec_size,
                 hidden_size,
                 dropout_ratio=0.35):
        """Create the layers and gather their parameters and gradients.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine output weights.
            wordvec_size: Width of one embedding vector; must equal
                ``hidden_size`` for the skip connection to line up.
            hidden_size: LSTM hidden size.
            dropout_ratio: Ratio handed to each of the four TimeDropout
                layers.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        # The second and third LSTMs consume concat(context, hidden), which
        # is 2H wide, so their input weights are sized and scaled by that
        # fan-in (same convention as affine_W above).
        lstm_Wx_1 = (rn(2 * H, 4 * H) / np.sqrt(2 * H)).astype('f')
        lstm_Wh_1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        # NOTE(review): unlike lstm_b, these two biases are drawn from a
        # standard normal instead of being zero-initialised - confirm intent.
        lstm_b1 = (rn(4 * H)).astype('f')
        lstm_Wx_2 = (rn(2 * H, 4 * H) / np.sqrt(2 * H)).astype('f')
        lstm_Wh_2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = (rn(4 * H)).astype('f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.lstm_1 = TimeLSTM(lstm_Wx_1, lstm_Wh_1, lstm_b1, stateful=True)
        self.lstm_2 = TimeLSTM(lstm_Wx_2, lstm_Wh_2, lstm_b2, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        self.dropout = TimeDropout(dropout_ratio)
        self.dropout_1 = TimeDropout(dropout_ratio)
        self.dropout_2 = TimeDropout(dropout_ratio)
        self.dropout_3 = TimeDropout(dropout_ratio)
        layers = [
            self.embed, self.lstm, self.lstm_1, self.lstm_2, self.attention,
            self.affine, self.dropout, self.dropout_1, self.dropout_2,
            self.dropout_3
        ]

        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def _set_state(self, h):
        """Seed all three LSTMs with the same hidden state ``h``."""
        for lstm in (self.lstm, self.lstm_1, self.lstm_2):
            lstm.set_state(h)

    def _set_train_flg(self, train_flg):
        """Set ``train_flg`` on all four dropout layers."""
        for layer in (self.dropout, self.dropout_1, self.dropout_2,
                      self.dropout_3):
            layer.train_flg = train_flg

    @staticmethod
    def _split_halves(dout):
        """Split a 3-D gradient into its first and second half on the last axis."""
        _, _, width = dout.shape
        half = width // 2
        return dout[:, :, :half], dout[:, :, half:]

    def _decode(self, xs, enc_hs):
        """Run the layer pipeline shared by ``forward`` and ``generate``."""
        out = self.embed.forward(xs)
        out_save = out  # pre-dropout embedding, reused by the skip path
        out = self.dropout.forward(out)
        dec_hs = self.lstm.forward(out)
        # Attention sees the first LSTM's output before dropout is applied.
        c = self.attention.forward(enc_hs, dec_hs)
        dec_hs = self.dropout_1.forward(dec_hs)

        rec_hs = self.lstm_1.forward(np.concatenate((c, dec_hs), axis=2))
        dec_hs = self.dropout_2.forward(rec_hs)

        rec_hs = self.lstm_2.forward(np.concatenate((c, dec_hs), axis=2))
        dec_hs = self.dropout_3.forward(rec_hs)

        out = np.concatenate((c, dec_hs), axis=2)
        # Skip connection: the embedding is duplicated to match the width of
        # the concatenation and added element-wise.
        out_skip = np.concatenate((out_save, out_save), axis=2)
        return self.affine.forward(out + out_skip)

    def forward(self, xs, enc_hs):
        """Training-time forward pass; returns the affine layer's output.

        Args:
            xs: Input passed to the embedding layer.
            enc_hs: Encoder outputs; ``enc_hs[:, -1]`` seeds every LSTM and
                the full array is handed to the attention layer.
        """
        self._set_state(enc_hs[:, -1])
        self._set_train_flg(True)
        return self._decode(xs, enc_hs)

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient for ``enc_hs``."""
        dout = self.affine.backward(dscore)
        dc, ddec_hs0 = self._split_halves(dout)
        # Gradient of the skip path: concat(out_save, out_save) routes BOTH
        # halves of the affine gradient back to the embedding output.
        dskip = dc + ddec_hs0

        dout = self.dropout_3.backward(ddec_hs0)
        dout = self.lstm_2.backward(dout)
        dc2, ddec_hs02 = self._split_halves(dout)

        dout = self.dropout_2.backward(ddec_hs02)
        dout = self.lstm_1.backward(dout)
        dc1, ddec_hs01 = self._split_halves(dout)

        dout1 = self.dropout_1.backward(ddec_hs01)
        # The attention output fed the affine layer and both upper LSTMs.
        denc_hs, ddec_hs1 = self.attention.backward(dc + dc1 + dc2)

        dout = self.lstm.backward(dout1 + ddec_hs1)
        dout = self.dropout.backward(dout)

        # forward() seeds all three LSTMs with enc_hs[:, -1], so each one's
        # initial-state gradient flows back into that slice.
        denc_hs[:, -1] += self.lstm.dh + self.lstm_1.dh + self.lstm_2.dh

        self.embed.backward(dout + dskip)
        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        """Greedily generate ``sample_size`` ids, starting from ``start_id``.

        Dropout is switched off for generation so the result is
        deterministic; ``forward`` switches it back on.

        Returns:
            A list of Python ints, one per generated step.
        """
        sampled = []
        sample_id = start_id
        self._set_state(enc_hs[:, -1])
        self._set_train_flg(False)

        for _ in range(sample_size):
            x = np.array([[sample_id]], dtype=np.int32)
            score = self._decode(x, enc_hs)
            sample_id = int(np.argmax(score.flatten()))
            sampled.append(sample_id)

        return sampled