Esempio n. 1
0
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the decoder's layers and gather their parameters/gradients.

        The weights are obtained from ``_init_parameter_attention``, called
        with ``(vocab_size, wordvec_size, hidden_size)``.
        """
        # Initialise the parameters of the decoder's internal layers.
        weights = _init_parameter_attention(
            vocab_size, wordvec_size, hidden_size)
        embed_W, lstm_Wx, lstm_Wh, lstm_b, affine_W, affine_b = weights

        # Layer definitions.
        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)

        # Collect every layer's parameters and gradients, in layer order.
        self.params, self.grads = [], []
        for sublayer in (self.embed, self.lstm, self.attention, self.affine):
            self.params.extend(sublayer.params)
            self.grads.extend(sublayer.grads)
Esempio n. 2
0
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised layers and collect their parameters.

        All weights and biases are stored with dtype ``'f'``. Biases start at
        zero; weights are standard-normal draws divided by a per-matrix
        constant (100 for the embedding, the square root of the input width
        for the others).
        """
        randn = np.random.randn
        gate_width = 4 * hidden_size
        joined_width = 2 * hidden_size

        # The random draws below stay in this order so the RNG stream is
        # consumed identically.
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (randn(wordvec_size, gate_width)
                   / np.sqrt(wordvec_size)).astype('f')
        lstm_Wh = (randn(hidden_size, gate_width)
                   / np.sqrt(hidden_size)).astype('f')
        lstm_b = np.zeros(gate_width, dtype='f')
        affine_W = (randn(joined_width, vocab_size)
                    / np.sqrt(joined_width)).astype('f')
        affine_b = np.zeros(vocab_size, dtype='f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)

        # Flatten the per-layer parameter and gradient lists, in layer order.
        self.params = []
        self.grads = []
        for sublayer in (self.embed, self.lstm, self.attention, self.affine):
            self.params.extend(sublayer.params)
            self.grads.extend(sublayer.grads)
Esempio n. 3
0
    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size) -> None:
        """Create randomly initialised layers and collect their parameters.

        Biases start at zero; weights are standard-normal draws divided by a
        per-matrix constant (100 for the embedding, the square root of the
        input width for the others). Everything is cast with
        ``astype(float)``.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # The random draws below stay in this order so the RNG stream is
        # consumed identically.
        embed_W = (rn(V, D) / 100).astype(float)
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype(float)
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b = np.zeros(4 * H).astype(float)
        affine_W = (rn(2 * H, V) / np.sqrt(2 * H)).astype(float)
        affine_b = np.zeros(V).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)

        # Flatten the per-layer parameter and gradient lists, in layer order.
        self.params, self.grads = [], []
        for sublayer in (self.embed, self.lstm, self.attention, self.affine):
            self.params.extend(sublayer.params)
            self.grads.extend(sublayer.grads)
Esempio n. 4
0
class AttentionDecoder:
    """Decoder chaining embedding, a stateful LSTM, attention and an affine
    output layer.

    ``params`` and ``grads`` hold the concatenated parameter and gradient
    lists of the four layers, in that order.
    """

    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create randomly initialised layers and collect their parameters.

        Biases start at zero; weights are standard-normal draws divided by a
        per-matrix constant (100 for the embedding, the square root of the
        input width for the others).
        """
        embed_W = (np.random.randn(vocab_size, wordvec_size) /
                   100).astype(float)
        lstm_Wx = (np.random.randn(wordvec_size, 4 * hidden_size) /
                   np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size) /
                   np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)
        # The affine layer consumes the attention context concatenated with
        # the decoder hidden state, hence the 2 * hidden_size input width.
        affine_W = (np.random.randn(2 * hidden_size, vocab_size) /
                    np.sqrt(2 * hidden_size)).astype(float)
        affine_b = np.zeros(vocab_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params = []
        self.grads = []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs: np.ndarray, enc_hs: np.ndarray) -> np.ndarray:
        """Return the affine layer's scores for the input ids ``xs``.

        The LSTM state is first set to ``enc_hs[:, -1]``. The attention
        output and the LSTM output are then concatenated along axis 2 and
        passed through the affine layer.
        """
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)

        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        """Back-propagate ``dscore`` and return the gradient w.r.t. ``enc_hs``.

        Mirrors ``forward`` in reverse order; the embedding layer's backward
        pass is run for its side effects only.
        """
        dout = self.affine.backward(dscore)
        # The three-way unpack also enforces that dout is 3-D.
        _, _, H2 = dout.shape
        H = H2 // 2

        # Undo the concatenation from forward(): the first half of the last
        # axis belongs to the attention output, the second to the LSTM output.
        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        # The LSTM output fed both the attention layer and the concatenation,
        # so its two gradient contributions are summed.
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        # enc_hs[:, -1] was used as the LSTM state in forward(), so the
        # gradient the LSTM exposes as ``dh`` is added to that slice.
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(dout)

        return denc_hs

    def generate(self, enc_hs: np.ndarray, start_id: int,
                 sample_size: int) -> List[int]:
        """Greedily decode ``sample_size`` ids, starting from ``start_id``.

        Each step feeds the previously chosen id as a ``(1, 1)`` array and
        picks the arg-max of the flattened scores. The returned list holds
        built-in ``int`` values, as the annotation promises.
        """
        sampled: List[int] = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            # np.argmax yields a NumPy integer scalar; convert it so callers
            # receive plain Python ints.
            sample_id = int(np.argmax(score.flatten()))
            sampled.append(sample_id)

        return sampled
Esempio n. 5
0
class AttentionDecoder:
    """Decoder chaining embedding, a stateful LSTM, attention and an affine
    output layer.

    ``params`` and ``grads`` hold the concatenated parameter and gradient
    lists of the four layers, in that order.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create randomly initialised layers (dtype ``'f'``) and collect
        their parameters and gradients."""

        def scaled_randn(rows, cols, divisor):
            # Standard-normal draw scaled down by ``divisor``.
            return (np.random.randn(rows, cols) / divisor).astype('f')

        gate_width = 4 * hidden_size
        joined_width = 2 * hidden_size

        # The random draws below stay in this order so the RNG stream is
        # consumed identically.
        embed_W = scaled_randn(vocab_size, wordvec_size, 100)
        lstm_Wx = scaled_randn(wordvec_size, gate_width,
                               np.sqrt(wordvec_size))
        lstm_Wh = scaled_randn(hidden_size, gate_width, np.sqrt(hidden_size))
        lstm_b = np.zeros(gate_width, dtype='f')
        affine_W = scaled_randn(joined_width, vocab_size,
                                np.sqrt(joined_width))
        affine_b = np.zeros(vocab_size, dtype='f')

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []
        for sublayer in (self.embed, self.lstm, self.attention, self.affine):
            self.params.extend(sublayer.params)
            self.grads.extend(sublayer.grads)

    def forward(self, xs, enc_hs):
        """Return the affine layer's scores for the input ids ``xs``.

        The LSTM state is set to ``enc_hs[:, -1]`` before decoding.
        """
        self.lstm.set_state(enc_hs[:, -1])

        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        context = self.attention.forward(enc_hs, hidden)
        # Context and LSTM output are joined along the last (third) axis.
        joined = np.concatenate((context, hidden), axis=2)
        return self.affine.forward(joined)

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient w.r.t. ``enc_hs``."""
        grad_joined = self.affine.backward(dscore)
        _, _, width = grad_joined.shape
        half = width // 2

        # Undo the concatenation from forward(): first half is the attention
        # output's gradient, second half the LSTM output's.
        grad_context = grad_joined[:, :, :half]
        grad_hidden_direct = grad_joined[:, :, half:]

        denc_hs, grad_hidden_attn = self.attention.backward(grad_context)
        # The LSTM output was used twice, so both gradient paths are summed.
        grad_embedded = self.lstm.backward(
            grad_hidden_direct + grad_hidden_attn)
        # enc_hs[:, -1] served as the LSTM state in forward().
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(grad_embedded)

        return denc_hs

    def generate(self, enc_hs, start_id, sample_size):
        """Greedily decode ``sample_size`` ids, starting from ``start_id``.

        Each step feeds the previously chosen id as a ``(1, 1)`` array and
        picks the arg-max of the flattened scores.
        """
        self.lstm.set_state(enc_hs[:, -1])

        token = start_id
        result = []
        for _ in range(sample_size):
            step_input = np.array([token]).reshape((1, 1))

            embedded = self.embed.forward(step_input)
            hidden = self.lstm.forward(embedded)
            context = self.attention.forward(enc_hs, hidden)
            joined = np.concatenate((context, hidden), axis=2)
            score = self.affine.forward(joined)

            token = np.argmax(score.flatten())
            result.append(token)

        return result
Esempio n. 6
0
class AttentionDecoder:
    """Decoder chaining embedding, a stateful LSTM, attention and an affine
    output layer, with batched greedy generation.

    ``params`` and ``grads`` hold the concatenated parameter and gradient
    lists of the four layers, in that order.
    """

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Build the layers from weights returned by
        ``_init_parameter_attention`` and collect their parameters."""
        V, D, H = vocab_size, wordvec_size, hidden_size

        # Initialise the parameters of the decoder's internal layers.
        embed_W, lstm_Wx, lstm_Wh, lstm_b, affine_W, affine_b = _init_parameter_attention(
            V, D, H)

        # Layer definitions.
        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        # Gather the parameters and gradients of every layer.
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def _compute_scores(self, xs, hs_enc):
        """Run embed -> LSTM -> attention -> concat -> affine on ``xs``."""
        out = self.embed.forward(xs)
        hs_dec = self.lstm.forward(out)
        cs = self.attention.forward(hs_enc, hs_dec)
        # Attention output and LSTM output are joined along axis 2.
        out = np.concatenate((cs, hs_dec), axis=2)
        return self.affine.forward(out)

    def _begin_generation(self, hs_enc, start_id):
        """Set the LSTM state and return the initial ``(N, 1)`` id array."""
        batch_size = hs_enc.shape[0]
        self.lstm.set_state(hs_enc[:, -1])
        return np.array(start_id).reshape(1, 1).repeat(batch_size, axis=0)

    def forward(self, xs, hs_enc):
        """Return the affine layer's scores for the input ids ``xs``.

        The LSTM state is set to ``hs_enc[:, -1]`` before decoding.
        """
        h = hs_enc[:, -1]
        self.lstm.set_state(h)

        return self._compute_scores(xs, hs_enc)

    def backward(self, dscore):
        """Back-propagate ``dscore`` and return the gradient w.r.t. ``hs_enc``."""
        dout = self.affine.backward(dscore)
        # The three-way unpack also enforces that dout is 3-D.
        _, _, H2 = dout.shape
        H = H2 // 2

        # Undo the concatenation from the forward pass: first half is the
        # attention output's gradient, second half the LSTM output's.
        dcs, dhs_dec0 = dout[:, :, :H], dout[:, :, H:]
        dhs_enc, dhs_dec1 = self.attention.backward(dcs)
        # The LSTM output was used twice, so both gradient paths are summed.
        dhs_dec = dhs_dec0 + dhs_dec1
        dout = self.lstm.backward(dhs_dec)
        # hs_enc[:, -1] served as the LSTM state in forward().
        dh = self.lstm.dh
        dhs_enc[:, -1] += dh
        self.embed.backward(dout)

        return dhs_enc

    def generate(self, hs_enc, start_id, sample_size):
        """Greedily decode ``sample_size`` ids for every row of ``hs_enc``.

        Returns an integer array with one row per batch entry and one column
        per generated step.
        """
        char_id = self._begin_generation(hs_enc, start_id)
        sampled = []

        for _ in range(sample_size):
            score = self._compute_scores(char_id, hs_enc)

            char_id = score.argmax(axis=2)
            sampled.append(char_id.flatten())

        # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is the
        # type it aliased.
        return np.array(sampled, dtype=int).T

    def generate_with_cf(self, hs_enc, start_id, sample_size):
        """Like ``generate``, but also return a per-row confidence value.

        The confidence is the mean, over the steps whose chosen id is
        non-zero, of the maximum ``TimeSoftmax`` output at that step.
        NOTE(review): id 0 is presumably a padding/blank token - confirm.
        Rows that never pick a non-zero id divide 0 by 0 and yield NaN.

        Returns:
            A tuple ``(ids, cf)``: the id array as returned by ``generate``
            and the confidence array of length ``hs_enc.shape[0]``.
        """
        N = hs_enc.shape[0]
        char_id = self._begin_generation(hs_enc, start_id)
        sampled = []

        # Accumulators used to compute the confidence.
        sum_cf = np.zeros(N)
        counts = np.zeros(N, dtype=int)
        softmax = TimeSoftmax()

        for _ in range(sample_size):
            score = self._compute_scores(char_id, hs_enc)
            # Convert the scores with the softmax layer to obtain the
            # confidence values.
            score = softmax.forward(score)

            char_id = score.argmax(axis=2)
            sampled.append(char_id.flatten())
            # Accumulate confidence only where the chosen id is non-zero.
            mask = (char_id.flatten() != 0)
            sum_cf[mask] += score.max(axis=2).flatten()[mask]
            counts += mask

        cf = sum_cf / counts  # mean

        return np.array(sampled, dtype=int).T, cf