Exemple #1
0
class Encoder:
    """Sequence encoder made of an embedding layer and a non-stateful LSTM.

    Attributes:
        embed: ``TimeEmbedding`` layer wrapping the embedding matrix.
        lstm: ``TimeLSTM`` layer created with ``stateful=False``.
        params: ``embed.params + lstm.params``.
        grads: ``embed.grads + lstm.grads``.
        hs: Output of the LSTM from the latest ``forward`` call
            (``None`` before the first call).
    """

    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create randomly initialised weights and the two layers.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix and
                number of rows of the LSTM input weight.
            hidden_size: LSTM hidden width; the LSTM weights and bias have
                ``4 * hidden_size`` columns/entries.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # The draw order (embedding, Wx, Wh) is kept fixed so that a seeded
        # global RNG produces the same weights as before.
        embed_W = (randn(V, D) / 100).astype(float)
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype(float)
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b = np.zeros(4 * H).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None

    def forward(self, xs: np.ndarray) -> np.ndarray:
        """Run embedding then LSTM and return the slice at the last index of axis 1.

        The full LSTM output is stored in ``self.hs`` for ``backward``.
        """
        embedded = self.embed.forward(xs)
        self.hs = self.lstm.forward(embedded)
        return self.hs[:, -1, :]

    def backward(self, dh: np.ndarray) -> None:
        """Back-propagate ``dh``, the gradient of the value returned by ``forward``.

        Returns whatever ``self.embed.backward`` returns (the original author
        notes this is ``None``).
        """
        # Only the last position along axis 1 was exposed by forward(), so
        # every other position receives a zero gradient.
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh

        dxs = self.lstm.backward(dhs)
        return self.embed.backward(dxs)
Exemple #2
0
    def __init__(self, vocab_size: int=10000, wordvec_size: int=100, hidden_size: int=100, dropout_ratio: float=1.0) -> None:
        """Assemble embedding, two dropout+LSTM stages, dropout and affine layers.

        The affine layer is given ``embed_W.T``, a transposed view of the
        embedding matrix, so both layers refer to the same weight data.

        Args:
            vocab_size: Number of rows of the embedding matrix and length of
                the affine bias.
            wordvec_size: Number of columns of the embedding matrix and number
                of rows of the first LSTM's input weight.
            hidden_size: LSTM hidden width; LSTM weights and biases have
                ``4 * hidden_size`` columns/entries.
            dropout_ratio: Passed unchanged to each ``TimeDropout`` layer.
        """
        # NOTE(review): the affine weight is embed_W.T (wordvec_size rows), so
        # this presumably only works when wordvec_size == hidden_size --
        # confirm against TimeAffine.
        # NOTE(review): the default dropout_ratio of 1.0 looks unusual; verify
        # how TimeDropout interprets it.
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype(float)
        lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype(float)
        lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b1 = np.zeros(4 * H).astype(float)
        # Assuming the layers are applied in list order, the second LSTM is
        # fed by the first LSTM (via dropout), so its input weight needs
        # hidden_size rows rather than wordvec_size rows.
        lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b2 = np.zeros(4 * H).astype(float)
        affine_b = np.zeros(V).astype(float)

        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b),  # weight shared with the embedding
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        # Direct handles to the LSTM and dropout layers inside self.layers.
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        self.params = []
        self.grads = []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads
Exemple #3
0
    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create weights and layers for a decoder with widened LSTM/affine inputs.

        The LSTM input weight has ``wordvec_size + hidden_size`` rows and the
        affine weight has ``2 * hidden_size`` rows. Every weight matrix is
        scaled by ``1 / sqrt(number_of_rows)``; the embedding is scaled by
        ``1 / 100``.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine weight.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden width.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype(float)
        # Scale by the real row count (D + H), matching how the other weight
        # matrices are scaled by the square root of their row count.
        lstm_Wx = (rn(D + H, 4 * H) / np.sqrt(D + H)).astype(float)
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b = np.zeros(4 * H).astype(float)
        # Same reasoning: the affine weight has 2 * H rows.
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype(float)
        affine_b = np.zeros(V).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []

        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None
Exemple #4
0
    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create randomly initialised weights, an embedding layer and an LSTM.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix and
                number of rows of the LSTM input weight.
            hidden_size: LSTM hidden width; the LSTM weights and bias have
                ``4 * hidden_size`` columns/entries.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        gate_cols = 4 * H
        randn = np.random.randn

        # Draw order (embedding, Wx, Wh) is unchanged so seeded runs match.
        embed_W = (randn(V, D) / 100).astype(float)
        lstm_Wx = (randn(D, gate_cols) / np.sqrt(D)).astype(float)
        lstm_Wh = (randn(H, gate_cols) / np.sqrt(H)).astype(float)
        lstm_b = np.zeros(gate_cols).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        # Filled in by forward(); nothing has been computed yet.
        self.hs = None
Exemple #5
0
class Decoder:
    """Decoder built from an embedding layer, a stateful LSTM and an affine layer.

    Attributes:
        embed: ``TimeEmbedding`` layer.
        lstm: ``TimeLSTM`` layer created with ``stateful=True``.
        affine: ``TimeAffine`` layer.
        params: Parameters of the three layers, in the order above.
        grads: Gradients of the three layers, in the order above.
    """

    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create randomly initialised weights and the three layers.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine weight.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden width and number of rows of the affine
                weight.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        randn = np.random.randn

        # Draw order (embedding, Wx, Wh, affine) is unchanged so seeded runs
        # produce the same weights.
        embed_W = (randn(V, D) / 100).astype(float)
        lstm_Wx = (randn(D, 4 * H) / np.sqrt(D)).astype(float)
        lstm_Wh = (randn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b = np.zeros(4 * H).astype(float)
        affine_W = (randn(H, V) / np.sqrt(H)).astype(float)
        affine_b = np.zeros(V).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs: np.ndarray, h: np.ndarray) -> np.ndarray:
        """Set the LSTM state to ``h`` and return the affine output for ``xs``."""
        self.lstm.set_state(h)

        embedded = self.embed.forward(xs)
        hidden = self.lstm.forward(embedded)
        return self.affine.forward(hidden)

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        """Back-propagate ``dscore`` through all layers and return ``self.lstm.dh``."""
        dhidden = self.affine.backward(dscore)
        dembedded = self.lstm.backward(dhidden)
        # Called for its effect on the embedding layer; the result is unused.
        self.embed.backward(dembedded)
        return self.lstm.dh

    def generate(self, h: np.ndarray, start_id: int,
                 sample_size: int) -> List[int]:
        """Produce ``sample_size`` ids by repeatedly taking the argmax of the score.

        Args:
            h: State handed to ``self.lstm.set_state`` before generation.
            start_id: Id fed to the layers at the first step.
            sample_size: Number of ids to produce.

        Returns:
            The generated ids as Python ``int`` values.
        """
        self.lstm.set_state(h)

        sampled = []
        current_id = start_id
        for _ in range(sample_size):
            # Each step feeds a single id as a 1x1 array.
            step_input = np.array(current_id).reshape((1, 1))
            score = self.affine.forward(
                self.lstm.forward(self.embed.forward(step_input)))

            current_id = np.argmax(score.flatten())
            sampled.append(int(current_id))

        return sampled
Exemple #6
0
class AttentionDecoder:
    """Decoder with embedding, stateful LSTM, attention and affine layers.

    The affine layer receives the attention output concatenated with the LSTM
    output along axis 2, which is why its weight has ``2 * hidden_size`` rows.

    Attributes:
        embed: ``TimeEmbedding`` layer.
        lstm: ``TimeLSTM`` layer created with ``stateful=True``.
        attention: ``TimeAttention`` layer.
        affine: ``TimeAffine`` layer.
        params: Parameters of the four layers, in the order above.
        grads: Gradients of the four layers, in the order above.
    """

    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create randomly initialised weights and the four layers.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine weight.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden width.
        """
        embed_W = (np.random.randn(vocab_size, wordvec_size) /
                   100).astype(float)
        lstm_Wx = (np.random.randn(wordvec_size, 4 * hidden_size) /
                   np.sqrt(wordvec_size)).astype(float)
        lstm_Wh = (np.random.randn(hidden_size, 4 * hidden_size) /
                   np.sqrt(hidden_size)).astype(float)
        lstm_b = np.zeros(4 * hidden_size).astype(float)
        affine_W = (np.random.randn(2 * hidden_size, vocab_size) /
                    np.sqrt(2 * hidden_size)).astype(float)
        affine_b = np.zeros(vocab_size).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.attention = TimeAttention()
        self.affine = TimeAffine(affine_W, affine_b)
        layers = [self.embed, self.lstm, self.attention, self.affine]

        self.params = []
        self.grads = []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, xs: np.ndarray, enc_hs: np.ndarray) -> np.ndarray:
        """Compute scores for ``xs`` while attending over ``enc_hs``.

        The LSTM state is initialised from ``enc_hs[:, -1]``.
        """
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        dec_hs = self.lstm.forward(out)
        c = self.attention.forward(enc_hs, dec_hs)
        # Attention output first, LSTM output second; backward() relies on
        # this ordering when it splits the gradient.
        out = np.concatenate((c, dec_hs), axis=2)
        score = self.affine.forward(out)

        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        """Back-propagate ``dscore`` and return the gradient for ``enc_hs``."""
        dout = self.affine.backward(dscore)
        N, T, H2 = dout.shape
        H = H2 // 2

        # Undo the concatenation done in forward(): first half belongs to the
        # attention output, second half to the LSTM output.
        dc, ddec_hs0 = dout[:, :, :H], dout[:, :, H:]
        denc_hs, ddec_hs1 = self.attention.backward(dc)
        # The LSTM output fed both the attention layer and the affine layer.
        ddec_hs = ddec_hs0 + ddec_hs1
        dout = self.lstm.backward(ddec_hs)
        # forward() used enc_hs[:, -1] as the initial LSTM state, so the
        # state gradient is added to that slice.
        denc_hs[:, -1] += self.lstm.dh
        self.embed.backward(dout)

        return denc_hs

    def generate(self, enc_hs: np.ndarray, start_id: int,
                 sample_size: int) -> List[int]:
        """Produce ``sample_size`` ids by repeatedly taking the argmax of the score.

        Args:
            enc_hs: Array passed to the attention layer at every step; its
                ``[:, -1]`` slice initialises the LSTM state.
            start_id: Id fed to the layers at the first step.
            sample_size: Number of ids to produce.

        Returns:
            The generated ids as Python ``int`` values.
        """
        sampled = []
        sample_id = start_id
        h = enc_hs[:, -1]
        self.lstm.set_state(h)

        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))

            out = self.embed.forward(x)
            dec_hs = self.lstm.forward(out)
            c = self.attention.forward(enc_hs, dec_hs)
            out = np.concatenate((c, dec_hs), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            # np.argmax yields a NumPy integer; convert so the result really
            # is a List[int], as Decoder.generate does.
            sampled.append(int(sample_id))

        return sampled
Exemple #7
0
class PeekyDecoder:
    """Decoder that feeds the initial state ``h`` to the LSTM and affine layers.

    ``h`` is concatenated in front of the embedding output (LSTM input) and in
    front of the LSTM output (affine input), both along axis 2.

    Attributes:
        embed: ``TimeEmbedding`` layer.
        lstm: ``TimeLSTM`` layer created with ``stateful=True``.
        affine: ``TimeAffine`` layer.
        params: Parameters of the three layers, in the order above.
        grads: Gradients of the three layers, in the order above.
        cache: Width ``H`` of ``h`` saved by ``forward`` for ``backward``.
    """

    def __init__(self, vocab_size: int, wordvec_size: int,
                 hidden_size: int) -> None:
        """Create randomly initialised weights and the three layers.

        Every weight matrix is scaled by ``1 / sqrt(number_of_rows)``; the
        embedding is scaled by ``1 / 100``.

        Args:
            vocab_size: Number of rows of the embedding matrix and number of
                columns of the affine weight.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden width.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype(float)
        # The LSTM input is h concatenated with the embedding, so the weight
        # has D + H rows and is scaled by that row count.
        lstm_Wx = (rn(D + H, 4 * H) / np.sqrt(D + H)).astype(float)
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype(float)
        lstm_b = np.zeros(4 * H).astype(float)
        # The affine input is h concatenated with the LSTM output: 2 * H rows.
        affine_W = (rn(H + H, V) / np.sqrt(H + H)).astype(float)
        affine_b = np.zeros(V).astype(float)

        self.embed = TimeEmbedding(embed_W)
        self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        self.affine = TimeAffine(affine_W, affine_b)

        self.params = []
        self.grads = []

        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None

    def forward(self, xs: np.ndarray, h: np.ndarray) -> np.ndarray:
        """Compute scores for ``xs`` of shape (N, T) given ``h`` of shape (N, H)."""
        N, T = xs.shape
        N, H = h.shape
        self.lstm.set_state(h)

        out = self.embed.forward(xs)
        # Repeat each row of h T times so that hs[n, t] == h[n].
        hs = np.repeat(h, T, axis=0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis=2)

        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis=2)

        score = self.affine.forward(out)
        self.cache = H
        return score

    def backward(self, dscore: np.ndarray) -> np.ndarray:
        """Back-propagate ``dscore`` and return the gradient with respect to ``h``."""
        H = self.cache

        dout = self.affine.backward(dscore)
        # The first H channels belong to the repeated h, the rest to the LSTM
        # output (mirrors the concatenation order in forward()).
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]

        dout = self.lstm.backward(dout)
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)
        dhs = dhs0 + dhs1
        # h was repeated along axis 1, so its gradient is summed over that
        # axis and added to the gradient of the initial LSTM state.
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h: np.ndarray, start_id: int,
                 sample_size: int) -> List[int]:
        """Produce ``sample_size`` ids by repeatedly taking the argmax of the score.

        Args:
            h: Initial LSTM state; it is reshaped to ``(1, 1, H)``, so it must
                hold exactly ``h.shape[1]`` values.
            start_id: Id fed to the layers at the first step.
            sample_size: Number of ids to produce.

        Returns:
            The generated ids as Python ``int`` values.
        """
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([sample_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            sample_id = np.argmax(score.flatten())
            # np.argmax yields a NumPy integer; convert so the result really
            # is a List[int], as Decoder.generate does.
            sampled.append(int(sample_id))

        return sampled