class Seq2Seq(BaseModel):
    """Encoder-decoder model with separate source and target vocabularies.

    The encoder output is handed to the decoder, and the decoder scores are
    turned into a loss by ``TimeSoftmaxWithLoss``.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, wordvec_size, hidden_size):
        """Create the encoder, the decoder and the loss layer.

        Args:
            src_vocab_size: Vocabulary size given to the ``Encoder``.
            tgt_vocab_size: Vocabulary size given to the ``Decoder``.
            wordvec_size: Word-vector size given to both sub-models.
            hidden_size: Hidden size given to both sub-models.
        """
        self.encoder = Encoder(src_vocab_size, wordvec_size, hidden_size)
        self.decoder = Decoder(tgt_vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()

        # Encoder entries come first, decoder entries follow.
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss for source batch ``xs`` and target batch ``ts``.

        The decoder is fed ``ts`` without its last column and is scored
        against ``ts`` without its first column.
        """
        decoder_input = ts[:, :-1]
        decoder_target = ts[:, 1:]

        context = self.encoder.forward(xs)
        scores = self.decoder.forward(decoder_input, context)
        return self.softmax.forward(scores, decoder_target)

    def backward(self, dout=1):
        """Back-propagate ``dout`` through loss, decoder and encoder, in that order.

        Returns:
            Whatever ``self.encoder.backward`` returns.
        """
        dscore = self.softmax.backward(dout)
        dcontext = self.decoder.backward(dscore)
        return self.encoder.backward(dcontext)

    def generate(self, xs, bos_id, eos_id):
        """Encode ``xs`` and let the decoder generate a sequence.

        ``bos_id`` is passed to the decoder as the start id and ``eos_id`` is
        forwarded unchanged as its third argument.
        """
        context = self.encoder.forward(xs)
        return self.decoder.generate(context, bos_id, eos_id)
class BetterRnnlm(BaseModel):
    """Language model with two LSTM layers, dropout layers and a shared
    embedding/output weight.

    Layer order: TimeEmbedding, TimeDropout, TimeLSTM, TimeDropout, TimeLSTM,
    TimeDropout, TimeAffine. The affine layer receives the transpose of the
    embedding matrix.
    """

    def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
        """Initialise the weights and assemble the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
            dropout_ratio: Ratio handed to every ``TimeDropout`` layer.
        """

        def scaled_randn(rows, cols, divisor):
            # Standard-normal samples divided by `divisor`, stored as float32.
            return (np.random.randn(rows, cols) / divisor).astype('f')

        gate_width = 4 * hidden_size

        # Weights (random draws happen in this exact order).
        embed_W = scaled_randn(vocab_size, wordvec_size, 100)
        lstm_Wx1 = scaled_randn(wordvec_size, gate_width, np.sqrt(wordvec_size))
        lstm_Wh1 = scaled_randn(hidden_size, gate_width, np.sqrt(hidden_size))
        lstm_b1 = np.zeros(gate_width).astype('f')
        lstm_Wx2 = scaled_randn(hidden_size, gate_width, np.sqrt(hidden_size))
        lstm_Wh2 = scaled_randn(hidden_size, gate_width, np.sqrt(hidden_size))
        lstm_b2 = np.zeros(gate_width).astype('f')
        affine_b = np.zeros(vocab_size).astype('f')

        # Layers, created in stack order.
        embedding = TimeEmbedding(embed_W)
        drop_in = TimeDropout(dropout_ratio)
        lstm_lower = TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True)
        drop_mid = TimeDropout(dropout_ratio)
        lstm_upper = TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True)
        drop_out = TimeDropout(dropout_ratio)
        affine = TimeAffine(embed_W.T, affine_b)

        self.layers = [embedding, drop_in, lstm_lower, drop_mid, lstm_upper, drop_out, affine]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [lstm_lower, lstm_upper]
        self.drop_layers = [drop_in, drop_mid, drop_out]

        # Flat lists of every layer's weights and gradients, in stack order.
        self.params = [param for layer in self.layers for param in layer.params]
        self.grads = [grad for layer in self.layers for grad in layer.grads]

    def predict(self, xs, train_flag=False):
        """Run ``xs`` through all layers and return the last layer's output.

        ``train_flag`` is written to ``train_flg`` on every dropout layer
        before the forward pass.
        """
        for dropout in self.drop_layers:
            dropout.train_flg = train_flag

        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def forward(self, xs, ts, train_flag=True):
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        return self.loss_layer.forward(self.predict(xs, train_flag), ts)

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then all layers
        in reverse order; return the gradient leaving the first layer."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Call ``reset_state`` on both LSTM layers."""
        for lstm in self.lstm_layers:
            lstm.reset_state()
class Seq2seq(BaseModel):
    """Encoder-decoder model in which both halves use the same vocabulary size."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Create encoder, decoder and loss layer from the same three sizes."""
        sizes = (vocab_size, wordvec_size, hidden_size)
        self.encoder = Encoder(*sizes)
        self.decoder = Decoder(*sizes)
        self.softmax = TimeSoftmaxWithLoss()

        # Encoder entries first, then decoder entries.
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads

    def forward(self, xs, ts):
        """Return the loss for input batch ``xs`` and target batch ``ts``.

        The decoder input is ``ts`` minus its last column; the loss target is
        ``ts`` minus its first column.
        """
        shifted_in, shifted_out = ts[:, :-1], ts[:, 1:]
        encoded = self.encoder.forward(xs)
        scores = self.decoder.forward(shifted_in, encoded)
        return self.softmax.forward(scores, shifted_out)

    def backward(self, dout=1):
        """Propagate ``dout`` back through loss, decoder and encoder.

        Returns:
            The value returned by ``self.encoder.backward``.
        """
        grad_scores = self.softmax.backward(dout)
        grad_encoded = self.decoder.backward(grad_scores)
        return self.encoder.backward(grad_encoded)

    def generate(self, xs, start_id, sample_size):
        """Encode ``xs`` and return what ``self.decoder.generate`` produces
        for ``start_id`` and ``sample_size``."""
        encoded = self.encoder.forward(xs)
        return self.decoder.generate(encoded, start_id, sample_size)
class Rnnlm:
    """Language model made of TimeEmbedding -> TimeLSTM -> TimeAffine layers,
    scored with ``TimeSoftmaxWithLoss``.

    ``self.params`` and ``self.grads`` are flat lists that hold the very same
    array objects the layers use, in layer order.
    """

    def __init__(self, vocab_size=10000, word_vec=100, hidden_size=100):
        """Initialise weights and build the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            word_vec: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
        """
        V, D, H = vocab_size, word_vec, hidden_size
        rn = np.random.randn

        # Weight initialisation
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer creation
        self.layers = [
            TimeEmbedding(embed_W),
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]

        # Collect every weight and gradient into flat lists
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs):
        """Run ``xs`` through all layers and return the last layer's output."""
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts):
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then every layer
        in reverse order; return the gradient leaving the first layer."""
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        """Call ``reset_state`` on the LSTM layer."""
        self.lstm_layer.reset_state()

    def save_params(self, file_name='RNNlm.pkl'):
        """Pickle ``self.params`` to ``file_name``."""
        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)

    def load_params(self, file_name='RNNlm.pkl'):
        """Load pickled parameters from ``file_name`` into the existing arrays.

        The values are copied *into* the arrays already stored in
        ``self.params``. Rebinding ``self.params`` to the unpickled list would
        leave the layers pointing at their old arrays, so the loaded weights
        would never be used.

        An empty file is reported with a message and leaves the model
        unchanged.

        Raises:
            OSError: If ``file_name`` cannot be stat'ed or opened.
            ValueError: If the file holds a different number of arrays than
                the model, or an array cannot be copied into its counterpart.
        """
        if os.path.getsize(file_name) > 0:
            # SECURITY: pickle can execute arbitrary code while loading;
            # only load parameter files from a trusted source.
            with open(file_name, 'rb') as f:
                loaded = pickle.load(f)
            if len(loaded) != len(self.params):
                raise ValueError(
                    'parameter count mismatch: file has %d, model has %d'
                    % (len(loaded), len(self.params)))
            for param, value in zip(self.params, loaded):
                param[...] = value
        else:
            print("========= this file is empty!! ==========")
class Rnnlm:
    """Language model made of TimeEmbedding -> TimeLSTM -> TimeAffine layers,
    scored with ``TimeSoftmaxWithLoss``.

    ``self.params`` and ``self.grads`` are flat lists that hold the very same
    array objects the layers use, in layer order.
    """

    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
        """Initialise weights and build the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # Initialise weights
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Create the layers
        self.layers = [
            TimeEmbedding(embed_W),
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]

        # Gather all weights and gradients
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs):
        """Run ``xs`` through all layers and return the last layer's output."""
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts):
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then every layer
        in reverse order; return the gradient leaving the first layer."""
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        """Call ``reset_state`` on the LSTM layer."""
        self.lstm_layer.reset_state()

    def save_params(self, file_name="Rnnlm.pkl"):
        """Pickle ``self.params`` to ``file_name``."""
        with open(file_name, "wb") as fout:
            pickle.dump(self.params, fout)

    def load_params(self, file_name="Rnnlm.pkl"):
        """Load pickled parameters from ``file_name`` into the existing arrays.

        The values are copied *into* the arrays already stored in
        ``self.params``. Rebinding ``self.params`` to the unpickled list would
        leave the layers pointing at their old arrays, so the loaded weights
        would never be used.

        Raises:
            OSError: If ``file_name`` cannot be opened.
            ValueError: If the file holds a different number of arrays than
                the model, or an array cannot be copied into its counterpart.
        """
        # SECURITY: pickle can execute arbitrary code while loading;
        # only load parameter files from a trusted source.
        with open(file_name, "rb") as fin:
            loaded = pickle.load(fin)
        if len(loaded) != len(self.params):
            raise ValueError(
                "parameter count mismatch: file has %d, model has %d"
                % (len(loaded), len(self.params)))
        for param, value in zip(self.params, loaded):
            param[...] = value
class Rnnlm(BaseModel):
    """Language model made of TimeEmbedding -> TimeLSTM -> TimeAffine layers,
    scored with ``TimeSoftmaxWithLoss``."""

    def __init__(self, vocab_size: int = 10000, wordvec_size: int = 100, hidden_size: int = 100) -> None:
        """Initialise weights and build the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        # Weight initialisation.
        # `.astype('f')` must be applied to the whole scaled matrix; applying
        # it only to the scalar `np.sqrt(...)` leaves the weights as float64.
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        # The recurrent weight multiplies the hidden state, so it has H rows
        # (not D); the two only coincide when wordvec_size == hidden_size.
        lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_W = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')

        # Layer creation
        self.layers = [
            TimeEmbedding(embed_W),
            TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
            TimeAffine(affine_W, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]

        # Collect all weights and gradients into flat lists
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs: List[float]) -> List[float]:
        """Run ``xs`` through all layers and return the last layer's output."""
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs: List[float], ts: List[float]) -> float:
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout: float = 1) -> float:
        """Back-propagate ``dout`` through the loss layer and then every layer
        in reverse order; return the gradient leaving the first layer."""
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self) -> None:
        """Call ``reset_state`` on the LSTM layer."""
        self.lstm_layer.reset_state()
class Rnnlm(BaseModel):
    """Language model: TimeEmbedding, a stateful TimeLSTM and TimeAffine,
    followed by a ``TimeSoftmaxWithLoss`` loss layer."""

    def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
        """Draw the initial weights and wire up the layers.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
        """
        randn = np.random.randn
        gate_width = 4 * hidden_size

        # Weights (random draws happen in this exact order).
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype('f')
        lstm_Wx = (randn(wordvec_size, gate_width) / np.sqrt(wordvec_size)).astype('f')
        lstm_Wh = (randn(hidden_size, gate_width) / np.sqrt(hidden_size)).astype('f')
        lstm_b = np.zeros(gate_width).astype('f')
        affine_W = (randn(hidden_size, vocab_size) / np.sqrt(hidden_size)).astype('f')
        affine_b = np.zeros(vocab_size).astype('f')

        # Layers, created in stack order.
        embedding = TimeEmbedding(embed_W)
        lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
        affine = TimeAffine(affine_W, affine_b)

        self.layers = [embedding, lstm, affine]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = lstm

        # Flat lists of every layer's weights and gradients.
        self.params = [param for layer in self.layers for param in layer.params]
        self.grads = [grad for layer in self.layers for grad in layer.grads]

    def predict(self, xs):
        """Run ``xs`` through all layers and return the last layer's output."""
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def forward(self, xs, ts):
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        return self.loss_layer.forward(self.predict(xs), ts)

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then all layers
        in reverse order; return the gradient leaving the first layer."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Call ``reset_state`` on the LSTM layer."""
        self.lstm_layer.reset_state()
class SimpleRnnlm:
    """Language model: TimeEmbedding, a stateful TimeRNN and TimeAffine,
    followed by a ``TimeSoftmaxWithLoss`` loss layer."""

    def __init__(self, vocabulary_size, wordvec_size, hidden_size):
        """Draw the initial weights and wire up the layers.

        Args:
            vocabulary_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: Width of the RNN weights and bias.
        """
        randn = np.random.randn

        # Weights (random draws happen in this exact order).
        embed_W = (randn(vocabulary_size, wordvec_size) / 100).astype('f')
        rnn_Wx = (randn(wordvec_size, hidden_size) / np.sqrt(wordvec_size)).astype('f')
        rnn_Wh = (randn(hidden_size, hidden_size) / np.sqrt(hidden_size)).astype('f')
        rnn_b = np.zeros(hidden_size).astype('f')
        affine_W = (randn(hidden_size, vocabulary_size) / np.sqrt(hidden_size)).astype('f')
        affine_b = np.zeros(vocabulary_size).astype('f')

        # Layers, created in stack order.
        embedding = TimeEmbedding(embed_W)
        recurrent = TimeRNN(rnn_Wx, rnn_Wh, rnn_b, stateful=True)
        affine = TimeAffine(affine_W, affine_b)

        self.layers = [embedding, recurrent, affine]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.rnn_layer = recurrent

        # Flat lists of every layer's weights and gradients.
        self.params = [param for layer in self.layers for param in layer.params]
        self.grads = [grad for layer in self.layers for grad in layer.grads]

    def forward(self, xs, ts):
        """Run ``xs`` through all layers and return the loss against ``ts``."""
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return self.loss_layer.forward(out, ts)

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then all layers
        in reverse order; return the gradient leaving the first layer."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Call ``reset_state`` on the RNN layer."""
        self.rnn_layer.reset_state()
class SimpleRnnlm:
    """Language model: TimeEmbedding, TimeRNN and TimeAffine, followed by a
    ``TimeSoftmaxWithLoss`` loss layer. All weights are stored as float32."""

    def __init__(self, vocab_size, wordvec_size, hidden_size):
        """Draw the initial weights and wire up the layers.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: Width of the RNN weights and bias.
        """
        randn = np.random.randn
        f32 = np.float32

        # Weights (random draws happen in this exact order). Everything except
        # the embedding uses Xavier initialisation: N(0, 1) / sqrt(fan_in).
        embed_W = (randn(vocab_size, wordvec_size) / 100).astype(f32)
        rnn_Wx = (randn(wordvec_size, hidden_size) / np.sqrt(wordvec_size)).astype(f32)
        rnn_Wh = (randn(hidden_size, hidden_size) / np.sqrt(hidden_size)).astype(f32)
        rnn_b = np.zeros(hidden_size).astype(f32)
        affine_W = (randn(hidden_size, vocab_size) / np.sqrt(hidden_size)).astype(f32)
        affine_b = np.zeros(vocab_size).astype(f32)

        # Time layers, created in stack order.
        embedding = TimeEmbedding(embed_W)
        # Original note: the hidden vector of the forward pass is carried over.
        # NOTE(review): the keyword is spelled `statefull` here — confirm it
        # matches the TimeRNN signature used by this file.
        recurrent = TimeRNN(rnn_Wx, rnn_Wh, rnn_b, statefull=True)
        affine = TimeAffine(affine_W, affine_b)

        self.layers = [embedding, recurrent, affine]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.rnn_layer = recurrent

        # Flat lists of every layer's weights and gradients.
        self.params = [param for layer in self.layers for param in layer.params]
        self.grads = [grad for layer in self.layers for grad in layer.grads]

    def forward(self, xs, ts):
        """Run ``xs`` through all layers and return the loss against ``ts``."""
        out = xs
        for layer in self.layers:
            out = layer.forward(out)
        return self.loss_layer.forward(out, ts)

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then all layers
        in reverse order; return the gradient leaving the first layer."""
        grad = self.loss_layer.backward(dout)
        for layer in self.layers[::-1]:
            grad = layer.backward(grad)
        return grad

    def reset_state(self):
        """Call ``reset_state`` on the RNN layer."""
        self.rnn_layer.reset_state()
class BetterRnnlm(BaseModel):
    """Improved RNN language model.

    Three changes over the plain model:
      * two stacked LSTM layers,
      * dropout applied between layers (in the depth direction),
      * weight sharing between the Embedding and Affine layers.
    """

    def __init__(self, vocab_size=10000, word_vec=650, hidden_size=650, dropout_ratio=0.5):
        """Initialise weights and build the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            word_vec: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
                The default used to be 0.5, which made every weight shape
                non-integer so that default construction always failed; it is
                now 650, matching ``word_vec``.
            dropout_ratio: Ratio handed to every ``TimeDropout`` layer.
        """
        V, D, H = vocab_size, word_vec, hidden_size
        rn = np.random.randn

        # Weight initialisation
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4 * H).astype('f')
        lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4 * H).astype('f')
        affine_b = np.zeros(V).astype('f')

        # The three improvements
        # NOTE(review): the affine layer reuses embed_W.T (shape (D, V)) on the
        # output of the second LSTM — presumably this requires
        # word_vec == hidden_size; confirm against TimeAffine.
        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)  # weight sharing
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        # Collect every weight and gradient into flat lists
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        """Run ``xs`` through all layers and return the last layer's output.

        ``train_flg`` is written to every dropout layer before the pass.
        """
        for layer in self.drop_layers:
            layer.train_flg = train_flg
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then every layer
        in reverse order; return the gradient leaving the first layer."""
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        """Call ``reset_state`` on both LSTM layers."""
        for layer in self.lstm_layers:
            layer.reset_state()
class BetterRnnlm(BaseModel):
    """Language model with two LSTM layers, dropout layers and a shared
    embedding/output weight. All weights are stored as float32."""

    def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
        """Initialise weights and build the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix.
            wordvec_size: Number of columns of the embedding matrix.
            hidden_size: LSTM hidden size; gate matrices have 4x this width.
            dropout_ratio: Ratio handed to every ``TimeDropout`` layer.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn

        embed_W = (rn(V, D) / 100).astype(np.float32)
        lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype(np.float32)
        lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype(np.float32)
        lstm_b1 = np.zeros(4 * H).astype(np.float32)
        # The second LSTM consumes the first LSTM's output, so its input
        # weight is sized and scaled by H (it was D, which is only the same
        # when wordvec_size == hidden_size).
        lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype(np.float32)
        lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype(np.float32)
        lstm_b2 = np.zeros(4 * H).astype(np.float32)
        affine_b = np.zeros(V).astype(np.float32)

        # Three improvements
        # 1) Stack LSTM layers
        # 2) Add Dropout layers (in the depth direction, between LSTM layers)
        # 3) Weight sharing between the Time Embedding and Time Affine layers
        # NOTE(review): the keyword is spelled `statefull` here — confirm it
        # matches the TimeLSTM signature used by this file.
        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, statefull=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, statefull=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)  # shares embed_W (V, D) as embed_W.T (D, V)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        # Collect weights and gradients
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        """Run ``xs`` through all layers and return the last layer's output.

        ``train_flg`` is written to every dropout layer before the pass.
        """
        for layer in self.drop_layers:
            layer.train_flg = train_flg
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        """Return the loss of the prediction for ``xs`` against ``ts``."""
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then every layer
        in reverse order; return the gradient leaving the first layer."""
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        """Call ``reset_state`` on both LSTM layers."""
        for layer in self.lstm_layers:
            layer.reset_state()
class Transformer(BaseModel):
    """Encoder-decoder Transformer assembled from project-local layers.

    One ``PositionalEmbedding`` is shared by the encoder and decoder inputs.
    The result is projected to vocabulary scores by a ``MatMul`` layer and
    scored with ``TimeSoftmaxWithLoss(ignore_label=-1)``.
    """

    def __init__(self, vocab_size, wordvec_size, head_size, num_heads, num_encoders=3, num_decoders=3):
        """Create the embedding, encoder/decoder stacks, projection and loss.

        Args:
            vocab_size: Rows of the embedding matrix and columns of the
                output projection.
            wordvec_size: Columns of the embedding matrix; passed to every
                encoder/decoder as ``wordvec_size``.
            head_size: Passed to every encoder/decoder as ``head_size``.
            num_heads: Passed to every encoder/decoder as ``num_heads``.
            num_encoders: Number of ``TransformerEncoder`` blocks.
            num_decoders: Number of ``TransformerDecoder`` blocks.
        """
        S, D, H = vocab_size, wordvec_size, head_size
        rn = np.random.randn

        self.num_encoders = num_encoders
        self.num_decoders = num_decoders
        self.params, self.grads = [], []

        # Embedding used for both the encoder input and the decoder input.
        embed_W1 = (rn(S, D) / 100).astype('f')
        self.e_embed = PositionalEmbedding(embed_W1)
        self.params += self.e_embed.params
        self.grads += self.e_embed.grads

        self.encoders, self.decoders = [], []
        for _ in range(num_encoders):
            te = TransformerEncoder(wordvec_size=D, head_size=H, num_heads=num_heads)
            self.encoders.append(te)
            self.params += te.params
            self.grads += te.grads

        for _ in range(num_decoders):
            td = TransformerDecoder(wordvec_size=D, head_size=H, num_heads=num_heads)
            self.decoders.append(td)
            self.params += td.params
            self.grads += td.grads

        # For convenience the output projection weight lives in its own layer.
        self.linear = MatMul((rn(D, S) / np.sqrt(D)).astype('f'))
        self.params += self.linear.params
        self.grads += self.linear.grads

        # TimeSoftmaxWithLoss also has params and grads, but they are unused
        # and therefore not registered.
        self.softmax = TimeSoftmaxWithLoss(ignore_label=-1)

    def forward(self, xs, ts):
        """Return the loss for encoder input ``xs`` and decoder input ``ts``.

        The decoders are applied as a stack: each one receives the previous
        decoder's output together with the final encoder output. Previously
        every decoder was fed the same embedded ``ts`` and only the last
        output was kept, which did not match ``backward``, where the gradient
        is chained through all decoders.
        """
        # xs -> (N, T); embedded outputs -> (N, T, D)
        eout = self.e_embed.forward(xs)
        dout = self.e_embed.forward(ts)
        # NOTE(review): the same embedding layer runs forward twice here but
        # `backward` calls it once — check how PositionalEmbedding caches
        # its input.
        N, T, D = eout.shape

        for encoder in self.encoders:
            eout = encoder.forward(eout)

        for decoder in self.decoders:
            dout = decoder.forward(dout, eout)

        hidden = dout.reshape(N * T, D)
        # score -> (N*T, S)
        score = self.linear.forward(hidden)
        _, S = score.shape

        # Mind the order: score is the projected 2-D matrix, xs is the 2-D id
        # matrix from before embedding.
        # NOTE(review): the loss target is `xs`, not `ts` — confirm intended.
        score = score.reshape(N, T, S)
        loss = self.softmax.forward(score, xs)
        return loss

    def backward(self, dout=1):
        """Back-propagate through loss, projection, decoders, encoders and
        embedding. Returns ``None``.

        Each decoder's ``backward`` result is unpacked as a pair: the second
        item is passed on to the decoder below, and the first item of the
        bottom decoder is passed into the encoder stack.
        """
        # dout -> (N, T, S)
        dout = self.softmax.backward(dout)
        N, T, S = dout.shape
        dout = dout.reshape(N * T, S)

        # dout -> (N*T, S) / self.linear.W -> (D, S)
        dout = self.linear.backward(dout)

        # dout -> (N*T, D)
        _, D = dout.shape
        dout = dout.reshape(N, T, D)

        # NOTE(review): the first item returned by every decoder except the
        # bottom one is discarded; if it is the gradient w.r.t. the encoder
        # output, the contributions should probably be summed — confirm.
        for decoder in reversed(self.decoders[1:]):
            _, dout = decoder.backward(dout)
        ddout, dout = self.decoders[0].backward(dout)

        for encoder in reversed(self.encoders):
            ddout = encoder.backward(ddout)

        self.e_embed.backward(ddout)

    def _greedy_tokens(self, out):
        """Project a 3-D ``out`` to scores and return the flattened arg-max ids."""
        N, T, D = out.shape
        score = self.linear.forward(out.reshape(N * T, D))
        return np.argmax(score, axis=-1).flatten()

    def generate(self, xs, type='GPT'):
        """Return arg-max token ids for ``xs`` using one half of the model.

        Args:
            xs: Ids handed to the shared embedding.
            type: ``'GPT'`` runs only the decoder stack, ``'BERT'`` runs only
                the encoder stack. Any other value prints a message and
                returns an empty list.
        """
        sampled = []

        if type == 'GPT':
            # 'GPT' uses only the decoders of the transformer.
            out = self.e_embed.forward(xs)
            # NOTE(review): no batch axis is added on this path, yet the
            # output is unpacked as 3-D below — presumably the decoders'
            # `generate` returns (1, T, D); confirm.
            for decoder in self.decoders:
                out = decoder.generate(out)
            sampled = self._greedy_tokens(out)

        elif type == 'BERT':
            # 'BERT' uses only the encoders of the transformer. Masking is not
            # implemented yet and a segment embedding would be needed besides
            # the positional one, so this path is not meaningful at present;
            # use 'GPT' instead.
            out = self.e_embed.forward(xs)
            out = out[np.newaxis, :]  # prepend a batch axis
            for encoder in self.encoders:
                out = encoder.generate(out)
            # Reuse the decoder-side output projection as is.
            sampled = self._greedy_tokens(out)

        else:
            print('invalid generate type')

        return sampled