def __init__(self, vocab_size=10000, wordvec_size=100, hidden_size=100):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    # Initialize weights
    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')   # Xavier initialization
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')   # Xavier initialization
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(H, V) / np.sqrt(H)).astype('f')      # Xavier initialization
    affine_b = np.zeros(V).astype('f')

    # Create layers
    self.layers = [
        TimeEmbedding(embed_W),
        TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True),
        TimeAffine(affine_W, affine_b)
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layer = self.layers[1]

    # Aggregate all weights and gradients
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
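With the layers collected in self.layers, the forward pass simply chains them in order. A minimal sketch of the matching predict/forward methods (assuming the Time* layers expose the usual forward interface used alongside this constructor):

def predict(self, xs):
    # embedding -> LSTM -> affine, applied in sequence
    for layer in self.layers:
        xs = layer.forward(xs)
    return xs

def forward(self, xs, ts):
    # scores for every time step, then averaged cross-entropy loss
    score = self.predict(xs)
    loss = self.loss_layer.forward(score, ts)
    return loss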
def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
    lstm_b = np.zeros(4 * H).astype('f')

    self.embed = TimeEmbedding(embed_W)
    # stateful=False: the task consists of many short sequences, so the
    # LSTM hidden state is re-initialized (to a zero vector) for each one
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=False)

    self.params = self.embed.params + self.lstm.params
    self.grads = self.embed.grads + self.lstm.grads
    self.hs = None
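Because stateful is False, every call starts from a fresh zero state, and only the hidden state at the final time step is handed onward. A sketch of the corresponding forward pass (keeping the full sequence of hidden states around for the backward pass, which is what the self.hs = None placeholder is for):

def forward(self, xs):
    xs = self.embed.forward(xs)
    hs = self.lstm.forward(xs)
    self.hs = hs             # keep all hidden states for backpropagation
    return hs[:, -1, :]      # pass only the last hidden state onward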
def __init__(self, vocab_size, wordvec_size, hidden_size):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')   # Xavier initialization
    lstm_Wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')   # Xavier initialization
    lstm_b = np.zeros(4 * H).astype('f')
    affine_W = (rn(H, V) / np.sqrt(H)).astype('f')      # Xavier initialization
    affine_b = np.zeros(V).astype('f')

    self.embed = TimeEmbedding(embed_W)
    # stateful=True: a hidden state seeded from outside is kept across calls
    self.lstm = TimeLSTM(lstm_Wx, lstm_Wh, lstm_b, stateful=True)
    self.affine = TimeAffine(affine_W, affine_b)

    self.params, self.grads = [], []
    for layer in (self.embed, self.lstm, self.affine):
        self.params += layer.params
        self.grads += layer.grads
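The stateful LSTM is what lets this class pick up from a hidden state produced elsewhere. A sketch of how the forward pass would typically seed it (assuming TimeLSTM exposes a set_state method, as stateful layers do elsewhere in this code):

def forward(self, xs, h):
    self.lstm.set_state(h)           # start from the externally supplied hidden state
    out = self.embed.forward(xs)
    out = self.lstm.forward(out)
    score = self.affine.forward(out)
    return score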
def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650,
             dropout_ratio=0.5):
    V, D, H = vocab_size, wordvec_size, hidden_size
    rn = np.random.randn

    # Initialize weights
    embed_W = (rn(V, D) / 100).astype('f')
    lstm_Wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')  # Xavier initialization
    lstm_Wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')  # Xavier initialization
    lstm_b1 = np.zeros(4 * H).astype('f')
    # the second LSTM's input is the first LSTM's H-dimensional hidden state
    lstm_Wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')  # Xavier initialization
    lstm_Wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')  # Xavier initialization
    lstm_b2 = np.zeros(4 * H).astype('f')
    affine_b = np.zeros(V).astype('f')

    # Three improvements: stacked LSTMs, dropout, and weight tying
    self.layers = [
        TimeEmbedding(embed_W),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
        TimeDropout(dropout_ratio),
        TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
        TimeDropout(dropout_ratio),
        TimeAffine(embed_W.T, affine_b)  # weight tying: reuse the embedding matrix
    ]
    self.loss_layer = TimeSoftmaxWithLoss()
    self.lstm_layers = [self.layers[2], self.layers[4]]
    self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

    # Aggregate all weights and gradients
    self.params, self.grads = [], []
    for layer in self.layers:
        self.params += layer.params
        self.grads += layer.grads
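Dropout must be active during training and disabled at evaluation time, which is why the dropout layers are collected separately in self.drop_layers. A minimal sketch of a matching predict method (assuming TimeDropout exposes a train_flg attribute, as the grouping above implies):

def predict(self, xs, train_flg=False):
    # switch dropout on for training, off for evaluation
    for layer in self.drop_layers:
        layer.train_flg = train_flg
    for layer in self.layers:
        xs = layer.forward(xs)
    return xs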