import numpy as np

# Embedding and NegativeSamplingLoss are assumed to be defined elsewhere in
# the accompanying layer code (see the Embedding sketch below).
class SkipGram:
    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        # Weights
        w_in = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')
        w_out = 0.01 * np.random.randn(vocab_size, hidden_size).astype('f')

        # Layers: one negative-sampling loss per context position
        self.embed_layer = Embedding(w_in)
        self.ns_loss_layers = [
            NegativeSamplingLoss(w_out, corpus)
            for _ in range(2 * window_size)
        ]

        # Collect all weights and gradients
        layers = [self.embed_layer] + self.ns_loss_layers
        self.params, self.grads = [], []
        for l in layers:
            self.params += l.params
            self.grads += l.grads

        # Distributed word representations
        self.word_vecs = w_in

    def forward(self, contexts, target):
        h = self.embed_layer.forward(target)
        loss = sum([
            l.forward(h, contexts[:, i])
            for i, l in enumerate(self.ns_loss_layers)
        ])
        return loss

    def backward(self, dl=1):
        dh = sum([l.backward(dl) for l in self.ns_loss_layers])
        self.embed_layer.backward(dh)
        return None
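# The numpy snippets in this collection assume an Embedding layer exposing
# params, grads, forward, and backward.  A minimal sketch in the common
# textbook style (details assumed; the Theano Encoder/Decoder below use a
# different Embedding constructed from sizes instead of a weight matrix):
import numpy as np

class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        return W[idx]  # row lookup, shape (N, D)

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        # np.add.at accumulates gradients for duplicate indices, which a
        # plain dW[self.idx] = dout would silently overwrite.
        np.add.at(dW, self.idx, dout)
        return None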
class Decoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(hidden_size, output_size,
                                           activation='tanh')
        self.softmax = TimeDistributed(output_size, vocab_size,
                                       activation='softmax')
        self.embedding = Embedding(vocab_size, embedding_size)
        self.layers = [self.lstm, self.lstm_output, self.softmax,
                       self.embedding]
        self.params = list(itertools.chain(
            *[layer.params for layer in self.layers
              if hasattr(layer, 'params')]))

    def forward(self, ec_H, ec_C, mask):
        (sens_size, batch_size) = T.shape(mask)

        def step(m, prev_Y, prev_H, prev_C):
            """Forward one time step of the decoder."""
            # LSTM forward time step
            (H, C) = self.lstm.step(prev_Y, m, prev_H, prev_C)
            # LSTM output
            O = self.lstm_output.forward(H)
            # Apply softmax to the LSTM output
            P = self.softmax.forward(O)
            # Make a prediction (argmax yields word ids, not one-hot vectors)
            one_hot_Y = T.argmax(P, axis=1)
            # Feed the predicted word's embedding to the next time step
            Y = self.embedding.forward(one_hot_Y)
            # FIXME: deal with differing sentence lengths?
            return (P, Y, H, C)

        results, updates = theano.scan(
            fn=step,
            sequences=[mask],
            outputs_info=[
                None,
                dict(initial=T.zeros((batch_size, self.embedding_size)),
                     taps=[-1]),
                dict(initial=ec_H, taps=[-1]),
                dict(initial=ec_C, taps=[-1])
            ])
        # return np.swapaxes(results[0], 0, 1)
        # Returns the softmax probabilities
        return results[0]
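# The control flow of theano.scan above can be opaque: the first slot in
# outputs_info (None) is emitted only, while the remaining slots (Y, H, C)
# are fed back into the next step.  A pure-Python sketch of that pattern
# (scan_like is a hypothetical helper, not Theano's API):
def scan_like(step, sequence, initial_states):
    states = list(initial_states)
    emitted = []
    for elem in sequence:
        out, *states = step(elem, *states)
        emitted.append(out)  # corresponds to results[0], i.e. P above
    return emitted, states

# Toy step: emit the running sum and also carry it as the single state.
outputs, final = scan_like(lambda m, acc: (acc + m, acc + m), [1, 2, 3], [0])
print(outputs)  # [1, 3, 6]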
def forward(self, xs: np.ndarray) -> np.ndarray:
    N, T = xs.shape
    V, D = self.W.shape
    out = np.empty((N, T, D), dtype=float)
    self.layers = []
    for t in range(T):
        layer = Embedding(self.W)
        out[:, t, :] = layer.forward(xs[:, t])
        self.layers.append(layer)
    return out
def forward(self, idxs):
    w, = self.params
    N, T = idxs.shape
    V, D = w.shape  # vocabulary size, embedding dimension
    self.layers = []
    ys = np.empty((N, T, D), dtype='f')
    for t in range(T):
        layer = Embedding(w)
        ys[:, t, :] = layer.forward(idxs[:, t])
        self.layers.append(layer)
    return ys
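# Both time-distributed forwards above compute the same values; numpy fancy
# indexing with a 2-D index array collapses the per-timestep loop into one
# lookup.  The loop form is kept so each per-step Embedding can cache its
# indices for the backward pass, but the equivalence is easy to check:
import numpy as np

np.random.seed(0)
W = np.random.randn(10, 4)             # (V, D)
xs = np.random.randint(0, 10, (2, 5))  # (N, T)

out_loop = np.empty((2, 5, 4))
for t in range(5):
    out_loop[:, t, :] = W[xs[:, t]]

assert np.allclose(out_loop, W[xs])    # a single fancy-indexed lookup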
class Encoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.embedding = Embedding(vocab_size, embedding_size)
        self.lstm = LSTM(embedding_size, hidden_size)
        self.layers = [self.embedding, self.lstm]
        self.params = list(itertools.chain(
            *[layer.params for layer in self.layers
              if hasattr(layer, 'params')]))

    def forward(self, batch, mask):
        # ``batch`` is a matrix whose rows are sentences of word ids,
        # e.g. x = [1, 4, 5, 2, 0]
        # ``emb`` is a list of embedding matrices,
        # emb[i].shape = (sens_size, embedding_size)
        emb = self.embedding.forward(batch)
        (H, C) = self.lstm.forward(emb, mask)
        # Return the hidden and cell state of the last time step
        return (H[-1], C[-1])
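# Returning (H[-1], C[-1]) above is safe for right-padded batches only if
# the LSTM applies the mask so padded steps leave the state untouched.
# A toy recurrence standing in for the real LSTM update (an assumption
# about how LSTM.forward treats ``mask``):
import numpy as np

def masked_step(h_prev, x_t, m_t):
    h_new = np.tanh(x_t + h_prev)           # stand-in for the LSTM update
    return m_t * h_new + (1 - m_t) * h_prev

h = np.zeros(1)
for x_t, m_t in zip([1.0, 2.0, 0.0], [1.0, 1.0, 0.0]):  # last step is padding
    h = masked_step(h, x_t, m_t)
print(h)  # state from step 2, unchanged by the padded step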
class EmbeddingDot:
    def __init__(self, w):
        self.embed = Embedding(w)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h, idx):
        w_idx = self.embed.forward(idx)
        s = np.sum(h * w_idx, axis=1)
        self.cache = (h, w_idx)
        return s

    def backward(self, ds):
        # Reshape to a column vector so the upstream gradient broadcasts
        # over the embedding dimension.
        ds = ds.reshape(ds.shape[0], 1)
        h, w_idx = self.cache
        dw_idx = ds * h
        self.embed.backward(dw_idx)
        dh = ds * w_idx
        return dh
from typing import List

class EmbeddingDot:
    def __init__(self, W: np.ndarray) -> None:
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h: np.ndarray, idx: List[int]) -> np.ndarray:
        target_W = self.embed.forward(idx)
        # Row-wise dot product of hidden vectors and target embeddings
        out = np.sum(target_W * h, axis=1)
        self.cache = (h, target_W)
        return out

    def backward(self, dout: np.ndarray) -> np.ndarray:
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)
        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh
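# A toy run of EmbeddingDot (either variant above), reusing the minimal
# Embedding sketch from earlier; the shapes here are arbitrary:
import numpy as np

np.random.seed(0)
W = np.random.randn(7, 3).astype('f')   # vocabulary of 7 words, 3-dim vectors
layer = EmbeddingDot(W)

h = np.random.randn(2, 3).astype('f')   # hidden vectors for a batch of 2
idx = [1, 4]                            # one target word id per sample
score = layer.forward(h, idx)           # shape (2,): dot(h[i], W[idx[i]])
dh = layer.backward(np.ones(2, dtype='f'))
print(score.shape, dh.shape)            # (2,) (2, 3)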
import numpy as np
import torch
import torch.nn.functional as F

# Embedding, FastUniLSTM, and masked_mean are assumed to be defined
# elsewhere in this codebase (see the masked_mean sketch below).
class LSTM_DQN(torch.nn.Module):
    model_name = 'lstm_dqn'

    def __init__(self, model_config, word_vocab, generate_length=5,
                 enable_cuda=False):
        super(LSTM_DQN, self).__init__()
        self.model_config = model_config
        self.enable_cuda = enable_cuda
        self.word_vocab_size = len(word_vocab)
        self.id2word = word_vocab
        self.generate_length = generate_length
        self.read_config()
        self._def_layers()
        self.init_weights()
        # self.print_parameters()

    def print_parameters(self):
        amount = 0
        for p in self.parameters():
            amount += np.prod(p.size())
        print("total number of parameters: %s" % (amount))
        parameters = filter(lambda p: p.requires_grad, self.parameters())
        amount = 0
        for p in parameters:
            amount += np.prod(p.size())
        print("number of trainable parameters: %s" % (amount))

    def read_config(self):
        # Model config
        self.embedding_size = self.model_config['embedding_size']
        self.encoder_rnn_hidden_size = self.model_config['encoder_rnn_hidden_size']
        self.action_scorer_hidden_dim = self.model_config['action_scorer_hidden_dim']
        self.dropout_between_rnn_layers = self.model_config['dropout_between_rnn_layers']

    def _def_layers(self):
        # Word embeddings
        self.word_embedding = Embedding(
            embedding_size=self.embedding_size,
            vocab_size=self.word_vocab_size,
            enable_cuda=self.enable_cuda)
        # LSTM encoder
        self.encoder = FastUniLSTM(
            ninp=self.embedding_size,
            nhids=self.encoder_rnn_hidden_size,
            dropout_between_rnn_layers=self.dropout_between_rnn_layers)
        self.action_scorer_shared = torch.nn.Linear(
            self.encoder_rnn_hidden_size[-1], self.action_scorer_hidden_dim)
        action_scorers = []
        for _ in range(self.generate_length):
            action_scorers.append(
                torch.nn.Linear(self.action_scorer_hidden_dim,
                                self.word_vocab_size, bias=False))
        self.action_scorers = torch.nn.ModuleList(action_scorers)
        self.fake_recurrent_mask = None

    def init_weights(self):
        torch.nn.init.xavier_uniform_(self.action_scorer_shared.weight.data)
        for i in range(len(self.action_scorers)):
            torch.nn.init.xavier_uniform_(self.action_scorers[i].weight.data)
        self.action_scorer_shared.bias.data.fill_(0)

    def representation_generator(self, _input_words):
        embeddings, mask = self.word_embedding.forward(_input_words)      # batch x time x emb
        encoding_sequence, _, _ = self.encoder.forward(embeddings, mask)  # batch x time x h
        mean_encoding = masked_mean(encoding_sequence, mask)              # batch x h
        return mean_encoding

    def action_scorer(self, state_representation):
        hidden = self.action_scorer_shared.forward(state_representation)  # batch x hid
        hidden = F.relu(hidden)                                           # batch x hid
        action_ranks = []
        for i in range(len(self.action_scorers)):
            action_ranks.append(self.action_scorers[i].forward(hidden))   # batch x n_vocab
        return action_ranks
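# LSTM_DQN calls a masked_mean helper that is not shown here; presumably it
# averages the encoder outputs over the unpadded time steps.  A minimal
# sketch of that assumption:
import torch

def masked_mean(x, mask, eps=1e-8):
    # x: batch x time x h;  mask: batch x time, 1.0 for tokens, 0.0 for padding
    mask = mask.unsqueeze(-1)                # batch x time x 1
    summed = (x * mask).sum(dim=1)           # batch x h
    count = mask.sum(dim=1).clamp(min=eps)   # batch x 1
    return summed / count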