def test_lstm():
    from layers import LSTM

    lstm = LSTM(input_size, layer_size)
    lstm.set_state(batch_size)

    x = T.tensor3()
    f = theano.function([x], lstm(x), updates=lstm.updates)

    X = np.ones((batch_size, time_steps, input_size), dtype=np.float32)
    assert f(X).shape == (batch_size, time_steps, layer_size)

def __init__(self, vocab_size, embedding_size, hidden_size):
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size

    self.embedding = Embedding(vocab_size, embedding_size)
    self.lstm = LSTM(embedding_size, hidden_size)

    self.layers = [self.embedding, self.lstm]
    self.params = list(
        itertools.chain(*[
            layer.params for layer in self.layers
            if hasattr(layer, 'params')
        ]))

class Decoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.lstm = LSTM(embedding_size, hidden_size)
        self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh')
        self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax')
        self.embedding = Embedding(vocab_size, embedding_size)

        self.layers = [self.lstm, self.lstm_output, self.softmax, self.embedding]
        self.params = list(
            itertools.chain(*[
                layer.params for layer in self.layers
                if hasattr(layer, 'params')
            ]))

    def forward(self, ec_H, ec_C, mask):
        (sens_size, batch_size) = T.shape(mask)

        def step(m, prev_Y, prev_H, prev_C):
            """Forward one time step of the decoder."""
            # LSTM forward time step
            (H, C) = self.lstm.step(prev_Y, m, prev_H, prev_C)
            # LSTM output
            O = self.lstm_output.forward(H)
            # Apply softmax to the LSTM output
            P = self.softmax.forward(O)
            # Make a prediction
            one_hot_Y = T.argmax(P, axis=1)
            # Feed the output to the next time step
            Y = self.embedding.forward(one_hot_Y)
            # FIXME: deal with differing sequence lengths?
            return (P, Y, H, C)

        results, updates = theano.scan(
            fn=step,
            sequences=[mask],
            outputs_info=[
                None,
                dict(initial=T.zeros((batch_size, self.embedding_size)), taps=[-1]),
                dict(initial=ec_H, taps=[-1]),
                dict(initial=ec_C, taps=[-1])
            ])

        # return np.swapaxes(results[0], 0, 1)
        # Returns the softmax probabilities.
        return results[0]

def build_test_model(self, data):
    rng = np.random.RandomState(3435)
    lstm_params, hidden_params, hidden_relu_params, full_connect_params = self.load_trained_params()
    data_x, data_y, maxlen = data
    test_len = len(data_x)
    n_test_batches = test_len // self.batch_size

    x = T.matrix('x')
    y = T.ivector('y')
    index = T.lscalar()
    Words = theano.shared(value=self.word_vectors, name="Words", borrow=True)
    input_width = self.hidden_sizes[0]
    layer0_input = T.cast(Words[T.cast(x.flatten(), dtype="int32")],
                          dtype=floatX).reshape((self.batch_size, maxlen, input_width))

    lstm = LSTM(dim=input_width, batch_size=self.batch_size,
                number_step=maxlen, params=lstm_params)
    layer0_input = lstm.feed_foward(layer0_input)
    lstm.mean_pooling_input(layer0_input)

    hidden_sizes = [self.hidden_sizes[0], self.hidden_sizes[0]]
    hidden_layer = HiddenLayer(rng, hidden_sizes=hidden_sizes,
                               input_vectors=lstm.output,
                               activation=utils.Tanh,
                               name="Hidden_Tanh",
                               W=hidden_params[0], b=hidden_params[1])
    hidden_layer.predict()
    hidden_layer_relu = HiddenLayer(rng, hidden_sizes=hidden_sizes,
                                    input_vectors=hidden_layer.output,
                                    W=hidden_relu_params[0], b=hidden_relu_params[1])
    hidden_layer_relu.predict()
    # hidden_layer_dropout = HiddenLayerDropout(rng, hidden_sizes=self.hidden_sizes[:2],
    #                                           input_vectors=lstm.output,
    #                                           W=hidden_layer.W, b=hidden_layer.b)
    full_connect = FullConnectLayer(rng,
                                    layers_size=[self.hidden_sizes[0], self.hidden_sizes[-1]],
                                    input_vector=hidden_layer_relu.output,
                                    W=full_connect_params[0], b=full_connect_params[1])
    full_connect.predict()

    test_data_x = theano.shared(np.asarray(data_x, dtype=floatX), borrow=True)
    test_data_y = theano.shared(np.asarray(data_y, dtype='int32'), borrow=True)

    errors = 0.
    if test_len == 1:
        test_model = theano.function(
            [index],
            outputs=full_connect.get_predict(),
            on_unused_input='ignore',
            givens={
                x: test_data_x[index * self.batch_size:(index + 1) * self.batch_size],
                y: test_data_y[index * self.batch_size:(index + 1) * self.batch_size]
            })
        index = 0
        avg_errors = test_model(index)
    else:
        test_model = theano.function(
            [index],
            outputs=full_connect.errors(y),
            givens={
                x: test_data_x[index * self.batch_size:(index + 1) * self.batch_size],
                y: test_data_y[index * self.batch_size:(index + 1) * self.batch_size]
            })
        for i in xrange(n_test_batches):
            errors += test_model(i)
        avg_errors = errors / n_test_batches
    return avg_errors

def __init__(self, dmemory, daddress, nstates, dinput, doutput):
    self.layers = {}
    self.layers['INPUT'] = Dense(dinput, dmemory)
    self.layers['PREVIOUS_READ'] = Dense(dmemory, dmemory)
    self.layers['CONTROL_KEY'] = LSTM(dmemory + dmemory, nstates)
    self.layers['OUTPUT'] = Dense(doutput, doutput)

    self.daddress = daddress
    self.dmemory = dmemory
    self.doutput = doutput

def __init__(self, vocabulary_size: int, hidden_size: int, n_layers: int,
             embedding_size: int, dropout: float):
    super().__init__()
    self.vocabulary_size = vocabulary_size
    self.dropout = dropout
    self.embedding = torch.nn.Embedding(vocabulary_size + 1, embedding_size)
    self.lstm = LSTM(embedding_size, hidden_size, n_layers, dropout=dropout)

def run_lstm_alt(self, context_out, question_pool, context_len, is_train):
    # tile pooled question rep and concat with context
    q_rep = tf.expand_dims(question_pool, 1)  # (batch_size, 1, D)
    context_shape = tf.shape(context_out)[1]
    q_rep = tf.tile(q_rep, [1, context_shape, 1])
    q_c_rep = tf.concat([context_out, q_rep], axis=-1)

    with tf.variable_scope('lstm_') as scope:
        lstm_out_fw = LSTM(q_c_rep, context_len, self.hidden_units,
                           tf.cond(is_train, lambda: self.output_keep_prob, lambda: 1.0),
                           tf.cond(is_train, lambda: self.input_keep_prob, lambda: 1.0),
                           tf.cond(is_train, lambda: self.state_keep_prob, lambda: 1.0),
                           use_last=False, seed=self.seed, reuse=False)
        q_c_rep_rev = _reverse(q_c_rep, context_len, 1, 0)
        lstm_out_rev = LSTM(q_c_rep_rev, context_len, self.hidden_units,
                            tf.cond(is_train, lambda: self.output_keep_prob, lambda: 1.0),
                            tf.cond(is_train, lambda: self.input_keep_prob, lambda: 1.0),
                            tf.cond(is_train, lambda: self.state_keep_prob, lambda: 1.0),
                            use_last=False, seed=self.seed, reuse=True)
        lstm_out_bw = _reverse(lstm_out_rev, context_len, 1, 0)
        lstm_out = tf.concat([lstm_out_fw, lstm_out_bw], 2, name='lstm_out')
    return lstm_out

def __init__(self, vocab_size, embedding_size, hidden_size, output_size):
    self.vocab_size = vocab_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = output_size

    self.lstm = LSTM(embedding_size, hidden_size)
    self.lstm_output = TimeDistributed(hidden_size, output_size, activation='tanh')
    self.softmax = TimeDistributed(output_size, vocab_size, activation='softmax')
    self.embedding = Embedding(vocab_size, embedding_size)

    self.layers = [self.lstm, self.lstm_output, self.softmax, self.embedding]
    self.params = list(
        itertools.chain(*[
            layer.params for layer in self.layers
            if hasattr(layer, 'params')
        ]))

def main():
    input_size, output_size = 3, 3
    rnn = RNN()
    rnn.add_layer(LSTM(input_size, output_size))

    X_train = [[[1, 0, 0]], [[0, 1, 0]], [[0, 0, 1]]]
    Y_train = [[[0, 1, 0]], [[0, 0, 1]], [[1, 0, 0]]]

    epochs = 1000
    rnn.train(X_train, Y_train, epochs=epochs)

    for p, y in zip(rnn.predict(X_train), Y_train):
        _p = np.zeros_like(p).astype(int)
        _p[:, np.argmax(p)] = 1
        print('%30s %10s %10s' % (p.reshape(1, -1), _p, np.array(y)))

class Encoder(Sequential):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embedding = Embedding(vocab_size, embedding_size)
        self.lstm = LSTM(embedding_size, hidden_size)

        self.layers = [self.embedding, self.lstm]
        self.params = list(
            itertools.chain(*[
                layer.params for layer in self.layers
                if hasattr(layer, 'params')
            ]))

    def forward(self, batch, mask):
        # ``batch`` is a matrix whose rows are sentences, e.g. x = [1, 4, 5, 2, 0].
        # ``emb`` is a list of embedding matrices, emb[i].shape = (sens_size, embedding_size).
        emb = self.embedding.forward(batch)
        (H, C) = self.lstm.forward(emb, mask)
        # Return the final hidden and cell states.
        return (H[-1], C[-1])

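# Usage sketch (not from the original sources): assuming the Encoder and
# Decoder classes shown above plus Theano symbolic inputs, the two pieces
# would be wired together roughly like this. The names src_batch, src_mask
# and tgt_mask are hypothetical placeholders for the encoder/decoder inputs.
encoder = Encoder(vocab_size=10000, embedding_size=128, hidden_size=256)
decoder = Decoder(vocab_size=10000, embedding_size=128, hidden_size=256, output_size=256)

ec_H, ec_C = encoder.forward(src_batch, src_mask)  # final hidden and cell states
probs = decoder.forward(ec_H, ec_C, tgt_mask)      # per-step softmax over the vocabulary
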
def __init__(self, vocabulary_size: int, hidden_size: int, n_layers: int,
             embedding_size: int, dropout: float, max_out_len: int):
    super().__init__()
    self.dropout = dropout
    self.vocabulary_size = vocabulary_size
    self.embedding = torch.nn.Embedding(vocabulary_size + 1, embedding_size)
    self.lstm = LSTM(embedding_size, hidden_size, n_layers, dropout=dropout)
    self.output_projection = Linear(hidden_size, vocabulary_size + 1)
    self.max_out_len = max_out_len
    self.sos_token = self.vocabulary_size
    self.eos_token = self.vocabulary_size

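# Hedged sketch (not from the original source) of how a decoder built with the
# constructor above is often rolled out greedily at inference time. It assumes
# the custom LSTM behaves like torch.nn.LSTM with batch_first inputs and a
# (hidden, cell) state tuple; greedy_decode itself is a hypothetical helper.
def greedy_decode(self, state, batch_size, device):
    token = torch.full((batch_size,), self.sos_token, dtype=torch.long, device=device)
    logits_per_step = []
    for _ in range(self.max_out_len):
        emb = self.embedding(token).unsqueeze(1)         # (batch, 1, embedding_size)
        out, state = self.lstm(emb, state)               # one decoding step
        logits = self.output_projection(out.squeeze(1))  # (batch, vocabulary_size + 1)
        token = logits.argmax(dim=-1)
        logits_per_step.append(logits)
    return torch.stack(logits_per_step, dim=1)
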
def main():
    x = tensor.tensor3()
    t = tensor.matrix()

    task = TempOrder(args.low, args.high, args.length,
                     batch_size=config.batch_size, long_sequences=False)
    train_db = SyntheticDatabase(task, number_of_batches=config.number_of_batches)
    valid_db = SyntheticDatabase(task,
                                 number_of_batches=config.test_sequences // config.batch_size,
                                 phase='valid')

    model = TempOrderModel(x, t, [
        LSTM(task.input_size, config.layer_size, config.batch_size,
             args.hid_dropout_rate, args.drop_candidates, args.per_step,
             weight_init=Uniform(config.scale), persistent=False),
        LastStepPooling(),
        Linear(config.layer_size, task.output_size)
    ])

    if args.finetune:
        model.load('exp/temp_order/opt.pkl')

    opt = SGDOptimizer(model, x, t, train_db, test_db=valid_db,
                       name="temp_order", clip_gradients=True,
                       clip_threshold=5, print_norms=True)
    opt.train(train_db, test_db=valid_db, learning_rate=args.lr, epochs=1000)

def train(self, train_data, dev_data, test_data, maxlen):
    # tr = tracker.SummaryTracker()
    rng = np.random.RandomState(3435)
    train_x, train_y = self.shared_dataset(train_data)
    dev_x, dev_y = self.shared_dataset(dev_data)
    test_x, test_y = self.shared_dataset(test_data)
    test_len = len(test_data[0])
    n_train_batches = len(train_data[0]) // self.batch_size
    n_val_batches = len(dev_data[0]) // self.batch_size
    n_test_batches = test_len // self.batch_size
    input_width = self.hidden_sizes[0]

    x = T.matrix('x')
    y = T.ivector('y')
    index = T.lscalar()
    Words = theano.shared(value=self.word_vectors, name="Words", borrow=True)
    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (self.batch_size, maxlen, input_width))

    lstm = LSTM(dim=input_width, batch_size=self.batch_size, number_step=maxlen)
    layer0_output = lstm.feed_foward(layer0_input)
    lstm.mean_pooling_input(layer0_output)

    hidden_sizes = [self.hidden_sizes[0], self.hidden_sizes[0]]
    hidden_layer = HiddenLayer(rng, hidden_sizes=hidden_sizes,
                               input_vectors=lstm.output,
                               activation=utils.Tanh, name="Hidden_Tanh")
    hidden_layer.predict()
    hidden_layer_relu = HiddenLayer(rng, hidden_sizes=hidden_sizes,
                                    input_vectors=hidden_layer.output)
    hidden_layer_relu.predict()
    # hidden_layer_dropout = HiddenLayerDropout(rng, hidden_sizes=self.hidden_sizes[:2],
    #                                           input_vectors=lstm.output,
    #                                           W=hidden_layer.W, b=hidden_layer.b)
    full_connect = FullConnectLayer(rng,
                                    layers_size=[self.hidden_sizes[0], self.hidden_sizes[-1]],
                                    input_vector=hidden_layer_relu.output)
    full_connect.predict()

    cost = full_connect.negative_log_likelihood(y)
    params = lstm.params + hidden_layer.params + hidden_layer_relu.params + full_connect.params
    # params = hidden_layer.params + hidden_layer_relu.params + full_connect.params
    params_length = len(params)

    # Initial values for e_grad, e_delta_prev and delta at time 0.
    e_grad, e_delta_prev, delta = self.init_hyper_values(params_length)
    # e_grad_d, e_delta_prev_d, delta_d = self.init_hyper_values(params_length, name="D")

    # Compute gradients.
    grads = T.grad(cost, params)

    # Dropout hidden layer.
    # hidden_layer_dropout.dropout()
    # hidden_layer_dropout.predict()
    # full_connect.setInput(hidden_layer_dropout.output)
    # full_connect.predict()
    # cost_d = full_connect.negative_log_likelihood(y)
    # Gradients for the dropout cost.
    # grads_d = T.grad(cost_d, params)

    e_grad, e_delta_prev, delta = self.adadelta(grads, e_grad, e_delta_prev)
    # e_grad_d, e_delta_prev_d, delta_d = self.adadelta(grads_d, e_grad_d, e_delta_prev_d, delta_d)
    grads = delta
    # grad_d = delta_d
    updates = [(p, p - d) for p, d in zip(params, grads)]
    # updates = [(p, p - self.learning_rate * d) for p, d in zip(params, grads)]

    train_model = theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_x[index * self.batch_size:(index + 1) * self.batch_size],
            y: train_y[index * self.batch_size:(index + 1) * self.batch_size]
        })
    val_model = theano.function(
        [index], full_connect.errors(y),
        givens={
            x: dev_x[index * self.batch_size:(index + 1) * self.batch_size],
            y: dev_y[index * self.batch_size:(index + 1) * self.batch_size]
        })
    test_model = theano.function(
        inputs=[index], outputs=full_connect.errors(y),
        givens={
            x: test_x[index * self.batch_size:(index + 1) * self.batch_size],
            y: test_y[index * self.batch_size:(index + 1) * self.batch_size]
        })

    validation_frequency = min(n_train_batches, self.patience // 2)
    val_batch_lost = 1.
    best_batch_lost = 1.
    best_test_lost = 1.
    stop_count = 0
    epoch = 0
    done_loop = False
    current_time_step = 0
    improve_threshold = 0.995
    iter_list = range(n_train_batches)

    while epoch < self.epochs and not done_loop:
        epoch_cost_train = 0.
        epoch += 1
        batch_train = 0
        print("Start epoch: %i" % epoch)
        start = time.time()
        random.shuffle(iter_list)
        for mini_batch, m_b_i in zip(iter_list, xrange(n_train_batches)):
            current_time_step = (epoch - 1) * n_train_batches + m_b_i
            epoch_cost_train += train_model(mini_batch)
            batch_train += 1
            if (current_time_step + 1) % validation_frequency == 0:
                val_losses = [val_model(i) for i in xrange(n_val_batches)]
                val_losses = np.array(val_losses)
                val_batch_lost = np.mean(val_losses)
                if val_batch_lost < best_batch_lost:
                    if best_batch_lost * improve_threshold > val_batch_lost:
                        self.patience = max(self.patience,
                                            current_time_step * self.patience_frq)
                        best_batch_lost = val_batch_lost
                    # Evaluate on the test set.
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    current_test_lost = np.mean(test_losses)
                    print('epoch %i minibatch %i test accuracy of %i examples is: %.5f'
                          % (epoch, m_b_i, test_len, (1 - current_test_lost) * 100.))
                    if best_test_lost > current_test_lost:
                        best_test_lost = current_test_lost
            if self.patience <= current_time_step:
                print(self.patience)
                done_loop = True
                break
        print('epoch: %i, training time: %.2f secs; with avg cost: %.5f'
              % (epoch, time.time() - start, epoch_cost_train / batch_train))

    print('Best test accuracy is: %.5f' % (1 - best_test_lost))
    utils.save_layer_params(lstm, 'lstm')
    utils.save_layer_params(hidden_layer, 'hidden_lstm')
    utils.save_layer_params(hidden_layer_relu, 'hidden_relu_lstm')
    utils.save_layer_params(full_connect, 'full_connect_lstm')
    return lstm.params

def _build_layers(self):
    self.emb = HRRWordEmbedding(self.vocab_size, self.cell_dim,
                                self.num_roles, self.num_fillers)
    self.rnn = LSTM(self.num_layers, self.cell_dim, keep_prob=self.keep_prob)

char_to_int[chars[i]] = i
int_to_char[i] = chars[i]
vec = np.zeros((len(chars), 1))
vec[i] = 1.
char_to_vec[chars[i]] = vec

source = np.array(list(source))[:(len(source) // BATCH_SIZE) * BATCH_SIZE]
source = np.array(np.split(source, BATCH_SIZE))

EMBEDDING_LENGTH = len(chars)

# Creating the model.
model = Network(
    LSTM(size=512, input_size=EMBEDDING_LENGTH, batch_size=BATCH_SIZE,
         backprop_depth=SEQUENCE_LENGTH, stateful=True),
    LSTM(size=512, input_size=512, batch_size=BATCH_SIZE,
         backprop_depth=SEQUENCE_LENGTH, stateful=True),
    TimeDistributed(
        Dense(size=EMBEDDING_LENGTH, input_size=512, activation=SparseSoftmax())))

if RESTORE_MODEL_PATH:
    model.loadParams(RESTORE_MODEL_PATH)

def _build(self):
    """Build the model.

    The input has the form BatchSize x Num_Time_Steps x Num_Channels.

    Params: None
    Returns: None
    """
    self.layers.append(
        Conv1D(num_in_channels=self.num_input_channels,
               num_out_channels=16,
               filter_size=3,
               strides=1,
               padding="SAME",
               dropout=0.0,
               bias=True,
               act=leak_relu))
    self.layers.append(
        Conv1D(num_in_channels=16,
               num_out_channels=32,
               filter_size=3,
               strides=1,
               padding="SAME",
               dropout=0.0,
               bias=True,
               act=leak_relu))
    self.layers.append(
        LSTM(input_dim=32,
             num_units=128,
             length=self.num_time_steps,
             batch_size=32,
             return_sequece=False,
             bias=True))
    self.layers.append(
        Dense(input_dim=128,
              output_dim=64,
              dropout=0.0,
              sparse_inputs=False,
              act=leak_relu,
              bias=True))
    self.layers.append(
        CenterLoss(num_classes=self.num_classes, num_feas=64, learning_rate=0.5))
    self.layers.append(
        Dense(input_dim=64,
              output_dim=self.num_classes,
              dropout=0.0,
              sparse_inputs=False,
              act=leak_relu,
              bias=True))

char_to_int[chars[i]] = i
int_to_char[i] = chars[i]
vec = np.zeros((len(chars), 1))
vec[i] = 1.
char_to_vec[chars[i]] = vec

# The length of the vector that represents a character
# is equivalent to the number of different characters
# in the text.
EMBEDDING_LENGTH = len(chars)

# Creating the model.
model = Network(
    LSTM(size=512, input_size=EMBEDDING_LENGTH, batch_size=1,
         backprop_depth=1, stateful=True),
    LSTM(size=512, input_size=512, batch_size=1,
         backprop_depth=1, stateful=True),
    TimeDistributed(
        Dense(size=EMBEDDING_LENGTH, input_size=512,
              activation=SparseSoftmax(TEMPERATURE))))
model.loadParams(MODEL)

# optimizer = Adam(learning_rate=lambda n: 0.001, beta_1=0.9, beta_2=0.999)

train_data_iter = data_iterator_simple(load_train_func, len(x_train), batch_size,
                                       shuffle=True, with_file_cache=False)
valid_data_iter = data_iterator_simple(load_valid_func, len(x_valid), batch_size,
                                       shuffle=True, with_file_cache=False)

x = nn.Variable((batch_size, sentence_length))
t = nn.Variable((batch_size, sentence_length, 1))
h = PF.embed(x, vocab_size, embedding_size)
h = LSTM(h, hidden, return_sequences=True)
h = TimeDistributed(PF.affine)(h, hidden, name='hidden')
y = TimeDistributed(PF.affine)(h, vocab_size, name='output')

mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor.
from nnabla.monitor import Monitor, MonitorSeries, MonitorTimeElapsed
monitor = Monitor('./tmp-lstmlm')

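# Hedged sketch (not part of the original snippet): with nnabla, the graph
# above is typically driven by pulling minibatches from the data iterator,
# binding them to the Variables, and running forward/backward/update. The
# iteration count and any reshaping of the target batch are assumptions.
for i in range(max_iter):  # max_iter is hypothetical
    x_batch, t_batch = train_data_iter.next()
    x.d, t.d = x_batch, t_batch.reshape(t.shape)  # bind numpy arrays to the Variables
    loss.forward()
    solver.zero_grad()
    loss.backward()
    solver.update()
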
def _build_layers(self):
    self.emb = HRRWordEmbedding(self.vocab_size, self.cell_dim,
                                self.num_roles, self.num_fillers)
    self.rnn = LSTM(self.num_layers, self.cell_dim, keep_prob=self.keep_prob,
                    proj_dim=self.cell_dim * 2)
    self.chunk_layer = HRRChunkLayer(self.cell_dim, self.num_roles)

def main():
    with open(LOOKUP_FILE, 'r') as file:
        chars = json.load(file)

    # Here we make dictionaries that can be used to convert between
    # characters, integer ids of characters, and the one-hot vectors
    # that will be used to represent the characters.
    char_to_int = dict()
    int_to_char = dict()
    char_to_vec = dict()
    for i in range(len(chars)):
        char_to_int[chars[i]] = i
        int_to_char[i] = chars[i]
        vec = np.zeros((len(chars), 1))
        vec[i] = 1.
        char_to_vec[chars[i]] = vec

    # The length of the vector that represents a character
    # is equivalent to the number of different characters
    # in the text.
    EMBEDDING_LENGTH = len(chars)

    # Create the LSTM layers only. We don't use the Network class,
    # since we are only interested in the activations of the recurrent
    # layers.
    first_layer = LSTM(size=512, input_size=EMBEDDING_LENGTH, batch_size=1,
                       backprop_depth=1, stateful=True)
    second_layer = LSTM(size=512, input_size=512, batch_size=1,
                        backprop_depth=1, stateful=True)

    # Load the weights.
    with open(MODEL, 'r') as file:
        weights = json.load(file)
    first_layer.loadParams(weights[0])
    second_layer.loadParams(weights[1])

    # Load the text file.
    with open(TEXT_FILE, 'r', encoding='utf8') as file:
        text = file.read()
    source = list(text)
    for i in range(len(source)):
        source[i] = char_to_vec[source[i]]

    # Feed the text to the network. Here we look at the activations of the
    # hidden-state neurons of the 2nd LSTM layer. We take the first element
    # of the output as there is only one batch.
    out = second_layer.forward(first_layer.forward(np.array([source])))[0]

    # ###############---TKINTER---#############################################
    class Wrap:
        NEURON_INDEX = 0

    def showNeuron():
        for j in range(out.shape[0]):
            # Leave the background of newline characters white regardless of
            # their activation; otherwise the color would fill the entire
            # remainder of the line, which is very distracting to look at.
            intensity = 255 if text[j] == '\n' else 255 - int(
                (out[j, Wrap.NEURON_INDEX, 0] + 1) * 127.5)
            text_box.tag_config(str(j),
                                background="#%02x%02x%02x" % (255, intensity, intensity))

    def inputFromEntry(evt):
        Wrap.NEURON_INDEX = int(entry.get())
        entry.delete(0, "end")
        showNeuron()

    def nextButtonClicked():
        Wrap.NEURON_INDEX += 1
        entry.delete(0, "end")
        entry.insert(tk.INSERT, str(Wrap.NEURON_INDEX))
        showNeuron()

    # Making the tkinter window.
    root = tk.Tk()
    text_box = tk.Text(root, height=35)
    text_box.insert(tk.INSERT, text)
    text_box.pack()

    current_line = 1
    current_char = 0
    for i in range(out.shape[0]):
        text_box.tag_add(str(i), f"{current_line}.{current_char}")
        current_char += 1
        if text[i] == '\n':
            current_line += 1
            current_char = 0

    # Making the entry box.
    entry = tk.Entry(root, width=5)
    entry.pack()
    entry.bind("<Return>", inputFromEntry)

    # Buttons.
    up = tk.Button(text="Next", command=nextButtonClicked)
    up.pack()

    # Show the first neuron by default.
    showNeuron()
    root.mainloop()

from loss import CrossEntropyLoss
from optimizer import SGDOptimizer, RMSpropOptimizer
from network import Network
from data_preparation import load_data
from solve_rnn import solve_rnn

import theano.tensor as T

X_train, y_train, X_test, y_test = load_data()

HIDDEN_DIM = 32
INPUT_DIM = 20
OUTPUT_DIM = 10

model = Network()
model.add(LSTM('rnn1', HIDDEN_DIM, INPUT_DIM, 0.1))   # output shape: 4 x HIDDEN_DIM
model.add(Linear('fc', HIDDEN_DIM, OUTPUT_DIM, 0.1))  # output shape: 4 x OUTPUT_DIM
model.add(Softmax('softmax'))

loss = CrossEntropyLoss('xent')

optim = RMSpropOptimizer(learning_rate=0.01, rho=0.9)
input_placeholder = T.fmatrix('input')
label_placeholder = T.fmatrix('label')
model.compile(input_placeholder, label_placeholder, loss, optim)

MAX_EPOCH = 6
DISP_FREQ = 1000
TEST_FREQ = 10000

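# Presumably the compiled model is then handed to the imported solve_rnn
# helper together with the data and the constants above; the exact call
# signature is an assumption and shown only as a sketch:
# solve_rnn(model, X_train, y_train, X_test, y_test,
#           MAX_EPOCH, DISP_FREQ, TEST_FREQ)
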
output = []
for f, f_size in zip(filters, filster_sizes):
    _h = PF.convolution(h, f, kernel=(1, f_size), pad=(0, f_size // 2),
                        name='conv_{}'.format(f_size))
    _h = F.max_pooling(_h, kernel=(1, word_length))
    output.append(_h)
h = F.concatenate(*output, axis=1)

h = F.transpose(h, (0, 2, 1, 3))
h = F.reshape(h, (batch_size, sentence_length, sum(filters)))
# h = PF.batch_normalization(h, axes=[2])
h = TimeDistributed(Highway)(h, name='highway1')
h = TimeDistributed(Highway)(h, name='highway2')
h = LSTM(h, lstm_size, return_sequences=True, name='lstm1')
h = LSTM(h, lstm_size, return_sequences=True, name='lstm2')
h = TimeDistributed(PF.affine)(h, lstm_size, name='hidden')
y = TimeDistributed(PF.affine)(h, word_vocab_size, name='output')

t = nn.Variable((batch_size, sentence_length, 1))

mask = F.sum(F.sign(t), axis=2)  # do not predict 'pad'
entropy = TimeDistributedSoftmaxCrossEntropy(y, t) * mask
count = F.sum(mask, axis=1)
loss = F.mean(F.div2(F.sum(entropy, axis=1), count))

# Create solver.
solver = S.Momentum(1e-2, momentum=0.9)
solver.set_parameters(nn.get_parameters())

# Create monitor.

def _build_layers(self):
    emb_cls = TiedIOEmbedding if self.tied_io else EmbeddingLayer
    self.emb = emb_cls(self.vocab_size, self.cell_dim)
    self.rnn = LSTM(self.num_layers, self.cell_dim, keep_prob=self.keep_prob)

def __init__(self, word_V, dep_V, word_d=100, pos_d=25, mlp_d=100,
             mlp_label_d=100, num_lstm_layers=2, lstm_d=125,
             embeddings_init=None, pos_V=None, seed=0, verbose=False):
    '''
    word_V - size of word vocab
    dep_V - size of relation label vocab
    word_d - dimension of word embeddings
    pos_d - dimension of POS embeddings
    mlp_d - dimension of hidden layer for arc prediction MLP
    mlp_label_d - dimension of hidden layer for label prediction MLP
    num_lstm_layers - number of bi-directional LSTM layers to stack
    lstm_d - dimension of hidden state in the LSTM
    embeddings_init - use pre-trained embeddings
    pos_V - size of POS vocab
    seed - random seed for initialization
    verbose - whether to print information about these parameters
    '''
    if verbose:
        print('Word vocabulary size: {}'.format(word_V))
        print('Dependency relation vocabulary size: {}'.format(dep_V))
        print('POS vocabulary size: {}'.format(pos_V))

    self.word_V = word_V
    self.dep_V = dep_V
    self.pos_V = pos_V
    self.word_d = word_d
    self.pos_d = pos_d
    self.mlp_d = mlp_d
    self.mlp_label_d = mlp_label_d
    self.lstm_layers = num_lstm_layers
    self.lstm_d = lstm_d

    np.random.seed(seed)
    self.model = dynet.Model()

    # Embedding layers for words and POS tags.
    self.embeddings = self.model.add_lookup_parameters((self.word_V, self.word_d))
    if pos_V is not None:
        self.pos_embeddings = self.model.add_lookup_parameters((self.pos_V, self.pos_d))

    # Bi-directional LSTM layers:
    # embeddings -> layer1 -> layer2
    lstm_layers = []
    for i in range(num_lstm_layers):
        input_d = word_d
        if i:
            input_d = 2 * lstm_d
        elif pos_V is not None:
            input_d += pos_d
        fwd_lstm_layer = LSTM(self.model, input_d, lstm_d)
        rev_lstm_layer = LSTM(self.model, input_d, lstm_d, reverse=True)
        lstm_layers.append((fwd_lstm_layer, rev_lstm_layer))

    # Arc prediction MLP:
    # layer2(i), layer2(j) -> concatenate -> score
    mlp_layer = MLP(self.model, lstm_d * 4, mlp_d, 1)

    # Label prediction MLP.
    if mlp_label_d:
        mlp_label_layer = MLP(self.model, lstm_d * 4, mlp_label_d, dep_V)
    else:
        mlp_label_layer = None

    # Train the model using the Adam optimizer.
    self.trainer = dynet.AdamTrainer(self.model)

    # Take in word and POS indices, return the output of the 2nd layer.
    def get_lstm_output(indices, pos_indices=None):
        embeddings_out = [self.embeddings[w] for w in indices]
        x = embeddings_out
        if pos_V is not None and pos_indices is not None:
            x = []
            for i, input in enumerate(embeddings_out):
                x.append(dynet.concatenate(
                    [input, self.pos_embeddings[pos_indices[i]]]))
        for i in range(num_lstm_layers):
            x_1 = lstm_layers[i][0].get_output(x)[0]
            x_2 = lstm_layers[i][1].get_output(x)[0]
            x = [dynet.concatenate([x_1[i], x_2[i]]) for i in range(len(indices))]
        return x
    self.states = get_lstm_output

    # Score all arcs from i to j using the arc prediction MLP.
    def score_arcs(states, value=True):
        length = len(states)
        scores = [[None for i in range(length)] for j in range(length)]
        for i in range(length):
            for j in range(length):
                score = mlp_layer.get_output(
                    dynet.concatenate([states[i], states[j]]))
                if value:
                    scores[i][j] = score.scalar_value()
                else:
                    scores[i][j] = score
        return scores
    self.score_arcs = score_arcs

    # Score all labels at i using the label prediction MLP.
    def score_labels(states, arcs, value=True):
        scores = []
        for i in range(len(states)):
            score = mlp_label_layer.get_output(
                dynet.concatenate([states[i], states[arcs[i]]]))
            if value:
                scores.append(score.value())
            else:
                scores.append(score)
        return scores
    self.score_labels = score_labels

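# Hedged usage sketch (not from the original source): once the parser object is
# constructed, the closures stored on it score a sentence roughly like this.
# `parser`, `word_indices`, `pos_indices` and `predicted_heads` are hypothetical
# names; with DyNet, a fresh computation graph is usually created per sentence
# before building these expressions.
dynet.renew_cg()
states = parser.states(word_indices, pos_indices)
arc_scores = parser.score_arcs(states)                       # scores[i][j] for every (i, j) pair
label_scores = parser.score_labels(states, predicted_heads)  # label scores for the chosen arcs
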
def build_test_model(self, data):
    rng = np.random.RandomState(3435)
    lstm_params, hidden_params, hidden_relu_params, full_connect_params, convs = \
        self.load_trained_params()
    data_x, data_y, maxlen = data
    test_len = len(data_x)
    n_test_batches = test_len // self.batch_size

    x = T.matrix('x')
    y = T.ivector('y')
    index = T.lscalar()
    Words = theano.shared(value=self.word_vectors, name="Words", borrow=True)
    input_width = self.hidden_sizes[0]
    layer0_input = Words[T.cast(x.flatten(), dtype="int32")].reshape(
        (self.batch_size, maxlen, input_width))

    lstm = LSTM(dim=input_width, batch_size=self.batch_size,
                number_step=maxlen, params=lstm_params)
    layer0_output = lstm.feed_foward(layer0_input)

    conv_outputs = list()
    conv_nnets = list()
    params = list()
    output = T.cast(layer0_input.flatten(), dtype=floatX)
    conv_input = output.reshape((self.batch_size, 1, maxlen, input_width))
    for it, p_conv in enumerate(convs):
        pheight = maxlen - self.filter_sizes[it] + 1
        conv = ConvolutionLayer(rng=rng,
                                filter_shape=(self.kernel, 1, self.filter_sizes[it], input_width),
                                input_shape=(self.batch_size, 1, maxlen, input_width),
                                poolsize=(pheight, 1),
                                name="conv" + str(self.filter_sizes[it]),
                                W=p_conv[0], b=p_conv[1])
        # => batch_size * 1 * 100 * width
        output = conv.predict(conv_input)
        layer1_input = output.flatten(2)
        params += conv.params
        conv_outputs.append(layer1_input)
        conv_nnets.append(conv)
    conv_nnets_output = T.concatenate(conv_outputs, axis=1)

    hidden_layer = HiddenLayer(rng,
                               hidden_sizes=[self.kernel * 3, self.hidden_sizes[0]],
                               input_vectors=conv_nnets_output,
                               activation=utils.Tanh,
                               name="Hidden_Tanh",
                               W=hidden_params[0], b=hidden_params[1])
    hidden_layer.predict()
    hidden_layer_relu = HiddenLayer(rng,
                                    hidden_sizes=[self.hidden_sizes[0], self.hidden_sizes[0]],
                                    input_vectors=hidden_layer.output,
                                    W=hidden_relu_params[0], b=hidden_relu_params[1])
    hidden_layer_relu.predict()
    # hidden_layer_dropout = HiddenLayerDropout(rng, hidden_sizes=self.hidden_sizes[:2],
    #                                           input_vectors=lstm.output,
    #                                           W=hidden_layer.W, b=hidden_layer.b)
    full_connect = FullConnectLayer(rng,
                                    layers_size=[self.hidden_sizes[0], self.hidden_sizes[-1]],
                                    input_vector=hidden_layer_relu.output,
                                    W=full_connect_params[0], b=full_connect_params[1])
    full_connect.predict()

    test_data_x = theano.shared(np.asarray(data_x, dtype=floatX), borrow=True)
    test_data_y = theano.shared(np.asarray(data_y, dtype='int32'), borrow=True)

    errors = 0.
    if test_len == 1:
        test_model = theano.function(
            [index],
            outputs=full_connect.get_predict(),
            on_unused_input='ignore',
            givens={
                x: test_data_x[index * self.batch_size:(index + 1) * self.batch_size],
                y: test_data_y[index * self.batch_size:(index + 1) * self.batch_size]
            })
        index = 0
        avg_errors = test_model(index)
    else:
        test_model = theano.function(
            [index],
            outputs=full_connect.errors(y),
            givens={
                x: test_data_x[index * self.batch_size:(index + 1) * self.batch_size],
                y: test_data_y[index * self.batch_size:(index + 1) * self.batch_size]
            })
        for i in xrange(n_test_batches):
            errors += test_model(i)
        avg_errors = errors / n_test_batches
    return avg_errors