class ReadLayer(object):
    """Attention read layer.

    A linear map from the recurrent hidden state produces the 4 attention
    parameters that drive a Reader glimpse over the input image.
    """

    def __init__(self, rng, h_shape, image_shape, N, name='Default_readlayer'):
        print('Building layer: ' + name)
        # Linear transform: flattened hidden state -> 4 attention parameters.
        self.lin_transform = HiddenLayer(
            rng,
            n_in=h_shape[0] * h_shape[1],
            n_out=4,
            activation=None,
            irange=0.001,
            name='readlayer: linear transformation')
        # Glimpse extractor over an N x N grid of Gaussian filters.
        self.reader = Reader(
            rng,
            image_shape=image_shape,
            N=N,
            name='readlayer: reader')
        # Only the linear transform carries trainable parameters.
        self.params = self.lin_transform.params

    def one_step(self, h, image):
        """Map hidden state ``h`` to attention params, then read a glimpse.

        Returns the glimpse plus the attention geometry
        (read, g_x, g_y, delta, sigma_sq).
        """
        att_params = self.lin_transform.one_step(h)
        read, g_x, g_y, delta, sigma_sq = self.reader.one_step(
            att_params, image)
        return read, g_x, g_y, delta, sigma_sq
class TestLSTM(AbstractModel):
    """Recurrent attention model.

    A ReadLayer glimpse feeds two stacked LSTM layers and a conv pathway;
    their concatenated features drive a softmax classifier.  Training
    minimises a weighted sum of classification NLL and a gaze-tracking loss.

    NOTE(review): this class is defined twice in this file; this first
    definition is shadowed by the later duplicate — confirm which is intended.
    """

    def __init__(self, input_dims, learning_rate, batch_size):
        # Symbolic variables used by the compiled functions.
        self.input = T.tensor3(name='input', dtype=theano.config.floatX)
        self.target = T.matrix(name="target", dtype=theano.config.floatX)
        self.h_tm1 = T.matrix(name="hidden_output", dtype=theano.config.floatX)
        self.c_tm1 = T.matrix(name="hidden_state", dtype=theano.config.floatX)
        self.learning_rate = learning_rate

        N = 12  # attention glimpse is N x N
        self.lstm_layer_sizes = [128, 128]

        self.read_layer = ReadLayer(
            rng,
            h_shape=(sum(self.lstm_layer_sizes), 1),
            image_shape=input_dims,
            N=N,
            name='Read Layer')
        self.conv_layer = ConvPoolLayer(
            rng,
            filter_shape=(30, 1, 3, 3),
            input_shape=(1, N, N),
        )
        self.lstm_layer1 = LSTMLayer(
            rng, n_in=N * N, n_out=self.lstm_layer_sizes[0], name='LSTM1')
        self.lstm_layer2 = LSTMLayer(
            rng,
            n_in=self.lstm_layer_sizes[0],
            n_out=self.lstm_layer_sizes[1],
            name='LSTM2')
        # Output layer consumes both LSTM outputs plus the flattened conv
        # features (5 * 5 * 30 — presumably the ConvPoolLayer output shape
        # for a 12x12 glimpse; TODO confirm).
        self.output_layer = HiddenLayer(
            rng,
            n_in=self.lstm_layer_sizes[0] + self.lstm_layer_sizes[1] + 5 * 5 * 30,
            n_out=10,
            activation=None,
            name='output')
        # NOTE(review): self.conv_layer.params is NOT included here, so the
        # conv filters are never updated by training — confirm intended.
        self.params = (self.read_layer.params + self.lstm_layer1.params +
                       self.lstm_layer2.params + self.output_layer.params)

    def get_predict_output(self, input, h_tm1, c_tm1):
        """Single recurrent step for prediction, given previous states."""
        h, c, output, g_y, g_x, read, delta, sigma_sq = self.recurrent_step(
            input, h_tm1, c_tm1)
        return output, h, c, read, g_x, g_y, delta, sigma_sq

    def get_train_output(self, images, batch_size):
        """Scan recurrent_step over time.

        images: [batch, time, height, width]; returns the per-step softmax
        outputs and glimpse centres (g_y, g_x).
        """
        images = images.dimshuffle([1, 0, 2, 3])  # -> [time, batch, h, w]
        h0, c0 = self.get_initial_state(batch_size)
        [h, c, output, g_y, g_x, _, _, _], _ = theano.scan(
            fn=self.recurrent_step,
            outputs_info=[h0, c0, None, None, None, None, None, None],
            sequences=images,
        )
        return output, g_y, g_x

    def recurrent_step(self, image, h_tm1, c_tm1):
        """One full step: attend, run both LSTMs, conv the glimpse, classify."""
        read, g_x, g_y, delta, sigma_sq = self.read_layer.one_step(
            h_tm1, image)
        read_flat = read.flatten(ndim=2)
        n1 = self.lstm_layer_sizes[0]
        # States of the two layers are stored concatenated along axis 1.
        h_1, c_1 = self.lstm_layer1.one_step(
            read_flat, h_tm1[:, 0:n1], c_tm1[:, 0:n1])
        h_2, c_2 = self.lstm_layer2.one_step(
            h_1, h_tm1[:, n1:], c_tm1[:, n1:])
        h = T.concatenate([h_1, h_2], axis=1)
        c = T.concatenate([c_1, c_2], axis=1)
        conv = self.conv_layer.one_step(read.dimshuffle([0, 'x', 1, 2]))
        conv = conv.flatten(ndim=2)
        lin_output = self.output_layer.one_step(
            T.concatenate([h_1, h_2, conv], axis=1))
        output = T.nnet.softmax(lin_output)
        return [h, c, output, g_y, g_x, read, delta, sigma_sq]

    def step_with_att(self, h_tm1, c_tm1, image):
        """Like recurrent_step but without the conv/classifier pathway."""
        read, g_x, g_y, delta, sigma_sq = self.read_layer.one_step(
            h_tm1, image)
        # BUG FIX: original flattened the undefined name `read_` (NameError).
        read_flat = read.flatten(ndim=2)
        n1 = self.lstm_layer_sizes[0]
        h_1, c_1 = self.lstm_layer1.one_step(
            read_flat, h_tm1[:, 0:n1], c_tm1[:, 0:n1])
        h_2, c_2 = self.lstm_layer2.one_step(
            h_1, h_tm1[:, n1:], c_tm1[:, n1:])
        # BUG FIX: original returned h and c without ever computing them;
        # concatenate per-layer states as recurrent_step does.
        h = T.concatenate([h_1, h_2], axis=1)
        c = T.concatenate([c_1, c_2], axis=1)
        return [h, c, read, g_x, g_y, delta, sigma_sq]

    def compile(self, train_batch_size):
        """Build and store the compiled train and predict functions."""
        print("Compiling functions...")
        train_input = T.tensor4()
        target_y = T.matrix()
        target_x = T.matrix()
        train_output, g_y, g_x = self.get_train_output(
            train_input, train_batch_size)
        classification_loss = self.get_NLL_cost(train_output[-1], self.target)
        tracking_loss = self.get_tracking_cost(g_y, g_x, target_y, target_x)
        # Classification term is weighted 5x relative to tracking.
        loss = 5 * classification_loss + tracking_loss
        updates = Adam(loss, self.params, lr=self.learning_rate)
        self.train_func = theano.function(
            inputs=[train_input, self.target, target_y, target_x],
            outputs=[train_output[-1], loss],
            updates=updates,
            allow_input_downcast=True)

        h_tm1 = T.matrix()
        c_tm1 = T.matrix()
        predict_output, h, c, read, g_x, g_y, delta, sigma_sq = \
            self.get_predict_output(self.input, h_tm1, c_tm1)
        self.predict_func = theano.function(
            inputs=[self.input, h_tm1, c_tm1],
            outputs=[predict_output, h, c, read, g_x, g_y, delta, sigma_sq],
            allow_input_downcast=True)
        print("Done!")

    def train(self, x, y, target_y, target_x):
        """Run one training step.

        x is in the form of [batch, time, height, width];
        y is [batch, target].
        """
        prediction, loss = self.train_func(x, y, target_y, target_x)
        return prediction, loss

    def get_initial_state(self, batch_size, shared=True):
        """Zero initial hidden/cell states for both LSTM layers, concatenated."""
        total_states = sum(self.lstm_layer_sizes)
        h0 = np.zeros((batch_size, total_states), dtype=theano.config.floatX)
        c0 = np.zeros((batch_size, total_states), dtype=theano.config.floatX)
        if shared:
            h0 = theano.shared(h0, name='h0', borrow=True)
            c0 = theano.shared(c0, name='c0', borrow=True)
        return h0, c0

    def predict(self, x, reset=True, batch_size=1):
        """Run one prediction step, carrying hidden state between calls."""
        if reset:
            self.predict_h, self.predict_c = self.get_initial_state(
                batch_size, shared=False)
        if len(x.shape) == 2:
            # Single image: add a leading batch axis.
            x = np.expand_dims(x, axis=0)
        (prediction, self.predict_h, self.predict_c,
         read, g_x, g_y, delta, sigma_sq) = self.predict_func(
            x, self.predict_h, self.predict_c)
        return prediction, [read, g_x, g_y, delta, sigma_sq]

    def get_NLL_cost(self, output, target):
        """Mean negative log-likelihood against one-hot targets."""
        NLL = -T.sum(T.log(output) * target, axis=1)
        return NLL.mean()

    def get_tracking_cost(self, g_y, g_x, target_y, target_x):
        """Mean smoothed Euclidean distance between glimpse centre and target."""
        dist_sq = ((target_y - g_y) ** 2) + ((target_x - g_x) ** 2)
        # 1e-4 keeps the sqrt gradient finite at zero distance.
        return T.sqrt(dist_sq + 1e-4).mean()

    def get_updates(self, cost, params, learning_rate):
        """Plain SGD updates (alternative to the Adam updates in compile)."""
        gradients = T.grad(cost, params)
        # BUG FIX: removed duplicated `updates = updates = [...]` assignment.
        return [(param_i, param_i - learning_rate * grad_i)
                for param_i, grad_i in zip(params, gradients)]

    def deserialize(self, hidden):
        """Split a flat hidden vector into per-layer (size, 1) column vectors."""
        result = []
        start = 0
        for size in self.lstm_layer_sizes:
            # BUG FIX: original sliced hidden[start:size] instead of
            # hidden[start:start + size], truncating/emptying later layers.
            result.append(hidden[start:start + size].reshape((size, 1)))
            start += size
        return result
class TestLSTM(AbstractModel):
    """Recurrent attention model.

    A ReadLayer glimpse feeds two stacked LSTM layers and a conv pathway;
    their concatenated features drive a softmax classifier.  Training
    minimises a weighted sum of classification NLL and a gaze-tracking loss.

    NOTE(review): this class is defined twice in this file; this later
    definition shadows the earlier duplicate — confirm which is intended.
    """

    def __init__(self, input_dims, learning_rate, batch_size):
        # Symbolic variables used by the compiled functions.
        self.input = T.tensor3(name='input', dtype=theano.config.floatX)
        self.target = T.matrix(name="target", dtype=theano.config.floatX)
        self.h_tm1 = T.matrix(name="hidden_output", dtype=theano.config.floatX)
        self.c_tm1 = T.matrix(name="hidden_state", dtype=theano.config.floatX)
        self.learning_rate = learning_rate

        N = 12  # attention glimpse is N x N
        self.lstm_layer_sizes = [128, 128]

        self.read_layer = ReadLayer(
            rng,
            h_shape=(sum(self.lstm_layer_sizes), 1),
            image_shape=input_dims,
            N=N,
            name='Read Layer')
        self.conv_layer = ConvPoolLayer(
            rng,
            filter_shape=(30, 1, 3, 3),
            input_shape=(1, N, N),
        )
        self.lstm_layer1 = LSTMLayer(
            rng, n_in=N * N, n_out=self.lstm_layer_sizes[0], name='LSTM1')
        self.lstm_layer2 = LSTMLayer(
            rng,
            n_in=self.lstm_layer_sizes[0],
            n_out=self.lstm_layer_sizes[1],
            name='LSTM2')
        # Output layer consumes both LSTM outputs plus the flattened conv
        # features (5 * 5 * 30 — presumably the ConvPoolLayer output shape
        # for a 12x12 glimpse; TODO confirm).
        self.output_layer = HiddenLayer(
            rng,
            n_in=self.lstm_layer_sizes[0] + self.lstm_layer_sizes[1] + 5 * 5 * 30,
            n_out=10,
            activation=None,
            name='output')
        # NOTE(review): self.conv_layer.params is NOT included here, so the
        # conv filters are never updated by training — confirm intended.
        self.params = (self.read_layer.params + self.lstm_layer1.params +
                       self.lstm_layer2.params + self.output_layer.params)

    def get_predict_output(self, input, h_tm1, c_tm1):
        """Single recurrent step for prediction, given previous states."""
        h, c, output, g_y, g_x, read, delta, sigma_sq = self.recurrent_step(
            input, h_tm1, c_tm1)
        return output, h, c, read, g_x, g_y, delta, sigma_sq

    def get_train_output(self, images, batch_size):
        """Scan recurrent_step over time.

        images: [batch, time, height, width]; returns the per-step softmax
        outputs and glimpse centres (g_y, g_x).
        """
        images = images.dimshuffle([1, 0, 2, 3])  # -> [time, batch, h, w]
        h0, c0 = self.get_initial_state(batch_size)
        [h, c, output, g_y, g_x, _, _, _], _ = theano.scan(
            fn=self.recurrent_step,
            outputs_info=[h0, c0, None, None, None, None, None, None],
            sequences=images,
        )
        return output, g_y, g_x

    def recurrent_step(self, image, h_tm1, c_tm1):
        """One full step: attend, run both LSTMs, conv the glimpse, classify."""
        read, g_x, g_y, delta, sigma_sq = self.read_layer.one_step(
            h_tm1, image)
        read_flat = read.flatten(ndim=2)
        n1 = self.lstm_layer_sizes[0]
        # States of the two layers are stored concatenated along axis 1.
        h_1, c_1 = self.lstm_layer1.one_step(
            read_flat, h_tm1[:, 0:n1], c_tm1[:, 0:n1])
        h_2, c_2 = self.lstm_layer2.one_step(
            h_1, h_tm1[:, n1:], c_tm1[:, n1:])
        h = T.concatenate([h_1, h_2], axis=1)
        c = T.concatenate([c_1, c_2], axis=1)
        conv = self.conv_layer.one_step(read.dimshuffle([0, 'x', 1, 2]))
        conv = conv.flatten(ndim=2)
        lin_output = self.output_layer.one_step(
            T.concatenate([h_1, h_2, conv], axis=1))
        output = T.nnet.softmax(lin_output)
        return [h, c, output, g_y, g_x, read, delta, sigma_sq]

    def step_with_att(self, h_tm1, c_tm1, image):
        """Like recurrent_step but without the conv/classifier pathway."""
        read, g_x, g_y, delta, sigma_sq = self.read_layer.one_step(
            h_tm1, image)
        # BUG FIX: original flattened the undefined name `read_` (NameError).
        read_flat = read.flatten(ndim=2)
        n1 = self.lstm_layer_sizes[0]
        h_1, c_1 = self.lstm_layer1.one_step(
            read_flat, h_tm1[:, 0:n1], c_tm1[:, 0:n1])
        h_2, c_2 = self.lstm_layer2.one_step(
            h_1, h_tm1[:, n1:], c_tm1[:, n1:])
        # BUG FIX: original returned h and c without ever computing them;
        # concatenate per-layer states as recurrent_step does.
        h = T.concatenate([h_1, h_2], axis=1)
        c = T.concatenate([c_1, c_2], axis=1)
        return [h, c, read, g_x, g_y, delta, sigma_sq]

    def compile(self, train_batch_size):
        """Build and store the compiled train and predict functions."""
        print("Compiling functions...")
        train_input = T.tensor4()
        target_y = T.matrix()
        target_x = T.matrix()
        train_output, g_y, g_x = self.get_train_output(
            train_input, train_batch_size)
        classification_loss = self.get_NLL_cost(train_output[-1], self.target)
        tracking_loss = self.get_tracking_cost(g_y, g_x, target_y, target_x)
        # Classification term is weighted 5x relative to tracking.
        loss = 5 * classification_loss + tracking_loss
        updates = Adam(loss, self.params, lr=self.learning_rate)
        self.train_func = theano.function(
            inputs=[train_input, self.target, target_y, target_x],
            outputs=[train_output[-1], loss],
            updates=updates,
            allow_input_downcast=True)

        h_tm1 = T.matrix()
        c_tm1 = T.matrix()
        predict_output, h, c, read, g_x, g_y, delta, sigma_sq = \
            self.get_predict_output(self.input, h_tm1, c_tm1)
        self.predict_func = theano.function(
            inputs=[self.input, h_tm1, c_tm1],
            outputs=[predict_output, h, c, read, g_x, g_y, delta, sigma_sq],
            allow_input_downcast=True)
        print("Done!")

    def train(self, x, y, target_y, target_x):
        """Run one training step.

        x is in the form of [batch, time, height, width];
        y is [batch, target].
        """
        prediction, loss = self.train_func(x, y, target_y, target_x)
        return prediction, loss

    def get_initial_state(self, batch_size, shared=True):
        """Zero initial hidden/cell states for both LSTM layers, concatenated."""
        total_states = sum(self.lstm_layer_sizes)
        h0 = np.zeros((batch_size, total_states), dtype=theano.config.floatX)
        c0 = np.zeros((batch_size, total_states), dtype=theano.config.floatX)
        if shared:
            h0 = theano.shared(h0, name='h0', borrow=True)
            c0 = theano.shared(c0, name='c0', borrow=True)
        return h0, c0

    def predict(self, x, reset=True, batch_size=1):
        """Run one prediction step, carrying hidden state between calls."""
        if reset:
            self.predict_h, self.predict_c = self.get_initial_state(
                batch_size, shared=False)
        if len(x.shape) == 2:
            # Single image: add a leading batch axis.
            x = np.expand_dims(x, axis=0)
        (prediction, self.predict_h, self.predict_c,
         read, g_x, g_y, delta, sigma_sq) = self.predict_func(
            x, self.predict_h, self.predict_c)
        return prediction, [read, g_x, g_y, delta, sigma_sq]

    def get_NLL_cost(self, output, target):
        """Mean negative log-likelihood against one-hot targets."""
        NLL = -T.sum(T.log(output) * target, axis=1)
        return NLL.mean()

    def get_tracking_cost(self, g_y, g_x, target_y, target_x):
        """Mean smoothed Euclidean distance between glimpse centre and target."""
        dist_sq = ((target_y - g_y) ** 2) + ((target_x - g_x) ** 2)
        # 1e-4 keeps the sqrt gradient finite at zero distance.
        return T.sqrt(dist_sq + 1e-4).mean()

    def get_updates(self, cost, params, learning_rate):
        """Plain SGD updates (alternative to the Adam updates in compile)."""
        gradients = T.grad(cost, params)
        # BUG FIX: removed duplicated `updates = updates = [...]` assignment.
        return [(param_i, param_i - learning_rate * grad_i)
                for param_i, grad_i in zip(params, gradients)]

    def deserialize(self, hidden):
        """Split a flat hidden vector into per-layer (size, 1) column vectors."""
        result = []
        start = 0
        for size in self.lstm_layer_sizes:
            # BUG FIX: original sliced hidden[start:size] instead of
            # hidden[start:start + size], truncating/emptying later layers.
            result.append(hidden[start:start + size].reshape((size, 1)))
            start += size
        return result