class NeuralTuringMachine(Recurrent):
    """Neural Turing Machines.

    Non-obvious parameters:
    -----------------------
    shift_range: int, number of available shifts, e.g. if 3, the available
        shifts are (-1, 0, 1)
    n_slots: number of memory locations
    m_length: memory length at each location

    Known issues:
    -------------
    Theano may complain when n_slots == 1.
    """
    def __init__(self, output_dim, n_slots, m_length, shift_range=3,
                 inner_rnn='lstm',
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots
        self.m_length = m_length
        self.shift_range = shift_range
        self.init = init
        self.inner_init = inner_init
        self.inner_rnn = inner_rnn

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(NeuralTuringMachine, self).__init__(**kwargs)

    def build(self):
        input_leng, input_dim = self.input_shape[1:]
        self.input = T.tensor3()

        if self.inner_rnn == 'gru':
            self.rnn = GRU(
                activation='relu',
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                forget_bias_init='zero',
                inner_init=self.inner_init)
        else:
            raise ValueError('this inner_rnn is not implemented yet.')

        self.rnn.build()

        # initial memory, state, read and write vectors
        self.M = theano.shared((.001 * np.ones((1,)).astype(floatX)))
        self.init_h = K.zeros((self.output_dim,))
        self.init_wr = self.rnn.init((self.n_slots,))
        self.init_ww = self.rnn.init((self.n_slots,))

        # write
        self.W_e = self.rnn.init((self.output_dim, self.m_length))  # erase
        self.b_e = K.zeros((self.m_length,))
        self.W_a = self.rnn.init((self.output_dim, self.m_length))  # add
        self.b_a = K.zeros((self.m_length,))

        # get_w parameters for reading operation
        self.W_k_read = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_read = self.rnn.init((self.m_length,))
        self.W_c_read = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_read = K.zeros((3,))
        self.W_s_read = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_read = K.zeros((self.shift_range,))

        # get_w parameters for writing operation
        self.W_k_write = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_write = self.rnn.init((self.m_length,))
        self.W_c_write = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_write = K.zeros((3,))
        self.W_s_write = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_write = K.zeros((self.shift_range,))

        self.C = _circulant(self.n_slots, self.shift_range)

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_e, self.b_e,
            self.W_a, self.b_a,
            self.W_k_read, self.b_k_read,
            self.W_c_read, self.b_c_read,
            self.W_s_read, self.b_s_read,
            self.W_k_write, self.b_k_write,
            self.W_s_write, self.b_s_write,
            self.W_c_write, self.b_c_write,
            self.M,
            self.init_h, self.init_wr, self.init_ww]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.output_dim,))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def _read(self, w, M):
        return (w[:, :, None] * M).sum(axis=1)

    def _write(self, w, e, a, M):
        Mtilda = M * (1 - w[:, :, None] * e[:, None, :])
        Mout = Mtilda + w[:, :, None] * a[:, None, :]
        return Mout

    def _get_content_w(self, beta, k, M):
        num = beta[:, None] * _cosine_distance(M, k)
        return _softmax(num)

    def _get_location_w(self, g, s, C, gamma, wc, w_tm1):
        wg = g[:, None] * wc + (1 - g[:, None]) * w_tm1
        Cs = (C[None, :, :, :] * wg[:, None, None, :]).sum(axis=3)
        wtilda = (Cs * s[:, :, None]).sum(axis=1)
        wout = _renorm(wtilda ** gamma[:, None])
        return wout

    def _get_controller_output(self, h, W_k, b_k, W_c, b_c, W_s, b_s):
        k = T.tanh(T.dot(h, W_k) + b_k)  # + 1e-6
        c = T.dot(h, W_c) + b_c
        beta = T.nnet.relu(c[:, 0]) + 1e-4
        g = T.nnet.sigmoid(c[:, 1])
        gamma = T.nnet.relu(c[:, 2]) + 1.0001
        s = T.nnet.softmax(T.dot(h, W_s) + b_s)
        return k, beta, g, gamma, s

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_M = self.M.dimshuffle(0, 'x', 'x').repeat(
            batch_size, axis=0).repeat(self.n_slots, axis=1).repeat(
            self.m_length, axis=2)
        init_M = init_M.flatten(ndim=2)

        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_wr = self.init_wr.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_ww = self.init_ww.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_M, T.nnet.softmax(init_wr), T.nnet.softmax(init_ww),
                    init_h, init_c]
        else:
            return [init_M, T.nnet.softmax(init_wr), T.nnet.softmax(init_ww),
                    init_h]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def step(self, x, states):
        M_tm1, wr_tm1, ww_tm1 = states[:3]
        # reshape the flattened memory back to (batch, n_slots, m_length)
        M_tm1 = M_tm1.reshape((x.shape[0], self.n_slots, self.m_length))

        # read
        h_tm1 = states[3:]
        k_read, beta_read, g_read, gamma_read, s_read = self._get_controller_output(
            h_tm1[0], self.W_k_read, self.b_k_read, self.W_c_read,
            self.b_c_read, self.W_s_read, self.b_s_read)
        wc_read = self._get_content_w(beta_read, k_read, M_tm1)
        wr_t = self._get_location_w(g_read, s_read, self.C, gamma_read,
                                    wc_read, wr_tm1)
        M_read = self._read(wr_t, M_tm1)

        # update controller
        h_t = _update_controller(self, x, h_tm1, M_read)

        # write
        k_write, beta_write, g_write, gamma_write, s_write = self._get_controller_output(
            h_t[0], self.W_k_write, self.b_k_write, self.W_c_write,
            self.b_c_write, self.W_s_write, self.b_s_write)
        wc_write = self._get_content_w(beta_write, k_write, M_tm1)
        ww_t = self._get_location_w(g_write, s_write, self.C, gamma_write,
                                    wc_write, ww_tm1)
        e = T.nnet.sigmoid(T.dot(h_t[0], self.W_e) + self.b_e)
        a = T.tanh(T.dot(h_t[0], self.W_a) + self.b_a)
        M_t = self._write(ww_t, e, a, M_tm1)
        M_t = M_t.flatten(ndim=2)

        return h_t[0], [M_t, wr_t, ww_t] + h_t
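
# A minimal usage sketch, not from the original source: it assumes the Keras
# 0.x-style Sequential API this layer is written against, and the dimensions
# are illustrative (e.g. 8-bit input vectors, as in the copy task of Graves
# et al. 2014). Wrapped in a function so importing this module stays
# side-effect free.
def _example_ntm_usage():
    from keras.models import Sequential
    from keras.layers.core import Dense

    model = Sequential()
    model.add(NeuralTuringMachine(output_dim=64, n_slots=50, m_length=20,
                                  shift_range=3, inner_rnn='lstm',
                                  input_dim=8, input_length=None))
    # map the final controller state back to an 8-bit vector
    model.add(Dense(8, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model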
class DRAW(Recurrent):
    '''DRAW

    Parameters:
    ===========
    h_dim : encoder/decoder dimension
    z_dim : random sample dimension (reparametrization trick output)
    input_shape : (n_channels, rows, cols)
    N_enc : size of the encoder's filter bank (MNIST default: 2)
    N_dec : size of the decoder's filter bank (MNIST default: 5)
    n_steps : number of sampling steps (or how long it takes to draw,
        default 64)
    inner_rnn : str with rnn type ('gru' default)
    truncate_gradient : int (-1 default)
    return_sequences : bool (False default)
    '''
    theano_rng = theano_rng()

    def __init__(self, input_shape, h_dim, z_dim, N_enc=2, N_dec=5,
                 n_steps=64, inner_rnn='gru', truncate_gradient=-1,
                 return_sequences=False, canvas_activation=T.nnet.sigmoid,
                 init='glorot_uniform', inner_init='orthogonal'):
        self.input = T.tensor4()
        self.h_dim = h_dim  # this is 256 for MNIST
        self.z_dim = z_dim  # this is 100 for MNIST
        self.input_shape = input_shape
        self.N_enc = N_enc
        self.N_dec = N_dec
        self.truncate_gradient = truncate_gradient
        self.return_sequences = return_sequences
        self.n_steps = n_steps
        self.canvas_activation = canvas_activation
        self.height = input_shape[1]
        self.width = input_shape[2]

        self.inner_rnn = inner_rnn
        if inner_rnn == 'gru':
            self.enc = GRU(input_dim=self.input_shape[0]*2*self.N_enc**2 + h_dim,
                           output_dim=h_dim, init=init, inner_init=inner_init)
            self.dec = GRU(input_dim=z_dim, output_dim=h_dim, init=init,
                           inner_init=inner_init)
        elif inner_rnn == 'lstm':
            self.enc = LSTM(input_dim=self.input_shape[0]*2*self.N_enc**2 + h_dim,
                            output_dim=h_dim, init=init, inner_init=inner_init)
            self.dec = LSTM(input_dim=z_dim, output_dim=h_dim, init=init,
                            inner_init=inner_init)
        else:
            raise ValueError('This type of inner_rnn is not supported')

        self.init_canvas = shared_zeros(input_shape)  # canvas and hidden state
        self.init_h_enc = shared_zeros((h_dim,))      # initial values
        self.init_h_dec = shared_zeros((h_dim,))      # should be trained

        self.L_enc = self.enc.init((h_dim, 5))  # "read" attention parameters (eq. 21)
        self.L_dec = self.enc.init((h_dim, 5))  # "write" attention parameters (eq. 28)
        self.b_enc = shared_zeros((5,))  # "read" attention biases (eq. 21)
        self.b_dec = shared_zeros((5,))  # "write" attention biases (eq. 28)
        self.W_patch = self.enc.init((h_dim, self.N_dec**2 * self.input_shape[0]))
        self.b_patch = shared_zeros((self.N_dec**2 * self.input_shape[0],))
        self.W_mean = self.enc.init((h_dim, z_dim))
        self.W_sigma = self.enc.init((h_dim, z_dim))
        self.b_mean = shared_zeros((z_dim,))
        self.b_sigma = shared_zeros((z_dim,))
        self.params = self.enc.params + self.dec.params + [
            self.L_enc, self.L_dec, self.b_enc, self.b_dec,
            self.W_patch, self.b_patch,
            self.W_mean, self.W_sigma, self.b_mean, self.b_sigma]
        # self.init_canvas, self.init_h_enc, self.init_h_dec]

    def init_updates(self):
        self.get_output(train=True)  # populate regularizers list

    def _get_attention_params(self, h, L, b, N):
        p = T.dot(h, L) + b
        gx = self.width * (p[:, 0] + 1) / 2.
        gy = self.height * (p[:, 1] + 1) / 2.
        sigma2 = T.exp(p[:, 2])
        delta = T.exp(p[:, 3]) * (max(self.width, self.height) - 1) / (N - 1.)
        gamma = T.exp(p[:, 4])
        return gx, gy, sigma2, delta, gamma

    def _get_filterbank(self, gx, gy, sigma2, delta, N):
        small = 1e-4
        i = T.arange(N)
        a = T.arange(self.width)
        b = T.arange(self.height)

        mx = gx[:, None] + delta[:, None] * (i - N/2. - .5)
        my = gy[:, None] + delta[:, None] * (i - N/2. - .5)

        Fx = T.exp(-(a - mx[:, :, None])**2 / 2. / sigma2[:, None, None])
        Fx /= (Fx.sum(axis=-1)[:, :, None] + small)
        Fy = T.exp(-(b - my[:, :, None])**2 / 2. / sigma2[:, None, None])
        Fy /= (Fy.sum(axis=-1)[:, :, None] + small)
        return Fx, Fy

    def _read(self, x, gamma, Fx, Fy):
        Fyx = (Fy[:, None, :, :, None] * x[:, :, None, :, :]).sum(axis=3)
        FxT = Fx.dimshuffle(0, 2, 1)
        FyxFx = (Fyx[:, :, :, :, None] * FxT[:, None, None, :, :]).sum(axis=3)
        return gamma[:, None, None, None] * FyxFx

    def _get_patch(self, h):
        write_patch = T.dot(h, self.W_patch) + self.b_patch
        write_patch = write_patch.reshape((h.shape[0], self.input_shape[0],
                                           self.N_dec, self.N_dec))
        return write_patch

    def _write(self, write_patch, gamma, Fx, Fy):
        Fyx = (Fy[:, None, :, :, None] * write_patch[:, :, :, None, :]).sum(axis=2)
        FyxFx = (Fyx[:, :, :, :, None] * Fx[:, None, None, :, :]).sum(axis=3)
        return FyxFx / gamma[:, None, None, None]

    def _get_sample(self, h, eps):
        mean = T.dot(h, self.W_mean) + self.b_mean
        # eps = self.theano_rng.normal(avg=0., std=1., size=mean.shape)
        logsigma = T.dot(h, self.W_sigma) + self.b_sigma
        sigma = T.exp(logsigma)
        if self._train_state:
            sample = mean + eps * sigma
        else:
            sample = mean + 0 * eps * sigma
        kl = -.5 - logsigma + .5 * (mean**2 + sigma**2)
        # equivalently: kl = .5 * (mean**2 + sigma**2 - 2*logsigma - 1)
        return sample, kl.sum(axis=-1)

    def _get_rnn_input(self, x, rnn):
        if self.inner_rnn == 'gru':
            x_z = T.dot(x, rnn.W_z) + rnn.b_z
            x_r = T.dot(x, rnn.W_r) + rnn.b_r
            x_h = T.dot(x, rnn.W_h) + rnn.b_h
            return x_z, x_r, x_h
        elif self.inner_rnn == 'lstm':
            xi = T.dot(x, rnn.W_i) + rnn.b_i
            xf = T.dot(x, rnn.W_f) + rnn.b_f
            xc = T.dot(x, rnn.W_c) + rnn.b_c
            xo = T.dot(x, rnn.W_o) + rnn.b_o
            return xi, xf, xc, xo

    def _get_rnn_state(self, rnn, *args):
        mask = 1.  # no masking
        if self.inner_rnn == 'gru':
            x_z, x_r, x_h, h_tm1 = args
            h = rnn._step(x_z, x_r, x_h, mask, h_tm1,
                          rnn.U_z, rnn.U_r, rnn.U_h)
            return h
        elif self.inner_rnn == 'lstm':
            xi, xf, xc, xo, h_tm1, cell_tm1 = args
            h, cell = rnn._step(xi, xf, xo, xc, mask, h_tm1, cell_tm1,
                                rnn.U_i, rnn.U_f, rnn.U_o, rnn.U_c)
            return h, cell

    def _get_initial_states(self, X):
        if self.inner_rnn == 'gru':
            batch_size = X.shape[0]
            canvas = self.init_canvas.dimshuffle('x', 0, 1, 2).repeat(batch_size, axis=0)
            init_enc = self.init_h_enc.dimshuffle('x', 0).repeat(batch_size, axis=0)
            init_dec = self.init_h_dec.dimshuffle('x', 0).repeat(batch_size, axis=0)
        else:
            canvas = alloc_zeros_matrix(*X.shape)  # + self.init_canvas[None, :, :, :]
            init_enc = alloc_zeros_matrix(X.shape[0], self.h_dim)  # + self.init_h_enc[None, :]
            init_dec = alloc_zeros_matrix(X.shape[0], self.h_dim)  # + self.init_h_dec[None, :]
        return canvas, init_enc, init_dec

    def _step(self, eps, canvas, h_enc, h_dec, x, *args):
        x_hat = x - self.canvas_activation(canvas)
        gx, gy, sigma2, delta, gamma = self._get_attention_params(
            h_dec, self.L_enc, self.b_enc, self.N_enc)
        Fx, Fy = self._get_filterbank(gx, gy, sigma2, delta, self.N_enc)
        read_x = self._read(x, gamma, Fx, Fy).flatten(ndim=2)
        read_x_hat = self._read(x_hat, gamma, Fx, Fy).flatten(ndim=2)

        enc_input = T.concatenate([read_x, read_x_hat, h_dec], axis=-1)
        x_enc_z, x_enc_r, x_enc_h = self._get_rnn_input(enc_input, self.enc)
        new_h_enc = self._get_rnn_state(self.enc, x_enc_z, x_enc_r, x_enc_h,
                                        h_enc)
        sample, kl = self._get_sample(new_h_enc, eps)

        x_dec_z, x_dec_r, x_dec_h = self._get_rnn_input(sample, self.dec)
        new_h_dec = self._get_rnn_state(self.dec, x_dec_z, x_dec_r, x_dec_h,
                                        h_dec)

        gx_w, gy_w, sigma2_w, delta_w, gamma_w = self._get_attention_params(
            new_h_dec, self.L_dec, self.b_dec, self.N_dec)
        Fx_w, Fy_w = self._get_filterbank(gx_w, gy_w, sigma2_w, delta_w,
                                          self.N_dec)
        write_patch = self._get_patch(new_h_dec)
        new_canvas = canvas + self._write(write_patch, gamma_w, Fx_w, Fy_w)
        return new_canvas, new_h_enc, new_h_dec, kl

    def _step_lstm(self, eps, canvas, h_enc, cell_enc, h_dec, cell_dec, x,
                   *args):
        x_hat = x - self.canvas_activation(canvas)
        gx, gy, sigma2, delta, gamma = self._get_attention_params(
            h_dec, self.L_enc, self.b_enc, self.N_enc)
        Fx, Fy = self._get_filterbank(gx, gy, sigma2, delta, self.N_enc)
        read_x = self._read(x, gamma, Fx, Fy).flatten(ndim=2)
        read_x_hat = self._read(x_hat, gamma, Fx, Fy).flatten(ndim=2)

        enc_input = T.concatenate([read_x, read_x_hat, h_dec.flatten(ndim=2)],
                                  axis=1)
        x_enc_i, x_enc_f, x_enc_c, x_enc_o = self._get_rnn_input(enc_input,
                                                                 self.enc)
        new_h_enc, new_cell_enc = self._get_rnn_state(
            self.enc, x_enc_i, x_enc_f, x_enc_c, x_enc_o, h_enc, cell_enc)
        sample, kl = self._get_sample(new_h_enc, eps)

        x_dec_i, x_dec_f, x_dec_c, x_dec_o = self._get_rnn_input(sample,
                                                                 self.dec)
        new_h_dec, new_cell_dec = self._get_rnn_state(
            self.dec, x_dec_i, x_dec_f, x_dec_c, x_dec_o, h_dec, cell_dec)

        gx_w, gy_w, sigma2_w, delta_w, gamma_w = self._get_attention_params(
            new_h_dec, self.L_dec, self.b_dec, self.N_dec)
        Fx_w, Fy_w = self._get_filterbank(gx_w, gy_w, sigma2_w, delta_w,
                                          self.N_dec)
        write_patch = self._get_patch(new_h_dec)
        new_canvas = canvas + self._write(write_patch, gamma_w, Fx_w, Fy_w)
        return new_canvas, new_h_enc, new_cell_enc, new_h_dec, new_cell_dec, kl

    def get_output(self, train=False):
        self._train_state = train
        # two inputs: the image X and the per-step noise eps
        X, eps = self.get_input(train).values()
        eps = eps.dimshuffle(1, 0, 2)
        canvas, init_enc, init_dec = self._get_initial_states(X)

        if self.inner_rnn == 'gru':
            outputs, updates = scan(self._step,
                                    sequences=eps,
                                    outputs_info=[canvas, init_enc, init_dec,
                                                  None],
                                    non_sequences=[X, ] + self.params,
                                    truncate_gradient=self.truncate_gradient)
        elif self.inner_rnn == 'lstm':
            outputs, updates = scan(self._step_lstm,
                                    sequences=eps,
                                    outputs_info=[0*canvas, 0*init_enc,
                                                  0*init_enc, 0*init_dec,
                                                  0*init_dec, None],
                                    non_sequences=[X, ] + self.params,
                                    truncate_gradient=self.truncate_gradient)

        kl = outputs[-1].sum(axis=0).mean()
        if train:
            self.regularizers = [SimpleCost(kl), ]
        if self.return_sequences:
            return [outputs[0].dimshuffle(1, 0, 2, 3, 4), kl]
        else:
            return [outputs[0][-1], kl]
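
# Not from the original source: a small numpy sanity check that the KL term
# used in `_get_sample` matches the closed form for
# KL(N(mean, sigma^2) || N(0, 1)).
def _check_draw_kl():
    import numpy as np
    mean = np.random.randn(4)
    logsigma = np.random.randn(4)
    sigma = np.exp(logsigma)
    kl_code = -.5 - logsigma + .5 * (mean**2 + sigma**2)      # as in `_get_sample`
    kl_closed = .5 * (mean**2 + sigma**2 - 2*logsigma - 1)    # textbook form
    assert np.allclose(kl_code, kl_closed)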
class NeuralTuringMachine(Recurrent):
    """Neural Turing Machines.

    Parameters:
    -----------
    shift_range: int, number of available shifts, e.g. if 3, the available
        shifts are (-1, 0, 1)
    n_slots: number of memory locations
    m_length: memory length at each location
    inner_rnn: str, supported values are 'gru' and 'lstm'
    output_dim: hidden state size (RNN controller output_dim)

    Known issues and TODO:
    ----------------------
    Theano may complain when n_slots == 1.
    Add multiple reading and writing heads.
    """
    def __init__(self, output_dim, n_slots, m_length, shift_range=3,
                 inner_rnn='gru', truncate_gradient=-1, return_sequences=False,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots
        self.m_length = m_length
        self.shift_range = shift_range
        self.init = init
        self.inner_init = inner_init
        self.inner_rnn = inner_rnn
        self.return_sequences = return_sequences
        self.truncate_gradient = truncate_gradient

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(NeuralTuringMachine, self).__init__(**kwargs)

    def build(self):
        input_leng, input_dim = self.input_shape[1:]
        self.input = T.tensor3()

        if self.inner_rnn == 'gru':
            self.rnn = GRU(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        else:
            raise ValueError('this inner_rnn is not implemented yet.')

        self.rnn.build()

        # initial memory, state, read and write vectors
        self.M = theano.shared((.001 * np.ones((1,)).astype(floatX)))
        self.init_h = shared_zeros((self.output_dim,))
        self.init_wr = self.rnn.init((self.n_slots,))
        self.init_ww = self.rnn.init((self.n_slots,))

        # write
        self.W_e = self.rnn.init((self.output_dim, self.m_length))  # erase
        self.b_e = shared_zeros((self.m_length,))
        self.W_a = self.rnn.init((self.output_dim, self.m_length))  # add
        self.b_a = shared_zeros((self.m_length,))

        # get_w parameters for reading operation
        self.W_k_read = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_read = self.rnn.init((self.m_length,))
        self.W_c_read = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9 in Graves et al. 2014
        self.b_c_read = shared_zeros((3,))
        self.W_s_read = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_read = shared_zeros((self.shift_range,))

        # get_w parameters for writing operation
        self.W_k_write = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_write = self.rnn.init((self.m_length,))
        self.W_c_write = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_write = shared_zeros((3,))
        self.W_s_write = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_write = shared_zeros((self.shift_range,))

        self.C = _circulant(self.n_slots, self.shift_range)

        self.params = self.rnn.params + [
            self.W_e, self.b_e,
            self.W_a, self.b_a,
            self.W_k_read, self.b_k_read,
            self.W_c_read, self.b_c_read,
            self.W_s_read, self.b_s_read,
            self.W_k_write, self.b_k_write,
            self.W_s_write, self.b_s_write,
            self.W_c_write, self.b_c_write,
            self.M,
            self.init_h, self.init_wr, self.init_ww]

        if self.inner_rnn == 'lstm':
            self.init_c = shared_zeros((self.output_dim,))
            self.params = self.params + [self.init_c, ]

    def _read(self, w, M):
        return (w[:, :, None] * M).sum(axis=1)

    def _write(self, w, e, a, M, mask):
        Mtilda = M * (1 - w[:, :, None] * e[:, None, :])
        Mout = Mtilda + w[:, :, None] * a[:, None, :]
        return mask[:, None, None] * Mout + (1 - mask[:, None, None]) * M

    def _get_content_w(self, beta, k, M):
        num = beta[:, None] * _cosine_distance(M, k)
        return _softmax(num)

    def _get_location_w(self, g, s, C, gamma, wc, w_tm1, mask):
        wg = g[:, None] * wc + (1 - g[:, None]) * w_tm1
        Cs = (C[None, :, :, :] * wg[:, None, None, :]).sum(axis=3)
        wtilda = (Cs * s[:, :, None]).sum(axis=1)
        wout = _renorm(wtilda ** gamma[:, None])
        return mask[:, None] * wout + (1 - mask[:, None]) * w_tm1

    def _get_controller_output(self, h, W_k, b_k, W_c, b_c, W_s, b_s):
        k = T.tanh(T.dot(h, W_k) + b_k)  # + 1e-6
        c = T.dot(h, W_c) + b_c
        beta = T.nnet.relu(c[:, 0]) + 1e-6
        g = T.nnet.sigmoid(c[:, 1])
        gamma = T.nnet.relu(c[:, 2]) + 1
        s = T.nnet.softmax(T.dot(h, W_s) + b_s)
        return k, beta, g, gamma, s

    def _get_initial_states(self, batch_size):
        init_M = self.M.dimshuffle(0, 'x', 'x').repeat(
            batch_size, axis=0).repeat(self.n_slots, axis=1).repeat(
            self.m_length, axis=2)

        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_wr = self.init_wr.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_ww = self.init_ww.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return (init_M, T.nnet.softmax(init_wr), T.nnet.softmax(init_ww),
                    init_h, init_c)
        else:
            return (init_M, T.nnet.softmax(init_wr), T.nnet.softmax(init_ww),
                    init_h)

    def _step(self, x, mask, M_tm1, wr_tm1, ww_tm1, *args):
        # read
        if self.inner_rnn == 'lstm':
            h_tm1 = args[0:2][::-1]  # (cell_tm1, h_tm1)
        else:
            h_tm1 = args[0:1]  # (h_tm1, )
        k_read, beta_read, g_read, gamma_read, s_read = self._get_controller_output(
            h_tm1[-1], self.W_k_read, self.b_k_read, self.W_c_read,
            self.b_c_read, self.W_s_read, self.b_s_read)
        wc_read = self._get_content_w(beta_read, k_read, M_tm1)
        wr_t = self._get_location_w(g_read, s_read, self.C, gamma_read,
                                    wc_read, wr_tm1, mask)
        M_read = self._read(wr_t, M_tm1)

        # update controller
        h_t = _update_controller(self, x, h_tm1, M_read, mask)

        # write
        k_write, beta_write, g_write, gamma_write, s_write = self._get_controller_output(
            h_t[-1], self.W_k_write, self.b_k_write, self.W_c_write,
            self.b_c_write, self.W_s_write, self.b_s_write)
        wc_write = self._get_content_w(beta_write, k_write, M_tm1)
        ww_t = self._get_location_w(g_write, s_write, self.C, gamma_write,
                                    wc_write, ww_tm1, mask)
        e = T.nnet.sigmoid(T.dot(h_t[-1], self.W_e) + self.b_e)
        a = T.tanh(T.dot(h_t[-1], self.W_a) + self.b_a)
        M_t = self._write(ww_t, e, a, M_tm1, mask)

        return (M_t, wr_t, ww_t) + h_t

    def get_output(self, train=False):
        outputs = self.get_full_output(train)
        if self.return_sequences:
            return outputs[-1]
        else:
            return outputs[-1][:, -1]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def get_full_output(self, train=False):
        """
        This method is for research and visualization purposes. Use it as:

        X = model.get_input()  # full model
        Y = ntm.get_output()   # this layer
        F = theano.function([X], Y, allow_input_downcast=True)
        [memory, read_address, write_address, rnn_state] = F(x)

        if inner_rnn == "lstm", use it as:
        [memory, read_address, write_address, rnn_cell, rnn_state] = F(x)
        """
        X = self.get_input(train)
        padded_mask = self.get_padded_shuffled_mask(train, X, pad=1)[:, :, 0]
        X = X.dimshuffle((1, 0, 2))

        init_states = self._get_initial_states(X.shape[1])
        outputs, updates = theano.scan(self._step,
                                       sequences=[X, padded_mask],
                                       outputs_info=init_states,
                                       non_sequences=self.params,
                                       truncate_gradient=self.truncate_gradient)

        out = [outputs[0].dimshuffle((1, 0, 2, 3)),
               outputs[1].dimshuffle((1, 0, 2)),
               outputs[2].dimshuffle((1, 0, 2)),
               outputs[3].dimshuffle((1, 0, 2))]
        if self.inner_rnn == 'lstm':
            out += [outputs[4].dimshuffle((1, 0, 2))]
        return out
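
# `_circulant` is a module-level helper that is not shown in this file. As a
# rough illustration only, here is a numpy sketch of the kind of tensor the
# addressing code above expects: one (n_slots x n_slots) rotation matrix per
# allowed shift, which `_get_location_w` blends using the softmax shift
# weights `s`. The centering convention (shifts (-1, 0, 1) for
# shift_range=3) is an assumption consistent with the docstrings.
def _circulant_sketch(n_slots, shift_range):
    import numpy as np
    shifts = np.arange(shift_range) - shift_range // 2  # e.g. [-1, 0, 1]
    eye = np.eye(n_slots)
    # shape (shift_range, n_slots, n_slots)
    return np.array([np.roll(eye, k, axis=1) for k in shifts])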
class NeuralTuringMachine(Recurrent):
    """Neural Turing Machines.

    Non-obvious parameters:
    -----------------------
    shift_range: int, number of available shifts, e.g. if 3, the available
        shifts are (-1, 0, 1)
    n_slots: number of memory locations
    m_length: memory length at each location

    Known issues:
    -------------
    Theano may complain when n_slots == 1.
    """
    def __init__(self, output_dim, n_slots, m_length, shift_range=3,
                 inner_rnn='gru',
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots
        self.m_length = m_length
        self.shift_range = shift_range
        self.init = init
        self.inner_init = inner_init
        self.inner_rnn = inner_rnn

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(NeuralTuringMachine, self).__init__(**kwargs)

    def build(self):
        input_leng, input_dim = self.input_shape[1:]
        self.input = T.tensor3()

        if self.inner_rnn == 'gru':
            self.rnn = GRU(
                activation='relu',
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                forget_bias_init='zero',
                inner_init=self.inner_init)
        else:
            raise ValueError('this inner_rnn is not implemented yet.')

        self.rnn.build()

        # initial memory, state, read and write vectors
        self.M = theano.shared((.001 * np.ones((1,)).astype(floatX)))
        self.init_h = K.zeros((self.output_dim,))
        self.init_wr = self.rnn.init((self.n_slots,))
        self.init_ww = self.rnn.init((self.n_slots,))

        # write
        self.W_e = self.rnn.init((self.output_dim, self.m_length))  # erase
        self.b_e = K.zeros((self.m_length,))
        self.W_a = self.rnn.init((self.output_dim, self.m_length))  # add
        self.b_a = K.zeros((self.m_length,))

        # get_w parameters for reading operation
        self.W_k_read = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_read = self.rnn.init((self.m_length,))
        self.W_c_read = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_read = K.zeros((3,))
        self.W_s_read = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_read = K.zeros((self.shift_range,))

        # get_w parameters for writing operation
        self.W_k_write = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_write = self.rnn.init((self.m_length,))
        self.W_c_write = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_write = K.zeros((3,))
        self.W_s_write = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_write = K.zeros((self.shift_range,))

        self.C = _circulant(self.n_slots, self.shift_range)

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_e, self.b_e,
            self.W_a, self.b_a,
            self.W_k_read, self.b_k_read,
            self.W_c_read, self.b_c_read,
            self.W_s_read, self.b_s_read,
            self.W_k_write, self.b_k_write,
            self.W_s_write, self.b_s_write,
            self.W_c_write, self.b_c_write,
            self.M,
            self.init_h, self.init_wr, self.init_ww]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.output_dim,))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def _read(self, w, M):
        return (w[:, :, None] * M).sum(axis=1)

    def _write(self, w, e, a, M):
        Mtilda = M * (1 - w[:, :, None] * e[:, None, :])
        Mout = Mtilda + w[:, :, None] * a[:, None, :]
        return Mout

    def _get_content_w(self, beta, k, M):
        num = beta[:, None] * _cosine_distance(M, k)
        return _softmax(num)

    def _get_location_w(self, g, s, C, gamma, wc, w_tm1):
        wg = g[:, None] * wc + (1 - g[:, None]) * w_tm1
        Cs = (C[None, :, :, :] * wg[:, None, None, :]).sum(axis=3)
        wtilda = (Cs * s[:, :, None]).sum(axis=1)
        wout = _renorm(wtilda ** gamma[:, None])
        return wout

    def _get_controller_output(self, h, W_k, b_k, W_c, b_c, W_s, b_s):
        k = T.tanh(T.dot(h, W_k) + b_k)  # + 1e-6
        c = T.dot(h, W_c) + b_c
        beta = T.nnet.relu(c[:, 0]) + 1e-4
        g = T.nnet.sigmoid(c[:, 1])
        gamma = T.nnet.relu(c[:, 2]) + 1.0001
        s = T.nnet.softmax(T.dot(h, W_s) + b_s)
        return k, beta, g, gamma, s

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_M = self.M.dimshuffle(0, 'x', 'x').repeat(
            batch_size, axis=0).repeat(self.n_slots, axis=1).repeat(
            self.m_length, axis=2)
        init_M = init_M.flatten(ndim=2)

        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_wr = self.init_wr.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_ww = self.init_ww.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_M, T.nnet.softmax(init_wr), T.nnet.softmax(init_ww),
                    init_h, init_c]
        else:
            return [init_M, T.nnet.softmax(init_wr), T.nnet.softmax(init_ww),
                    init_h]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def get_full_output(self, train=False):
        """
        This method is for research and visualization purposes. Use it as:

        X = model.get_input()  # full model
        Y = ntm.get_output()   # this layer
        F = theano.function([X], Y, allow_input_downcast=True)
        [memory, read_address, write_address, rnn_state] = F(x)

        if inner_rnn == "lstm", use it as:
        [memory, read_address, write_address, rnn_cell, rnn_state] = F(x)
        """
        # input shape: (nb_samples, time (padded with zeros), input_dim)
        X = self.get_input(train)
        assert K.ndim(X) == 3
        if K._BACKEND == 'tensorflow':
            if not self.input_shape[1]:
                raise Exception('When using TensorFlow, you should define '
                                'explicitly the number of timesteps of '
                                'your sequences. Make sure the first layer '
                                'has a "batch_input_shape" argument '
                                'including the samples axis.')

        mask = self.get_output_mask(train)
        if mask is not None:
            # apply mask
            X *= K.cast(K.expand_dims(mask), X.dtype)
            masking = True
        else:
            masking = False

        if self.stateful:
            initial_states = self.states
        else:
            initial_states = self.get_initial_states(X)

        states = rnn_states(self.step, X, initial_states,
                            go_backwards=self.go_backwards,
                            masking=masking)
        return states

    def step(self, x, states):
        M_tm1, wr_tm1, ww_tm1 = states[:3]
        # reshape the flattened memory back to (batch, n_slots, m_length)
        M_tm1 = M_tm1.reshape((x.shape[0], self.n_slots, self.m_length))

        # read
        h_tm1 = states[3:]
        k_read, beta_read, g_read, gamma_read, s_read = self._get_controller_output(
            h_tm1[0], self.W_k_read, self.b_k_read, self.W_c_read,
            self.b_c_read, self.W_s_read, self.b_s_read)
        wc_read = self._get_content_w(beta_read, k_read, M_tm1)
        wr_t = self._get_location_w(g_read, s_read, self.C, gamma_read,
                                    wc_read, wr_tm1)
        M_read = self._read(wr_t, M_tm1)

        # update controller
        h_t = _update_controller(self, x, h_tm1, M_read)

        # write
        k_write, beta_write, g_write, gamma_write, s_write = self._get_controller_output(
            h_t[0], self.W_k_write, self.b_k_write, self.W_c_write,
            self.b_c_write, self.W_s_write, self.b_s_write)
        wc_write = self._get_content_w(beta_write, k_write, M_tm1)
        ww_t = self._get_location_w(g_write, s_write, self.C, gamma_write,
                                    wc_write, ww_tm1)
        e = T.nnet.sigmoid(T.dot(h_t[0], self.W_e) + self.b_e)
        a = T.tanh(T.dot(h_t[0], self.W_a) + self.b_a)
        M_t = self._write(ww_t, e, a, M_tm1)
        M_t = M_t.flatten(ndim=2)

        return h_t[0], [M_t, wr_t, ww_t] + h_t
class Stack(Recurrent):
    """Stack and queue network.

    output_dim = output dimension
    n_slots = number of memory slots
    m_length = dimension of the memory
    rnn_size = output length of the memory controller
    inner_rnn = "lstm"; only lstm is supported
    stack = True to create a neural stack or False to create a neural queue

    From "Learning to Transduce with Unbounded Memory"
    [[http://arxiv.org/pdf/1506.02516.pdf]]
    """
    def __init__(self, output_dim, n_slots, m_length, inner_rnn='lstm',
                 rnn_size=64, stack=True,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots + 1  # because we start at time 1
        self.m_length = m_length
        self.init = init
        self.inner_init = inner_init
        if inner_rnn != 'lstm':
            raise ValueError('Only lstm is supported')
        self.inner_rnn = inner_rnn
        self.rnn_size = rnn_size
        self.stack = stack

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(Stack, self).__init__(**kwargs)

    def build(self):
        input_leng, input_dim = self.input_shape[1:]
        self.input = T.tensor3()

        if self.inner_rnn == 'gru':
            self.rnn = GRU(activation='relu',
                           input_dim=input_dim + self.m_length,
                           input_length=input_leng,
                           output_dim=self.output_dim, init=self.init,
                           inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(input_dim=input_dim + self.m_length,
                            input_length=input_leng,
                            output_dim=self.rnn_size, init=self.init,
                            forget_bias_init='zero',
                            inner_init=self.inner_init)
        else:
            raise ValueError('this inner_rnn is not implemented yet.')

        self.rnn.build()

        self.init_h = K.zeros((self.rnn_size,))

        self.W_d = self.rnn.init((self.rnn_size, 1))
        self.W_u = self.rnn.init((self.rnn_size, 1))
        self.W_v = self.rnn.init((self.rnn_size, self.m_length))
        self.W_o = self.rnn.init((self.rnn_size, self.output_dim))

        self.b_d = K.zeros((1,), name="b_d")
        self.b_u = K.zeros((1,), name="b_u")
        self.b_v = K.zeros((self.m_length,))
        self.b_o = K.zeros((self.output_dim,))

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_d, self.b_d,
            self.W_v, self.b_v,
            self.W_u, self.b_u,
            self.W_o, self.b_o,
            self.init_h]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.rnn_size,))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_r = K.zeros((self.m_length,)).dimshuffle('x', 0).repeat(
            batch_size, axis=0)
        init_V = K.zeros((self.n_slots, self.m_length)).dimshuffle(
            'x', 0, 1).repeat(batch_size, axis=0)
        init_S = K.zeros((self.n_slots,)).dimshuffle('x', 0).repeat(
            batch_size, axis=0)
        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        itime = K.zeros((1,), dtype=np.int32)

        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_r, init_V, init_S, itime, init_h, init_c]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def step(self, x, states):
        r_tm1, V_tm1, s_tm1, time = states[:4]
        h_tm1 = states[4:]

        op_t, h_t = _update_controller(self,
                                       T.concatenate([x, r_tm1], axis=-1),
                                       h_tm1)

        # following the paper's notation: d = push strength, u = pop strength
        d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)
        u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)
        v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)  # value to push
        o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)  # layer output

        time = time + 1
        V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                             u_t[:, 0], v_t, time[0],
                                             stack=self.stack)
        return o_t, [r_t, V_t, s_t, time] + h_t
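
# A minimal usage sketch, not from the original source: it assumes the Keras
# 0.x-style Sequential API this layer targets; the shapes are illustrative.
def _example_stack_usage():
    from keras.models import Sequential

    model = Sequential()
    model.add(Stack(output_dim=8, n_slots=32, m_length=16, rnn_size=64,
                    stack=True,  # False builds a neural queue instead
                    input_dim=8, input_length=10))
    model.compile(loss='mse', optimizer='rmsprop')
    return model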
class Stack(Recurrent):
    """Stack and queue network.

    output_dim = output dimension
    n_slots = number of memory slots
    m_length = dimension of the memory
    rnn_size = output length of the memory controller
    inner_rnn = "lstm"; only lstm is supported
    stack = True to create a neural stack or False to create a neural queue

    From "Learning to Transduce with Unbounded Memory"
    [[http://arxiv.org/pdf/1506.02516.pdf]]
    """
    def __init__(self, output_dim, n_slots, m_length, inner_rnn='lstm',
                 rnn_size=64, stack=True,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots + 1  # because we start at time 1
        self.m_length = m_length
        self.init = init
        self.inner_init = inner_init
        if inner_rnn != 'lstm':
            raise ValueError('Only lstm is supported')
        self.inner_rnn = inner_rnn
        self.rnn_size = rnn_size
        self.stack = stack

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(Stack, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        input_leng, input_dim = input_shape[1:]

        if self.inner_rnn == 'gru':
            self.rnn = GRU(activation='relu',
                           input_dim=input_dim + self.m_length,
                           input_length=input_leng,
                           output_dim=self.output_dim, init=self.init,
                           inner_init=self.inner_init,
                           consume_less='gpu',
                           name="{}_inner_rnn".format(self.name))
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(input_dim=input_dim + self.m_length,
                            input_length=input_leng,
                            output_dim=self.rnn_size, init=self.init,
                            forget_bias_init='zero',
                            inner_init=self.inner_init,
                            consume_less='gpu',
                            name="{}_inner_rnn".format(self.name))
        else:
            raise ValueError('this inner_rnn is not implemented yet.')

        inner_shape = list(input_shape)
        inner_shape[-1] = input_dim + self.m_length
        self.rnn.build(inner_shape)

        self.init_h = K.zeros((self.rnn_size,),
                              name="{}_init_h".format(self.name))

        self.W_d = self.rnn.init((self.rnn_size, 1),
                                 name="{}_W_d".format(self.name))
        self.W_u = self.rnn.init((self.rnn_size, 1),
                                 name="{}_W_u".format(self.name))
        self.W_v = self.rnn.init((self.rnn_size, self.m_length),
                                 name="{}_W_v".format(self.name))
        self.W_o = self.rnn.init((self.rnn_size, self.output_dim),
                                 name="{}_W_o".format(self.name))

        self.b_d = K.zeros((1,), name="{}_b_d".format(self.name))
        self.b_u = K.zeros((1,), name="{}_b_u".format(self.name))
        self.b_v = K.zeros((self.m_length,), name="{}_b_v".format(self.name))
        self.b_o = K.zeros((self.output_dim,), name="{}_b_o".format(self.name))

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_d, self.b_d,
            self.W_v, self.b_v,
            self.W_u, self.b_u,
            self.W_o, self.b_o,
            self.init_h]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.rnn_size,),
                                  name="{}_init_c".format(self.name))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_r = K.zeros((self.m_length,)).dimshuffle('x', 0).repeat(
            batch_size, axis=0)
        init_V = K.zeros((self.n_slots, self.m_length)).dimshuffle(
            'x', 0, 1).repeat(batch_size, axis=0)
        init_S = K.zeros((self.n_slots,)).dimshuffle('x', 0).repeat(
            batch_size, axis=0)
        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        itime = K.zeros((1,), dtype=np.int32)

        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_r, init_V, init_S, itime, init_h, init_c]

    def get_output_shape_for(self, input_shape):
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def step(self, x, states):
        r_tm1, V_tm1, s_tm1, time = states[:4]
        h_tm1 = states[4:]

        op_t, h_t = _update_controller(self,
                                       T.concatenate([x, r_tm1], axis=-1),
                                       h_tm1)

        d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)
        u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)
        v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)
        o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)

        time = time + 1
        V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                             u_t[:, 0], v_t, time[0],
                                             stack=self.stack)
        return o_t, [r_t, V_t, s_t, time] + h_t

    def get_config(self):
        config = {'output_dim': self.output_dim,
                  'n_slots': self.n_slots - 1,  # undo the +1 applied in __init__
                  'm_length': self.m_length,
                  'init': self.init,
                  'inner_init': self.inner_init,
                  'inner_rnn': self.inner_rnn,
                  'rnn_size': self.rnn_size,
                  'stack': self.stack}
        base_config = super(Stack, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
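
# Not from the original source: with `get_config` undoing the `+ 1` offset
# that `__init__` applies to n_slots (an assumed intent), a config round trip
# reconstructs an equivalent layer instead of growing the memory by one slot
# each time.
def _check_stack_config_roundtrip(stack_layer):
    cfg = stack_layer.get_config()
    assert cfg['n_slots'] == stack_layer.n_slots - 1  # the value originally passed in
    clone = Stack(output_dim=cfg['output_dim'], n_slots=cfg['n_slots'],
                  m_length=cfg['m_length'], rnn_size=cfg['rnn_size'],
                  stack=cfg['stack'])
    assert clone.n_slots == stack_layer.n_slots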
class DRAW(Recurrent):
    '''DRAW

    Parameters:
    ===========
    output_dim : encoder/decoder dimension
    code_dim : random sample dimension (reparametrization trick output)
    input_shape : (n_channels, rows, cols)
    N_enc : size of the encoder's filter bank (MNIST default: 2)
    N_dec : size of the decoder's filter bank (MNIST default: 5)
    n_steps : number of sampling steps (or how long it takes to draw,
        default 64)
    inner_rnn : str with rnn type ('gru' default)
    truncate_gradient : int (-1 default)
    return_sequences : bool (False default)
    '''
    theano_rng = theano_rng()

    def __init__(self, output_dim, code_dim, N_enc=2, N_dec=5, n_steps=64,
                 inner_rnn='gru', truncate_gradient=-1, return_sequences=False,
                 canvas_activation=T.nnet.sigmoid, init='glorot_uniform',
                 inner_init='orthogonal', input_shape=None, **kwargs):
        self.output_dim = output_dim  # this is 256 for MNIST
        self.code_dim = code_dim      # this is 100 for MNIST
        self.N_enc = N_enc
        self.N_dec = N_dec
        self.truncate_gradient = truncate_gradient
        self.return_sequences = return_sequences
        self.n_steps = n_steps
        self.canvas_activation = canvas_activation
        self.init = init
        self.inner_init = inner_init
        self.inner_rnn = inner_rnn
        self.height = input_shape[1]
        self.width = input_shape[2]
        self._input_shape = input_shape
        super(DRAW, self).__init__(**kwargs)

    def build(self):
        self.input = T.tensor4()

        if self.inner_rnn == 'gru':
            self.enc = GRU(
                input_length=self.n_steps,
                input_dim=self._input_shape[0]*2*self.N_enc**2 + self.output_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
            self.dec = GRU(
                input_length=self.n_steps,
                input_dim=self.code_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.enc = LSTM(
                input_length=self.n_steps,
                input_dim=self._input_shape[0]*2*self.N_enc**2 + self.output_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
            self.dec = LSTM(
                input_length=self.n_steps,
                input_dim=self.code_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        else:
            raise ValueError('This type of inner_rnn is not supported')

        self.enc.build()
        self.dec.build()

        self.init_canvas = shared_zeros(self._input_shape)  # canvas and hidden state
        self.init_h_enc = shared_zeros((self.output_dim,))  # initial values
        self.init_h_dec = shared_zeros((self.output_dim,))  # should be trained

        self.L_enc = self.enc.init((self.output_dim, 5))  # "read" attention parameters (eq. 21)
        self.L_dec = self.enc.init((self.output_dim, 5))  # "write" attention parameters (eq. 28)
        self.b_enc = shared_zeros((5,))  # "read" attention biases (eq. 21)
        self.b_dec = shared_zeros((5,))  # "write" attention biases (eq. 28)
        self.W_patch = self.enc.init((self.output_dim,
                                      self.N_dec**2 * self._input_shape[0]))
        self.b_patch = shared_zeros((self.N_dec**2 * self._input_shape[0],))
        self.W_mean = self.enc.init((self.output_dim, self.code_dim))
        self.W_sigma = self.enc.init((self.output_dim, self.code_dim))
        self.b_mean = shared_zeros((self.code_dim,))
        self.b_sigma = shared_zeros((self.code_dim,))

        self.trainable_weights = self.enc.trainable_weights + \
            self.dec.trainable_weights + [
                self.L_enc, self.L_dec, self.b_enc, self.b_dec,
                self.W_patch, self.b_patch,
                self.W_mean, self.W_sigma, self.b_mean, self.b_sigma,
                self.init_canvas, self.init_h_enc, self.init_h_dec]

        if self.inner_rnn == 'lstm':
            self.init_cell_enc = shared_zeros((self.output_dim,))  # initial values
            self.init_cell_dec = shared_zeros((self.output_dim,))  # should be trained
            self.trainable_weights = self.trainable_weights + [
                self.init_cell_dec, self.init_cell_enc]

    def set_previous(self, layer, connection_map={}):
        self.previous = layer
        self.build()
        self.init_updates()

    def init_updates(self):
        self.get_output(train=True)  # populate regularizers list

    def _get_attention_params(self, h, L, b, N):
        p = T.dot(h, L) + b
        gx = self.width * (p[:, 0] + 1) / 2.
        gy = self.height * (p[:, 1] + 1) / 2.
        sigma2 = T.exp(p[:, 2])
        delta = T.exp(p[:, 3]) * (max(self.width, self.height) - 1) / (N - 1.)
        gamma = T.exp(p[:, 4])
        return gx, gy, sigma2, delta, gamma
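
# Not from the original source: a numpy sketch of the horizontal half of the
# Gaussian filterbank that `_get_attention_params` parameterizes (the layer's
# `_get_filterbank`, shown in the other DRAW variant above, builds the same
# thing symbolically for both axes). Argument values are illustrative.
def _filterbank_sketch(gx, delta, sigma2, N, width, small=1e-4):
    import numpy as np
    i = np.arange(N)
    mu = gx + delta * (i - N / 2. - .5)  # N filter centers spaced delta apart around gx
    a = np.arange(width)
    Fx = np.exp(-(a[None, :] - mu[:, None])**2 / 2. / sigma2)
    return Fx / (Fx.sum(axis=-1, keepdims=True) + small)  # row-normalize
# e.g. _filterbank_sketch(gx=14., delta=2., sigma2=1., N=5, width=28)
# returns a (5, 28) matrix with one Gaussian row per filter.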
class NeuralTuringMachine(Recurrent):
    def __init__(self, output_dim, memory_size, shift_range=3,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = memory_size[1]
        self.m_length = memory_size[0]
        self.shift_range = shift_range
        self.init = init
        self.inner_init = inner_init
        self.input_dim = input_dim
        self.input_length = input_length
        self.u = None
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(NeuralTuringMachine, self).__init__(**kwargs)

    def build(self, input_shape):
        self.u = input_shape
        input_leng, input_dim = input_shape[1:]

        self.rnn = LSTM(
            input_dim=input_dim + self.m_length,
            input_length=input_leng,
            output_dim=self.output_dim, init=self.init,
            forget_bias_init='zero',
            inner_init=self.inner_init)
        # the inner LSTM consumes the external input concatenated with the
        # memory read vector, so build it with the widened feature axis
        inner_shape = list(input_shape)
        inner_shape[-1] = input_dim + self.m_length
        self.rnn.build(tuple(inner_shape))

        self.M = theano.shared((.001 * np.ones((1,)).astype(floatX)))
        self.init_h = K.zeros((self.output_dim,))
        self.init_wr = self.rnn.init((self.n_slots,))
        self.init_ww = self.rnn.init((self.n_slots,))

        # write
        self.W_e = self.rnn.init((self.output_dim, self.m_length))  # erase
        self.b_e = K.zeros((self.m_length,))
        self.W_a = self.rnn.init((self.output_dim, self.m_length))  # add
        self.b_a = K.zeros((self.m_length,))

        # get_w parameters for the reading operation
        self.W_k_read = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_read = self.rnn.init((self.m_length,))
        self.W_c_read = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_read = K.zeros((3,))
        self.W_s_read = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_read = K.zeros((self.shift_range,))

        # get_w parameters for the writing operation
        self.W_k_write = self.rnn.init((self.output_dim, self.m_length))
        self.b_k_write = self.rnn.init((self.m_length,))
        self.W_c_write = self.rnn.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_write = K.zeros((3,))
        self.W_s_write = self.rnn.init((self.output_dim, self.shift_range))
        self.b_s_write = K.zeros((self.shift_range,))

        self.C = _circulant(self.n_slots, self.shift_range)

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_e, self.b_e,
            self.W_a, self.b_a,
            self.W_k_read, self.b_k_read,
            self.W_c_read, self.b_c_read,
            self.W_s_read, self.b_s_read,
            self.W_k_write, self.b_k_write,
            self.W_s_write, self.b_s_write,
            self.W_c_write, self.b_c_write,
            self.M, self.init_h, self.init_wr, self.init_ww]

        self.init_c = K.zeros((self.output_dim,))
        self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def _read(self, w, M):
        return (w[:, :, None] * M).sum(axis=1)

    def _write(self, w, e, a, M):
        Mtilda = M * (1 - w[:, :, None] * e[:, None, :])
        Mout = Mtilda + w[:, :, None] * a[:, None, :]
        return Mout

    def _get_content_w(self, beta, k, M):
        num = beta[:, None] * _cosine_distance(M, k)
        return _softmax(num)

    def _get_location_w(self, g, s, C, gamma, wc, w_tm1):
        wg = g[:, None] * wc + (1 - g[:, None]) * w_tm1
        Cs = (C[None, :, :, :] * wg[:, None, None, :]).sum(axis=3)
        wtilda = (Cs * s[:, :, None]).sum(axis=1)
        wout = _renorm(wtilda ** gamma[:, None])
        return wout

    def _get_controller_output(self, h, W_k, b_k, W_c, b_c, W_s, b_s):
        k = T.tanh(T.dot(h, W_k) + b_k)  # + 1e-6
        c = T.dot(h, W_c) + b_c
        beta = T.nnet.relu(c[:, 0]) + 1e-4
        g = T.nnet.sigmoid(c[:, 1])
        gamma = T.nnet.relu(c[:, 2]) + 1.0001
        s = T.nnet.softmax(T.dot(h, W_s) + b_s)
        return k, beta, g, gamma, s

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_M = self.M.dimshuffle(0, 'x', 'x').repeat(
            batch_size, axis=0).repeat(self.n_slots, axis=1).repeat(
                self.m_length, axis=2)
        init_M = init_M.flatten(ndim=2)

        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_wr = self.init_wr.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_ww = self.init_ww.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        return [init_M, T.nnet.softmax(init_wr),
                T.nnet.softmax(init_ww), init_h, init_c]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def call(self, x, mask=None):
        input_shape = self.u
        if K._BACKEND == 'tensorflow':
            if not input_shape[1]:
                raise Exception('When using TensorFlow, you should define '
                                'explicitly the number of timesteps of '
                                'your sequences.\n'
                                'If your first layer is an Embedding, '
                                'make sure to pass it an "input_length" '
                                'argument. Otherwise, make sure '
                                'the first layer has '
                                'an "input_shape" or "batch_input_shape" '
                                'argument, including the time axis. '
                                'Found input shape at layer ' + self.name +
                                ': ' + str(input_shape))
        if self.stateful:
            initial_states = self.states
        else:
            initial_states = self.get_initial_states(x)
        constants = self.get_constants(x)
        preprocessed_input = self.preprocess_input(x)

        last_output, outputs, states = K.rnn(self.step, preprocessed_input,
                                             initial_states,
                                             go_backwards=self.go_backwards,
                                             mask=mask,
                                             constants=constants,
                                             unroll=self.unroll,
                                             input_length=input_shape[1])
        if self.stateful:
            self.updates = []
            for i in range(len(states)):
                self.updates.append((self.states[i], states[i]))

        if self.return_sequences:
            return outputs
        else:
            return last_output

    def step(self, x, states):
        M_tm1, wr_tm1, ww_tm1 = states[:3]
        # reshape the flattened memory back to (batch, n_slots, m_length)
        M_tm1 = M_tm1.reshape((x.shape[0], self.n_slots, self.m_length))

        # read
        h_tm1 = states[3:]
        k_read, beta_read, g_read, gamma_read, s_read = self._get_controller_output(
            h_tm1[0], self.W_k_read, self.b_k_read, self.W_c_read,
            self.b_c_read, self.W_s_read, self.b_s_read)
        wc_read = self._get_content_w(beta_read, k_read, M_tm1)
        wr_t = self._get_location_w(g_read, s_read, self.C, gamma_read,
                                    wc_read, wr_tm1)
        M_read = self._read(wr_t, M_tm1)

        # update controller
        h_t = _update_controller(self, x, h_tm1, M_read)

        # write
        k_write, beta_write, g_write, gamma_write, s_write = self._get_controller_output(
            h_t[0], self.W_k_write, self.b_k_write, self.W_c_write,
            self.b_c_write, self.W_s_write, self.b_s_write)
        wc_write = self._get_content_w(beta_write, k_write, M_tm1)
        ww_t = self._get_location_w(g_write, s_write, self.C, gamma_write,
                                    wc_write, ww_tm1)
        e = T.nnet.sigmoid(T.dot(h_t[0], self.W_e) + self.b_e)
        a = T.tanh(T.dot(h_t[0], self.W_a) + self.b_a)
        M_t = self._write(ww_t, e, a, M_tm1)
        M_t = M_t.flatten(ndim=2)

        return h_t[0], [M_t, wr_t, ww_t] + h_t
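
# ---------------------------------------------------------------------------
# Reference sketch (NumPy, not used by the layers) of the addressing chain
# that `_get_content_w` / `_get_location_w` implement symbolically above:
# content weighting -> gate interpolation -> circular shift -> sharpening,
# following eqs. 5-9 of Graves et al., "Neural Turing Machines" (2014).
# All toy values are illustrative; the shift convention of `np.roll` here
# is assumed to mirror what `_circulant` builds.
def _demo_ntm_addressing():
    import numpy as np

    n_slots, m_length = 4, 3
    rng = np.random.RandomState(0)
    M = rng.rand(n_slots, m_length)          # memory matrix
    k = rng.rand(m_length)                   # key emitted by the controller
    beta, g, gamma = 5.0, 0.9, 2.0           # focus, interpolation gate, sharpening
    s = np.array([0.1, 0.8, 0.1])            # shift weights for (-1, 0, +1)
    w_tm1 = np.full(n_slots, 1.0 / n_slots)  # previous head position

    # content addressing: softmax over beta-scaled cosine similarity (eqs. 5-6)
    sim = M @ k / (np.linalg.norm(M, axis=1) * np.linalg.norm(k) + 1e-8)
    wc = np.exp(beta * sim)
    wc /= wc.sum()
    # interpolate with the previous weights (eq. 7)
    wg = g * wc + (1 - g) * w_tm1
    # circular convolution with the shift distribution (eq. 8)
    wtilda = sum(s[j] * np.roll(wg, shift) for j, shift in enumerate((-1, 0, 1)))
    # sharpen and renormalise (eq. 9)
    w = wtilda ** gamma
    return w / w.sum()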
class Stack(Recurrent):
    """Stack and queue network.

    output_dim = output dimension
    n_slots = number of memory slots
    m_length = dimension of the memory
    rnn_size = output length of the memory controller
    inner_rnn = "lstm" (only lstm is supported)
    stack = True to create a neural stack, False to create a neural queue

    From "Learning to Transduce with Unbounded Memory"
    http://arxiv.org/pdf/1506.02516.pdf
    """
    def __init__(self, output_dim, n_slots, m_length,
                 inner_rnn='lstm', rnn_size=64, stack=True,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots + 1  # because we start at time 1
        self.m_length = m_length
        self.init = init
        self.inner_init = inner_init
        if inner_rnn != 'lstm':
            raise ValueError('only lstm is supported as inner_rnn')
        self.inner_rnn = inner_rnn
        self.rnn_size = rnn_size
        self.stack = stack

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(Stack, self).__init__(**kwargs)

    def build(self, input_shape):
        self.input_spec = [InputSpec(shape=input_shape)]
        input_leng, input_dim = input_shape[1:]

        if self.inner_rnn == 'gru':
            self.rnn = GRU(
                activation='relu',
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init,
                consume_less='gpu',
                name="{}_inner_rnn".format(self.name))
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.rnn_size, init=self.init,
                forget_bias_init='zero',
                inner_init=self.inner_init,
                consume_less='gpu',
                name="{}_inner_rnn".format(self.name))
        else:
            raise ValueError('this inner_rnn is not implemented yet.')

        # the controller consumes the external input concatenated with the
        # last read vector, hence the widened feature axis
        inner_shape = list(input_shape)
        inner_shape[-1] = input_dim + self.m_length
        self.rnn.build(inner_shape)

        self.init_h = K.zeros((self.rnn_size,), name="{}_init_h".format(self.name))
        self.W_d = self.rnn.init((self.rnn_size, 1), name="{}_W_d".format(self.name))
        self.W_u = self.rnn.init((self.rnn_size, 1), name="{}_W_u".format(self.name))
        self.W_v = self.rnn.init((self.rnn_size, self.m_length), name="{}_W_v".format(self.name))
        self.W_o = self.rnn.init((self.rnn_size, self.output_dim), name="{}_W_o".format(self.name))
        self.b_d = K.zeros((1,), name="{}_b_d".format(self.name))
        self.b_u = K.zeros((1,), name="{}_b_u".format(self.name))
        self.b_v = K.zeros((self.m_length,), name="{}_b_v".format(self.name))
        self.b_o = K.zeros((self.output_dim,), name="{}_b_o".format(self.name))

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_d, self.b_d,
            self.W_v, self.b_v,
            self.W_u, self.b_u,
            self.W_o, self.b_o,
            self.init_h]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.rnn_size,), name="{}_init_c".format(self.name))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            del self.initial_weights

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_r = K.zeros((self.m_length)).dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_V = K.zeros((self.n_slots, self.m_length)).dimshuffle('x', 0, 1).repeat(batch_size, axis=0)
        init_S = K.zeros((self.n_slots)).dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        itime = K.zeros((1,), dtype=np.int32)
        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_r, init_V, init_S, itime, init_h, init_c]

    def get_output_shape_for(self, input_shape):
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def step(self, x, states):
        r_tm1, V_tm1, s_tm1, time = states[:4]
        h_tm1 = states[4:]

        op_t, h_t = _update_controller(self, T.concatenate([x, r_tm1], axis=-1),
                                       h_tm1)

        d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)  # push strength
        u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)  # pop strength
        v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)     # pushed value
        o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)     # output

        time = time + 1
        V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                             u_t[:, 0], v_t, time[0],
                                             stack=self.stack)

        return o_t, [r_t, V_t, s_t, time] + h_t

    def get_config(self):
        config = {'output_dim': self.output_dim,
                  'n_slots': self.n_slots,
                  'm_length': self.m_length,
                  'init': self.init,
                  'inner_init': self.inner_init,
                  'inner_rnn': self.inner_rnn,
                  'rnn_size': self.rnn_size,
                  'stack': self.stack}
        base_config = super(Stack, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
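
# ---------------------------------------------------------------------------
# Reference sketch (NumPy, not the original `_update_neural_stack`, which is
# defined elsewhere in this file) of one differentiable stack update from
# "Learning to Transduce with Unbounded Memory", eqs. 1-3: pop with strength
# u, push value v with strength d, then read the (soft) top of the stack.
# Call with an empty stack as, e.g.:
#     V0, s0 = np.zeros((0, 3)), np.zeros(0)
#     V1, s1, r1 = _demo_neural_stack_step(V0, s0, d=0.9, u=0.0, v=np.ones(3))
def _demo_neural_stack_step(V, s, d, u, v):
    import numpy as np

    V_t = np.vstack([V, v[None, :]])  # append the pushed value (eq. 1)
    t = V_t.shape[0]
    s_t = np.empty(t)
    # popping consumes strength from the top of the stack downwards (eq. 2)
    for i in range(t - 1):
        s_t[i] = max(0.0, s[i] - max(0.0, u - s[i + 1:].sum()))
    s_t[-1] = d
    # read the topmost unit of strength as a convex combination (eq. 3)
    r_t = np.zeros_like(v)
    for i in range(t):
        r_t += min(s_t[i], max(0.0, 1.0 - s_t[i + 1:].sum())) * V_t[i]
    return V_t, s_t, r_t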
class NeuralTuringMachine(Recurrent):
    def __init__(self, output_dim, memory_size, shift_range=3,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = memory_size[1]
        self.m_length = memory_size[0]
        self.shift_range = shift_range
        self.init = init
        self.inner_init = inner_init
        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(NeuralTuringMachine, self).__init__(**kwargs)

    def build(self, input_shape):
        input_leng, input_dim = input_shape[1:]

        self.lstm = LSTM(
            input_dim=input_dim + self.m_length,
            input_length=input_leng,
            output_dim=self.output_dim, init=self.init,
            forget_bias_init='zero',
            inner_init=self.inner_init)
        self.lstm.build(input_shape)

        # initial memory, state, read and write vectors
        self.M = theano.shared((.001 * np.ones((1,)).astype(floatX)))
        self.init_h = backend.zeros((self.output_dim,))
        self.init_wr = self.lstm.init((self.n_slots,))
        self.init_ww = self.lstm.init((self.n_slots,))

        # write
        self.W_e = self.lstm.init((self.output_dim, self.m_length))  # erase
        self.b_e = backend.zeros((self.m_length,))
        self.W_a = self.lstm.init((self.output_dim, self.m_length))  # add
        self.b_a = backend.zeros((self.m_length,))

        # get_w parameters for the reading operation
        self.W_k_read = self.lstm.init((self.output_dim, self.m_length))
        self.b_k_read = self.lstm.init((self.m_length,))
        self.W_c_read = self.lstm.init((self.output_dim, 3))
        self.b_c_read = backend.zeros((3,))
        self.W_s_read = self.lstm.init((self.output_dim, self.shift_range))
        self.b_s_read = backend.zeros((self.shift_range,))

        # get_w parameters for the writing operation
        self.W_k_write = self.lstm.init((self.output_dim, self.m_length))
        self.b_k_write = self.lstm.init((self.m_length,))
        self.W_c_write = self.lstm.init((self.output_dim, 3))  # 3 = beta, g, gamma, see eq. 5, 7, 9
        self.b_c_write = backend.zeros((3,))
        self.W_s_write = self.lstm.init((self.output_dim, self.shift_range))
        self.b_s_write = backend.zeros((self.shift_range,))

        self.C = circulant(self.n_slots, self.shift_range)

        self.trainable_weights = self.lstm.trainable_weights + [
            self.W_e, self.b_e,
            self.W_a, self.b_a,
            self.W_k_read, self.b_k_read,
            self.W_c_read, self.b_c_read,
            self.W_s_read, self.b_s_read,
            self.W_k_write, self.b_k_write,
            self.W_s_write, self.b_s_write,
            self.W_c_write, self.b_c_write,
            self.M, self.init_h, self.init_wr, self.init_ww]

        self.init_c = backend.zeros((self.output_dim,))
        self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def read(self, w, M):
        return (w[:, :, None] * M).sum(axis=1)

    def write(self, w, e, a, M):
        Mtilda = M * (1 - w[:, :, None] * e[:, None, :])
        Mout = Mtilda + w[:, :, None] * a[:, None, :]
        return Mout

    def get_content_w(self, beta, k, M):
        num = beta[:, None] * cosine_similarity(M, k)
        return soft_max(num)

    def get_location_w(self, g, s, C, gamma, wc, w_tm1):
        wg = g[:, None] * wc + (1 - g[:, None]) * w_tm1
        Cs = (C[None, :, :, :] * wg[:, None, None, :]).sum(axis=3)
        wtilda = (Cs * s[:, :, None]).sum(axis=1)
        wout = re_norm(wtilda ** gamma[:, None])
        return wout

    def get_controller_output(self, h, W_k, b_k, W_c, b_c, W_s, b_s):
        k = T.tanh(T.dot(h, W_k) + b_k)  # + 1e-6
        c = T.dot(h, W_c) + b_c
        beta = T.nnet.relu(c[:, 0]) + 1e-4
        g = T.nnet.sigmoid(c[:, 1])
        gamma = T.nnet.relu(c[:, 2]) + 1.0001
        s = T.nnet.softmax(T.dot(h, W_s) + b_s)
        return k, beta, g, gamma, s

    def get_output_shape_for(self, input_shape):
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def step(self, x, states):
        M_tm1, wr_tm1, ww_tm1 = states[:3]
        # reshape the flattened memory back to (batch, n_slots, m_length)
        M_tm1 = M_tm1.reshape((x.shape[0], self.n_slots, self.m_length))

        # read
        h_tm1 = states[3:]
        k_read, beta_read, g_read, gamma_read, s_read = self.get_controller_output(
            h_tm1[0], self.W_k_read, self.b_k_read, self.W_c_read,
            self.b_c_read, self.W_s_read, self.b_s_read)
        wc_read = self.get_content_w(beta_read, k_read, M_tm1)
        wr_t = self.get_location_w(g_read, s_read, self.C, gamma_read,
                                   wc_read, wr_tm1)
        M_read = self.read(wr_t, M_tm1)

        # update controller
        h_t = update_controller(self, x, h_tm1, M_read)

        # write
        k_write, beta_write, g_write, gamma_write, s_write = self.get_controller_output(
            h_t[0], self.W_k_write, self.b_k_write, self.W_c_write,
            self.b_c_write, self.W_s_write, self.b_s_write)
        wc_write = self.get_content_w(beta_write, k_write, M_tm1)
        ww_t = self.get_location_w(g_write, s_write, self.C, gamma_write,
                                   wc_write, ww_tm1)
        e = T.nnet.sigmoid(T.dot(h_t[0], self.W_e) + self.b_e)
        a = T.tanh(T.dot(h_t[0], self.W_a) + self.b_a)
        M_t = self.write(ww_t, e, a, M_tm1)
        M_t = M_t.flatten(ndim=2)

        return h_t[0], [M_t, wr_t, ww_t] + h_t
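
# ---------------------------------------------------------------------------
# Reference sketch (NumPy, illustrative values only) of the erase/add memory
# update that `write` above expresses symbolically (NTM eqs. 3-4): each slot
# is first multiplicatively erased, then the add vector is blended in, both
# weighted by the head's write weights.
def _demo_ntm_write():
    import numpy as np

    batch, n_slots, m_length = 2, 4, 3
    rng = np.random.RandomState(0)
    M = np.ones((batch, n_slots, m_length))          # old memory
    w = rng.dirichlet(np.ones(n_slots), size=batch)  # write weights per sample
    e = np.full((batch, m_length), 0.5)              # erase vector
    a = np.full((batch, m_length), 0.25)             # add vector

    Mtilda = M * (1 - w[:, :, None] * e[:, None, :])  # erase phase (eq. 3)
    return Mtilda + w[:, :, None] * a[:, None, :]     # add phase (eq. 4)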
class Stack(Recurrent):
    """Stack and queue network ("Learning to Transduce with Unbounded
    Memory", http://arxiv.org/pdf/1506.02516.pdf). This variant keeps
    runtime shape printing in `step` for debugging.

    n_slots: number of memory slots
    m_length: dimension of the memory
    """
    def __init__(self, output_dim, n_slots, m_length,
                 inner_rnn='lstm', rnn_size=64, stack=True,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots + 1  # because we start at time 1
        self.m_length = m_length
        self.init = init
        self.inner_init = inner_init
        self.inner_rnn = inner_rnn
        self.rnn_size = rnn_size
        self.stack = stack

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(Stack, self).__init__(**kwargs)

    def build(self):
        input_leng, input_dim = self.input_shape[1:]
        self.input = T.tensor3()

        if self.inner_rnn == 'gru':
            self.rnn = GRU(
                activation='relu',
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.rnn_size, init=self.init,
                forget_bias_init='zero',
                inner_init=self.inner_init)
        else:
            raise ValueError('this inner_rnn is not implemented yet.')
        self.rnn.build()

        self.init_h = K.zeros((self.rnn_size,))
        self.W_d = self.rnn.init((self.rnn_size, 1))
        self.W_u = self.rnn.init((self.rnn_size, 1))
        self.W_v = self.rnn.init((self.rnn_size, self.m_length))
        self.W_o = self.rnn.init((self.rnn_size, self.output_dim))
        self.b_d = K.zeros((1,), name="b_d")
        self.b_u = K.zeros((1,), name="b_u")
        self.b_v = K.zeros((self.m_length,))
        self.b_o = K.zeros((self.output_dim,))

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_d, self.b_d,
            self.W_v, self.b_v,
            self.W_u, self.b_u,
            self.W_o, self.b_o,
            self.init_h]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.rnn_size,))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_r = K.zeros((self.m_length)).dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_V = K.zeros((self.n_slots, self.m_length)).dimshuffle('x', 0, 1).repeat(batch_size, axis=0)
        init_S = K.zeros((self.n_slots)).dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        itime = K.zeros((1,), dtype=np.int32)
        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_r, init_V, init_S, itime, init_h, init_c]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def get_full_output(self, train=False):
        """This method is for research and visualization purposes.

        Use it as:
            X = model.get_input()  # full model
            Y = ntm.get_output()   # this layer
            F = theano.function([X], Y, allow_input_downcast=True)
            [memory, read_address, write_address, rnn_state] = F(x)

        If inner_rnn == "lstm", use it as:
            [memory, read_address, write_address, rnn_cell, rnn_state] = F(x)
        """
        # input shape: (nb_samples, time (padded with zeros), input_dim)
        X = self.get_input(train)
        assert K.ndim(X) == 3
        if K._BACKEND == 'tensorflow':
            if not self.input_shape[1]:
                raise Exception('When using TensorFlow, you should define '
                                'explicitly the number of timesteps of '
                                'your sequences. Make sure the first layer '
                                'has a "batch_input_shape" argument '
                                'including the samples axis.')

        mask = self.get_output_mask(train)
        if mask is not None:  # apply mask
            X *= K.cast(K.expand_dims(mask), X.dtype)
            masking = True
        else:
            masking = False

        if self.stateful:
            initial_states = self.states
        else:
            initial_states = self.get_initial_states(X)

        states = rnn_states(self.step, X, initial_states,
                            go_backwards=self.go_backwards,
                            masking=masking)
        return states

    def step(self, x, states):
        r_tm1, V_tm1, s_tm1, time = states[:4]
        h_tm1 = states[4:]

        def print_name_shape(name, x):
            # debug helper: prints the shape of `x` at runtime and returns a
            # zero scalar, so adding it to a tensor leaves the value unchanged
            return T.cast(K.sum(theano.printing.Print(name)(x.shape)) * 0,
                          "float32")

        r_tm1 = r_tm1 + print_name_shape("out\nr_tm1", r_tm1) + \
            print_name_shape("V_tm1", V_tm1) + \
            print_name_shape("s_tm1", s_tm1) + \
            print_name_shape("x", x) + \
            print_name_shape("h_tm1_0", h_tm1[0]) + \
            print_name_shape("h_tm1_1", h_tm1[1])

        op_t, h_t = self._update_controller(
            T.concatenate([x, r_tm1], axis=-1), h_tm1)
        op_t = op_t + print_name_shape("afterop_t", op_t)

        ao = K.dot(op_t, self.W_d)
        ao = ao + print_name_shape("ao", ao)
        d_t = K.sigmoid(ao + self.b_d) + print_name_shape("afterop2_t", op_t)
        u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u) + print_name_shape("d_t", op_t)
        v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v) + print_name_shape("u_t", u_t)
        o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o) + print_name_shape("v_t", v_t)
        o_t = o_t + print_name_shape("afterbulk_t", o_t)

        time = time + 1
        V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                             u_t[:, 0], v_t, time[0],
                                             stack=self.stack)
        V_t = V_t + print_name_shape("o_t", o_t) + \
            print_name_shape("r_t", r_t) + \
            print_name_shape("V_t", V_t) + \
            print_name_shape("s_t", s_t)

        return o_t, [r_t, V_t, s_t, time] + h_t

    def _update_controller(self, inp, h_tm1):
        """Update the inner RNN inside the NTM; largely copied from Keras."""
        def print_name_shape(name, x, shape=True):
            if shape:
                return T.cast(K.sum(theano.printing.Print(name)(x.shape)) * 0,
                              "float32")
            else:
                return theano.printing.Print(name)(x)

        # 1 state is a gru, 2 states is an lstm
        if len(h_tm1) in [1, 2]:
            if hasattr(self.rnn, "get_constants"):
                BW, BU = self.rnn.get_constants(inp)
                h_tm1 += (BW, BU)

        # update state
        op_t, h = self.rnn.step(inp + print_name_shape("inp", inp), h_tm1)
        return (op_t + print_name_shape("opt", op_t) +
                print_name_shape("h", h[0]) + print_name_shape("h", h[1])), h
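
# ---------------------------------------------------------------------------
# Usage sketch for the Stack layer above; illustrative only, not part of the
# original source. It assumes the legacy Keras Sequential interface this
# file targets, and all sizes are arbitrary toy values. Passing stack=False
# would build the queue variant instead.
def _demo_stack_model():
    from keras.models import Sequential
    from keras.layers.core import TimeDistributedDense

    model = Sequential()
    model.add(Stack(32, n_slots=48, m_length=16, rnn_size=64, stack=True,
                    input_dim=8, input_length=24, return_sequences=True))
    model.add(TimeDistributedDense(8, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    return model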
class DRAW(Recurrent):
    '''DRAW

    Parameters:
    ===========
    output_dim : encoder/decoder dimension
    code_dim : random sample dimension (reparametrization trick output)
    input_shape : (n_channels, rows, cols)
    N_enc : size of the encoder's filter bank (MNIST default: 2)
    N_dec : size of the decoder's filter bank (MNIST default: 5)
    n_steps : number of sampling steps (or how long it takes to draw,
        default 64)
    inner_rnn : str with rnn type ('gru' default)
    truncate_gradient : int (default -1)
    return_sequences : bool (default False)
    '''
    theano_rng = theano_rng()

    def __init__(self, output_dim, code_dim, N_enc=2, N_dec=5, n_steps=64,
                 inner_rnn='gru', truncate_gradient=-1, return_sequences=False,
                 canvas_activation=T.nnet.sigmoid, init='glorot_uniform',
                 inner_init='orthogonal', input_shape=None, **kwargs):
        self.output_dim = output_dim  # this is 256 for MNIST
        self.code_dim = code_dim      # this is 100 for MNIST
        self.N_enc = N_enc
        self.N_dec = N_dec
        self.truncate_gradient = truncate_gradient
        self.return_sequences = return_sequences
        self.n_steps = n_steps
        self.canvas_activation = canvas_activation
        self.init = init
        self.inner_init = inner_init
        self.inner_rnn = inner_rnn
        self.height = input_shape[1]
        self.width = input_shape[2]
        self._input_shape = input_shape
        super(DRAW, self).__init__(**kwargs)

    def build(self):
        self.input = T.tensor4()
        if self.inner_rnn == 'gru':
            self.enc = GRU(
                input_length=self.n_steps,
                input_dim=self._input_shape[0] * 2 * self.N_enc ** 2 + self.output_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
            self.dec = GRU(
                input_length=self.n_steps,
                input_dim=self.code_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.enc = LSTM(
                input_length=self.n_steps,
                input_dim=self._input_shape[0] * 2 * self.N_enc ** 2 + self.output_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
            self.dec = LSTM(
                input_length=self.n_steps,
                input_dim=self.code_dim,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        else:
            raise ValueError('This type of inner_rnn is not supported')
        self.enc.build()
        self.dec.build()

        self.init_canvas = shared_zeros(self._input_shape)  # canvas and hidden state
        self.init_h_enc = shared_zeros((self.output_dim))   # initial values
        self.init_h_dec = shared_zeros((self.output_dim))   # should be trained
        self.L_enc = self.enc.init((self.output_dim, 5))  # "read" attention parameters (eq. 21)
        self.L_dec = self.enc.init((self.output_dim, 5))  # "write" attention parameters (eq. 28)
        self.b_enc = shared_zeros((5))  # "read" attention parameters (eq. 21)
        self.b_dec = shared_zeros((5))  # "write" attention parameters (eq. 28)
        self.W_patch = self.enc.init((self.output_dim,
                                      self.N_dec ** 2 * self._input_shape[0]))
        self.b_patch = shared_zeros((self.N_dec ** 2 * self._input_shape[0]))
        self.W_mean = self.enc.init((self.output_dim, self.code_dim))
        self.W_sigma = self.enc.init((self.output_dim, self.code_dim))
        self.b_mean = shared_zeros((self.code_dim))
        self.b_sigma = shared_zeros((self.code_dim))

        self.trainable_weights = self.enc.trainable_weights + self.dec.trainable_weights + [
            self.L_enc, self.L_dec, self.b_enc, self.b_dec,
            self.W_patch, self.b_patch,
            self.W_mean, self.W_sigma, self.b_mean, self.b_sigma,
            self.init_canvas, self.init_h_enc, self.init_h_dec]

        if self.inner_rnn == 'lstm':
            self.init_cell_enc = shared_zeros((self.output_dim))  # initial values
            self.init_cell_dec = shared_zeros((self.output_dim))  # should be trained
            self.trainable_weights = self.trainable_weights + [
                self.init_cell_dec, self.init_cell_enc]

    def set_previous(self, layer, connection_map={}):
        self.previous = layer
        self.build()
        self.init_updates()

    def init_updates(self):
        self.get_output(train=True)  # populate regularizers list

    def _get_attention_trainable_weights(self, h, L, b, N):
        p = T.dot(h, L) + b
        gx = self.width * (p[:, 0] + 1) / 2.
        gy = self.height * (p[:, 1] + 1) / 2.
        sigma2 = T.exp(p[:, 2])
        delta = T.exp(p[:, 3]) * (max(self.width, self.height) - 1) / (N - 1.)
        gamma = T.exp(p[:, 4])
        return gx, gy, sigma2, delta, gamma

    def _get_filterbank(self, gx, gy, sigma2, delta, N):
        small = 1e-4
        i = T.arange(N)
        a = T.arange(self.width)
        b = T.arange(self.height)

        mx = gx[:, None] + delta[:, None] * (i - N / 2. - .5)
        my = gy[:, None] + delta[:, None] * (i - N / 2. - .5)

        Fx = T.exp(-(a - mx[:, :, None]) ** 2 / 2. / sigma2[:, None, None])
        Fx /= (Fx.sum(axis=-1)[:, :, None] + small)
        Fy = T.exp(-(b - my[:, :, None]) ** 2 / 2. / sigma2[:, None, None])
        Fy /= (Fy.sum(axis=-1)[:, :, None] + small)
        return Fx, Fy

    def _read(self, x, gamma, Fx, Fy):
        Fyx = (Fy[:, None, :, :, None] * x[:, :, None, :, :]).sum(axis=3)
        FxT = Fx.dimshuffle(0, 2, 1)
        FyxFx = (Fyx[:, :, :, :, None] * FxT[:, None, None, :, :]).sum(axis=3)
        return gamma[:, None, None, None] * FyxFx

    def _get_patch(self, h):
        write_patch = T.dot(h, self.W_patch) + self.b_patch
        write_patch = write_patch.reshape((h.shape[0], self._input_shape[0],
                                           self.N_dec, self.N_dec))
        return write_patch

    def _write(self, write_patch, gamma, Fx, Fy):
        Fyx = (Fy[:, None, :, :, None] * write_patch[:, :, :, None, :]).sum(axis=2)
        FyxFx = (Fyx[:, :, :, :, None] * Fx[:, None, None, :, :]).sum(axis=3)
        return FyxFx / gamma[:, None, None, None]

    def _get_sample(self, h, eps):
        mean = T.dot(h, self.W_mean) + self.b_mean
        # eps = self.theano_rng.normal(avg=0., std=1., size=mean.shape)
        logsigma = T.dot(h, self.W_sigma) + self.b_sigma
        sigma = T.exp(logsigma)
        if self._train_state:
            sample = mean + eps * sigma
        else:
            sample = mean + 0 * eps * sigma
        kl = -.5 - logsigma + .5 * (mean ** 2 + sigma ** 2)
        # kl = .5 * (mean**2 + sigma**2 - logsigma - 1)
        return sample, kl.sum(axis=-1)

    def _get_rnn_input(self, x, rnn):
        if self.inner_rnn == 'gru':
            x_z = T.dot(x, rnn.W_z) + rnn.b_z
            x_r = T.dot(x, rnn.W_r) + rnn.b_r
            x_h = T.dot(x, rnn.W_h) + rnn.b_h
            return x_z, x_r, x_h
        elif self.inner_rnn == 'lstm':
            xi = T.dot(x, rnn.W_i) + rnn.b_i
            xf = T.dot(x, rnn.W_f) + rnn.b_f
            xc = T.dot(x, rnn.W_c) + rnn.b_c
            xo = T.dot(x, rnn.W_o) + rnn.b_o
            return xi, xf, xc, xo

    def _get_rnn_state(self, rnn, *args):
        mask = 1.  # no masking
        if self.inner_rnn == 'gru':
            x_z, x_r, x_h, h_tm1 = args
            h = rnn._step(x_z, x_r, x_h, mask, h_tm1,
                          rnn.U_z, rnn.U_r, rnn.U_h)
            return h
        elif self.inner_rnn == 'lstm':
            xi, xf, xc, xo, h_tm1, cell_tm1 = args
            h, cell = rnn._step(xi, xf, xo, xc, mask, h_tm1, cell_tm1,
                                rnn.U_i, rnn.U_f, rnn.U_o, rnn.U_c)
            return h, cell

    def _get_initial_states(self, X):
        batch_size = X.shape[0]
        canvas = self.init_canvas.dimshuffle('x', 0, 1, 2).repeat(batch_size, axis=0)
        init_enc = self.init_h_enc.dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_dec = self.init_h_dec.dimshuffle('x', 0).repeat(batch_size, axis=0)
        if self.inner_rnn == 'lstm':
            init_cell_enc = self.init_cell_enc.dimshuffle('x', 0).repeat(batch_size, axis=0)
            init_cell_dec = self.init_cell_dec.dimshuffle('x', 0).repeat(batch_size, axis=0)
            # order must match _step_lstm: canvas, h_enc, cell_enc, h_dec, cell_dec
            return canvas, init_enc, init_cell_enc, init_dec, init_cell_dec
        else:
            return canvas, init_enc, init_dec

    def _step(self, eps, canvas, h_enc, h_dec, x, *args):
        x_hat = x - self.canvas_activation(canvas)
        gx, gy, sigma2, delta, gamma = self._get_attention_trainable_weights(
            h_dec, self.L_enc, self.b_enc, self.N_enc)
        Fx, Fy = self._get_filterbank(gx, gy, sigma2, delta, self.N_enc)
        read_x = self._read(x, gamma, Fx, Fy).flatten(ndim=2)
        read_x_hat = self._read(x_hat, gamma, Fx, Fy).flatten(ndim=2)
        enc_input = T.concatenate([read_x, read_x_hat, h_dec], axis=-1)
        x_enc_z, x_enc_r, x_enc_h = self._get_rnn_input(enc_input, self.enc)
        new_h_enc = self._get_rnn_state(self.enc, x_enc_z, x_enc_r, x_enc_h,
                                        h_enc)
        sample, kl = self._get_sample(new_h_enc, eps)

        x_dec_z, x_dec_r, x_dec_h = self._get_rnn_input(sample, self.dec)
        new_h_dec = self._get_rnn_state(self.dec, x_dec_z, x_dec_r, x_dec_h,
                                        h_dec)
        gx_w, gy_w, sigma2_w, delta_w, gamma_w = self._get_attention_trainable_weights(
            new_h_dec, self.L_dec, self.b_dec, self.N_dec)
        Fx_w, Fy_w = self._get_filterbank(gx_w, gy_w, sigma2_w, delta_w,
                                          self.N_dec)
        write_patch = self._get_patch(new_h_dec)
        new_canvas = canvas + self._write(write_patch, gamma_w, Fx_w, Fy_w)
        return new_canvas, new_h_enc, new_h_dec, kl

    def _step_lstm(self, eps, canvas, h_enc, cell_enc, h_dec, cell_dec, x,
                   *args):
        x_hat = x - self.canvas_activation(canvas)
        gx, gy, sigma2, delta, gamma = self._get_attention_trainable_weights(
            h_dec, self.L_enc, self.b_enc, self.N_enc)
        Fx, Fy = self._get_filterbank(gx, gy, sigma2, delta, self.N_enc)
        read_x = self._read(x, gamma, Fx, Fy).flatten(ndim=2)
        read_x_hat = self._read(x_hat, gamma, Fx, Fy).flatten(ndim=2)
        enc_input = T.concatenate([read_x, read_x_hat, h_dec.flatten(ndim=2)],
                                  axis=1)
        x_enc_i, x_enc_f, x_enc_c, x_enc_o = self._get_rnn_input(enc_input,
                                                                 self.enc)
        new_h_enc, new_cell_enc = self._get_rnn_state(
            self.enc, x_enc_i, x_enc_f, x_enc_c, x_enc_o, h_enc, cell_enc)
        sample, kl = self._get_sample(new_h_enc, eps)

        x_dec_i, x_dec_f, x_dec_c, x_dec_o = self._get_rnn_input(sample,
                                                                 self.dec)
        new_h_dec, new_cell_dec = self._get_rnn_state(
            self.dec, x_dec_i, x_dec_f, x_dec_c, x_dec_o, h_dec, cell_dec)
        gx_w, gy_w, sigma2_w, delta_w, gamma_w = self._get_attention_trainable_weights(
            new_h_dec, self.L_dec, self.b_dec, self.N_dec)
        Fx_w, Fy_w = self._get_filterbank(gx_w, gy_w, sigma2_w, delta_w,
                                          self.N_dec)
        write_patch = self._get_patch(new_h_dec)
        new_canvas = canvas + self._write(write_patch, gamma_w, Fx_w, Fy_w)
        return new_canvas, new_h_enc, new_cell_enc, new_h_dec, new_cell_dec, kl

    def get_output(self, train=False):
        self._train_state = train
        X, eps = self.get_input(train).values()
        eps = eps.dimshuffle(1, 0, 2)

        if self.inner_rnn == 'gru':
            outputs, updates = scan(self._step,
                                    sequences=eps,
                                    outputs_info=self._get_initial_states(X) + (None, ),
                                    non_sequences=[X, ] + self.trainable_weights,
                                    # n_steps=self.n_steps,
                                    truncate_gradient=self.truncate_gradient)
        elif self.inner_rnn == 'lstm':
            outputs, updates = scan(self._step_lstm,
                                    sequences=eps,
                                    outputs_info=self._get_initial_states(X) + (None, ),
                                    non_sequences=[X, ] + self.trainable_weights,
                                    truncate_gradient=self.truncate_gradient)

        kl = outputs[-1].sum(axis=0).mean()
        if train:
            # self.updates = updates
            self.regularizers = [SimpleCost(kl), ]
        if self.return_sequences:
            return [outputs[0].dimshuffle(1, 0, 2, 3, 4), kl]
        else:
            return [outputs[0][-1], kl]
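
# ---------------------------------------------------------------------------
# Reference sketch (NumPy, single sample, illustrative numbers) of the
# Gaussian attention filterbank that `_get_filterbank` builds above,
# following eqs. 25-26 of Gregor et al., "DRAW" (2015): N filter centres are
# spread around gx with stride delta, and each row is a normalised Gaussian
# over the image axis.
def _demo_draw_filterbank():
    import numpy as np

    width, N = 28, 2
    gx, sigma2, delta = 14.0, 1.0, 3.0   # centre, variance, stride
    i = np.arange(N)
    a = np.arange(width)
    mu = gx + delta * (i - N / 2. - .5)  # filter centres
    Fx = np.exp(-(a - mu[:, None]) ** 2 / 2. / sigma2)
    Fx /= Fx.sum(axis=-1)[:, None] + 1e-4  # row-normalise
    return Fx                               # shape (N, width)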
class Stack(Recurrent):
    """Stack and queue network.

    output_dim = output dimension
    n_slots = number of memory slots
    m_length = dimension of the memory
    rnn_size = output length of the memory controller
    inner_rnn = "lstm" (only lstm is supported)
    stack = True to create a neural stack, False to create a neural queue

    From "Learning to Transduce with Unbounded Memory"
    http://arxiv.org/pdf/1506.02516.pdf
    """
    def __init__(self, output_dim, n_slots, m_length,
                 inner_rnn='lstm', rnn_size=64, stack=True,
                 init='glorot_uniform', inner_init='orthogonal',
                 input_dim=None, input_length=None, **kwargs):
        self.output_dim = output_dim
        self.n_slots = n_slots + 1  # because we start at time 1
        self.m_length = m_length
        self.init = init
        self.inner_init = inner_init
        if inner_rnn != 'lstm':
            raise ValueError('only lstm is supported as inner_rnn')
        self.inner_rnn = inner_rnn
        self.rnn_size = rnn_size
        self.stack = stack

        self.input_dim = input_dim
        self.input_length = input_length
        if self.input_dim:
            kwargs['input_shape'] = (self.input_length, self.input_dim)
        super(Stack, self).__init__(**kwargs)

    def build(self):
        input_leng, input_dim = self.input_shape[1:]
        self.input = T.tensor3()

        if self.inner_rnn == 'gru':
            self.rnn = GRU(
                activation='relu',
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.output_dim, init=self.init,
                inner_init=self.inner_init)
        elif self.inner_rnn == 'lstm':
            self.rnn = LSTM(
                input_dim=input_dim + self.m_length,
                input_length=input_leng,
                output_dim=self.rnn_size, init=self.init,
                forget_bias_init='zero',
                inner_init=self.inner_init)
        else:
            raise ValueError('this inner_rnn is not implemented yet.')
        self.rnn.build()

        self.init_h = K.zeros((self.rnn_size,))
        self.W_d = self.rnn.init((self.rnn_size, 1))
        self.W_u = self.rnn.init((self.rnn_size, 1))
        self.W_v = self.rnn.init((self.rnn_size, self.m_length))
        self.W_o = self.rnn.init((self.rnn_size, self.output_dim))
        self.b_d = K.zeros((1,), name="b_d")
        self.b_u = K.zeros((1,), name="b_u")
        self.b_v = K.zeros((self.m_length,))
        self.b_o = K.zeros((self.output_dim,))

        self.trainable_weights = self.rnn.trainable_weights + [
            self.W_d, self.b_d,
            self.W_v, self.b_v,
            self.W_u, self.b_u,
            self.W_o, self.b_o,
            self.init_h]

        if self.inner_rnn == 'lstm':
            self.init_c = K.zeros((self.rnn_size,))
            self.trainable_weights = self.trainable_weights + [self.init_c, ]

    def get_initial_states(self, X):
        batch_size = X.shape[0]
        init_r = K.zeros((self.m_length)).dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_V = K.zeros((self.n_slots, self.m_length)).dimshuffle('x', 0, 1).repeat(batch_size, axis=0)
        init_S = K.zeros((self.n_slots)).dimshuffle('x', 0).repeat(batch_size, axis=0)
        init_h = self.init_h.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
        itime = K.zeros((1,), dtype=np.int32)
        if self.inner_rnn == 'lstm':
            init_c = self.init_c.dimshuffle(('x', 0)).repeat(batch_size, axis=0)
            return [init_r, init_V, init_S, itime, init_h, init_c]

    @property
    def output_shape(self):
        input_shape = self.input_shape
        if self.return_sequences:
            return input_shape[0], input_shape[1], self.output_dim
        else:
            return input_shape[0], self.output_dim

    def step(self, x, states):
        r_tm1, V_tm1, s_tm1, time = states[:4]
        h_tm1 = states[4:]

        op_t, h_t = _update_controller(self, T.concatenate([x, r_tm1], axis=-1),
                                       h_tm1)

        d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)  # push strength
        u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)  # pop strength
        v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)     # pushed value
        o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)     # output

        time = time + 1
        V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                             u_t[:, 0], v_t, time[0],
                                             stack=self.stack)

        return o_t, [r_t, V_t, s_t, time] + h_t
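
# ---------------------------------------------------------------------------
# Reference sketch (NumPy, random stand-in weights) of what
# `_update_controller` does throughout this file: the inner controller is a
# plain LSTM step whose input is the external input concatenated with the
# previous memory read. The stacked i/f/g/o gate layout below is an
# assumption for illustration, not the exact layout of the Keras LSTM.
def _demo_controller_update():
    import numpy as np

    def sigmoid(z):
        return 1. / (1. + np.exp(-z))

    input_dim, m_length, rnn_size = 8, 16, 32
    rng = np.random.RandomState(0)
    x = rng.rand(input_dim)     # external input at time t
    r_tm1 = rng.rand(m_length)  # previous memory read
    h_tm1 = np.zeros(rnn_size)  # previous hidden state
    c_tm1 = np.zeros(rnn_size)  # previous cell state
    W = rng.randn(input_dim + m_length, 4 * rnn_size) * 0.1  # input weights
    U = rng.randn(rnn_size, 4 * rnn_size) * 0.1              # recurrent weights

    # the key point: the controller sees [x, r_tm1], not x alone
    z = np.concatenate([x, r_tm1]) @ W + h_tm1 @ U
    i, f, g, o = np.split(z, 4)
    c_t = sigmoid(f) * c_tm1 + sigmoid(i) * np.tanh(g)
    h_t = sigmoid(o) * np.tanh(c_t)
    return h_t, c_t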