def model(x, embedding_size, n_hidden):
    # hidden and input weights
    U = shared_glorot_uniform((embedding_size, n_hidden), name="U")
    W = shared_glorot_uniform((n_hidden, n_hidden), name="W")
    bh = shared_zeros((n_hidden,), name="bh")

    # output weights
    V = shared_glorot_uniform((n_hidden, embedding_size), name="V")
    by = shared_zeros((embedding_size,), name="by")

    params = [U, V, W, by, bh]

    def step(x_t, h_tm1):
        h_t = T.tanh(U[x_t] + T.dot(h_tm1, W) + bh)
        y_t = T.dot(h_t, V) + by
        return h_t, y_t

    h0 = shared_zeros((n_hidden,), name='h0')
    [h, y_pred], _ = theano.scan(step, sequences=x, outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
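A minimal usage sketch (not part of the original snippet) showing how this RNN builder might be compiled and queried; it assumes the shared_glorot_uniform/shared_zeros helpers used above are in scope, that x is a vector of token indices, and that the hyperparameter values are illustrative.

import theano
import theano.tensor as T

x = T.ivector('x')                      # sequence of token indices
y_out, params = model(x, embedding_size=80, n_hidden=128)
predict = theano.function([x], y_out)   # returns (seq_len, embedding_size) softmax rows
# probs = predict([3, 14, 15])          # hypothetical call on a 3-token sequence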
def test_gen_cloning_with_shape_change(self):
    data = floatX(np.random.uniform(size=(1000, 10)))
    minibatches = DataSampler(data, batchsize=50)
    gen = generator(minibatches)
    gen_r = tt_rng().normal(size=gen.shape).T
    X = gen.dot(gen_r)
    res, _ = theano.scan(lambda x: x.sum(), X, n_steps=X.shape[0])
    assert res.eval().shape == (50,)
    shared = theano.shared(data)
    res2 = theano.clone(res, {gen: shared**2})
    assert res2.eval().shape == (1000,)
def apply(self, f):
    # f: kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
    X = self.approx.histogram
    t = self.approx.normalizing_constant
    dlogpdx = theano.scan(
        fn=lambda zg: theano.grad(self.logp_norm(zg), zg),
        sequences=[X]
    )[0]  # bottleneck
    Kxy, dxkxy = f(X)
    # scaling factor
    # not needed for Kxy as we already scaled dlogpdx
    dxkxy /= t
    n = X.shape[0].astype('float32') / t
    svgd_grad = (tt.dot(Kxy, dlogpdx) + dxkxy) / n
    return -1 * svgd_grad  # gradient
def model(x, embedding_size, n_hidden):
    # Update gate weights
    W_xz = shared_glorot_uniform((embedding_size, n_hidden))
    W_hz = shared_glorot_uniform((n_hidden, n_hidden))
    b_z = shared_zeros((n_hidden,))

    # Reset gate weights
    W_xr = shared_glorot_uniform((embedding_size, n_hidden))
    W_hr = shared_glorot_uniform((n_hidden, n_hidden))
    b_r = shared_zeros((n_hidden,))

    # Hidden layer
    W_xh = shared_glorot_uniform((embedding_size, n_hidden))
    W_hh = shared_glorot_uniform((n_hidden, n_hidden))
    b_h = shared_zeros((n_hidden,))

    # Output weights
    W_y = shared_glorot_uniform((n_hidden, embedding_size), name="V")
    b_y = shared_zeros((embedding_size,), name="by")

    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_y, b_y]

    def step(x_t, h_tm1):
        z_t = T.nnet.sigmoid(W_xz[x_t] + T.dot(W_hz, h_tm1) + b_z)
        r_t = T.nnet.sigmoid(W_xr[x_t] + T.dot(W_hr, h_tm1) + b_r)
        can_h_t = T.tanh(W_xh[x_t] + r_t * T.dot(W_hh, h_tm1) + b_h)
        h_t = (1 - z_t) * h_tm1 + z_t * can_h_t
        y_t = T.dot(h_t, W_y) + b_y
        return h_t, y_t

    h0 = shared_zeros((n_hidden,), name='h0')
    [h, y_pred], _ = theano.scan(step, sequences=x, outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
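A hedged sketch (not from the source) of how the softmax output and parameter list returned by this GRU builder could be wired into a cross-entropy loss with plain SGD updates; y_true, the learning rate, and the hyperparameter values are illustrative assumptions.

import theano
import theano.tensor as T

x = T.ivector('x')            # input token indices
y_true = T.ivector('y_true')  # target indices (e.g. x shifted by one step)
probs, params = model(x, embedding_size=80, n_hidden=128)

# negative log-likelihood of the target token at each time step
loss = -T.mean(T.log(probs)[T.arange(y_true.shape[0]), y_true])
grads = T.grad(loss, params)
updates = [(p, p - 0.01 * g) for p, g in zip(params, grads)]  # plain SGD, lr=0.01
train_step = theano.function([x, y_true], loss, updates=updates)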
def dlogp(self):
    return theano.scan(
        fn=lambda zg: theano.grad(self.approx.logp_norm(zg), zg),
        sequences=[self.input_matrix]
    )[0]
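The same row-wise-gradient pattern in isolation (a toy sketch, not library code): scanning theano.grad over the rows of a matrix yields one gradient per row, with a made-up log-density standing in for logp_norm.

import theano
import theano.tensor as T

X = T.matrix('X')                        # each row is one particle / sample
logp = lambda z: -0.5 * T.sum(z ** 2)    # toy log-density (standard normal, up to a constant)
dlogpdx, _ = theano.scan(fn=lambda zg: theano.grad(logp(zg), zg), sequences=[X])
f = theano.function([X], dlogpdx)
# e.g. applied to the 3x3 identity, f returns minus the identity,
# since the gradient of each row z is -z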
import theano
import theano.tensor as T
from theano import function, pp

# computing gradients
x = T.dscalar('x')
y = x ** 2
gy = T.grad(y, x)
print(pp(gy))
f = function([x], gy)
print(f(4))

x = T.matrix('x')
s = T.sum(1 / (1 + T.exp(-x)))
gs = T.grad(s, x)
dlogistic = function([x], gs)
print(dlogistic([[0, 1], [-1, -2]]))

# computing the Jacobian
x = T.dvector('x')
y = x ** 2
J, updates = theano.scan(lambda i, y, x: T.grad(y[i], x),
                         sequences=T.arange(y.shape[0]),
                         non_sequences=[y, x])
f = function([x], J, updates=updates)
print(f([4, 4]))
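Building on the Jacobian example above, the Hessian of a scalar cost can be computed the same way by scanning T.grad over the entries of the gradient; this block is an illustrative addition following the standard Theano scan pattern, not part of the original snippet.

import theano
import theano.tensor as T
from theano import function

x = T.dvector('x')
y = x ** 2
cost = y.sum()
gy = T.grad(cost, x)
H, updates = theano.scan(lambda i, gy, x: T.grad(gy[i], x),
                         sequences=T.arange(gy.shape[0]),
                         non_sequences=[gy, x])
f = function([x], H, updates=updates)
print(f([4, 4]))   # expected: [[2. 0.] [0. 2.]]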
def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable.

    Parameters
    ----------
    inputs : list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable.  When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where
        `inputs[1]` is the `mask`.  The `mask` should be supplied as a
        Theano variable denoting whether each time step in each sequence
        in the batch is part of the sequence or not.  `mask` should be a
        matrix of shape ``(n_batch, n_time_steps)`` where
        ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
        ``mask[i, j] = 0`` when ``j > (length of sequence i)``.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable.
    """
    # Retrieve the layer input
    input = inputs[0]
    encoder_output = inputs[1]

    # Treat all dimensions after the second as flattened feature dimensions
    if input.ndim > 3:
        input = T.flatten(input, 3)
        encoder_output = T.flatten(encoder_output, 3)

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, 2)
    encoder_output = encoder_output.dimshuffle(1, 0, 2)
    seq_len, num_batch, _ = input.shape

    # Stack input weight matrices into a (num_inputs, 3*num_units)
    # matrix, which speeds up computation
    W_in_stacked = T.concatenate(
        [self.W_in_to_resetgate, self.W_in_to_updategate,
         self.W_in_to_hidden_update], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = T.concatenate(
        [self.W_hid_to_resetgate, self.W_hid_to_updategate,
         self.W_hid_to_hidden_update], axis=1)

    # Stack gate biases into a (3*num_units) vector
    b_stacked = T.concatenate(
        [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)

    if self.precompute_input:
        # precompute_input inputs*W. W_in is (n_features, 3*num_units).
        # input is then (n_batch, n_time_steps, 3*num_units).
        input = T.dot(input, W_in_stacked) + b_stacked

    # At each call to scan, input_n will be (n_batch, 3*num_units).
    # We define a slicing function that extracts the input to each GRU gate
    def slice_w(x, n):
        return x[:, n * self.num_units:(n + 1) * self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, hid_previous, encoder_output, W_hid_stacked,
             W_in_stacked, b_stacked, W_att_enc, W_att_dec, W_att_out,
             W_out, b_out):
        # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
        hid_input = T.dot(hid_previous, W_hid_stacked)

        if self.grad_clipping is not False:
            input_n = theano.gradient.grad_clip(
                input_n, -self.grad_clipping, self.grad_clipping)
            hid_input = theano.gradient.grad_clip(
                hid_input, -self.grad_clipping, self.grad_clipping)

        if not self.precompute_input:
            # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
            input_n = T.dot(input_n, W_in_stacked) + b_stacked

        # Reset and update gates
        resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
        updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
        resetgate = self.nonlinearity_resetgate(resetgate)
        updategate = self.nonlinearity_updategate(updategate)

        # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
        hidden_update_in = slice_w(input_n, 2)
        hidden_update_hid = slice_w(hid_input, 2)
        hidden_update = hidden_update_in + resetgate * hidden_update_hid
        if self.grad_clipping is not False:
            hidden_update = theano.gradient.grad_clip(
                hidden_update, -self.grad_clipping, self.grad_clipping)
        hidden_update = self.nonlinearity_hid(hidden_update)

        # Compute (1 - u_t)h_{t - 1} + u_t c_t
        hid = (1 - updategate) * hid_previous + updategate * hidden_update

        # Add the attention
        hid += self.attention(encoder_output, hid_previous, W_att_enc,
                              W_att_dec, W_att_out)

        # Compute the probas
        probs = T.nnet.softmax(T.dot(hid, W_out) + b_out)
        return [hid, probs]

    sequences = [input]
    step_fun = step

    if isinstance(self.hid_init, T.TensorVariable):
        hid_init = self.hid_init
    else:
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init)

    # The hidden-to-hidden weight matrix is always used in step
    non_seqs = [encoder_output, W_hid_stacked]
    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights and biases to the step function
    if not self.precompute_input:
        non_seqs += [W_in_stacked, b_stacked, self.W_att_enc,
                     self.W_att_dec, self.W_att_out, self.W_out, self.b_out]
    # theano.scan only allows for positional arguments, so when
    # self.precompute_input is True, we need to supply fake placeholder
    # arguments for the input weights and biases.
    else:
        non_seqs += [(), (), self.W_att_enc, self.W_att_dec,
                     self.W_att_out, self.W_out, self.b_out]

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        out, _ = unroll_scan(fn=step_fun,
                             sequences=sequences,
                             outputs_info=[hid_init],
                             go_backwards=self.backwards,
                             non_sequences=non_seqs,
                             n_steps=input_shape[1])
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        out, _ = theano.scan(fn=step_fun,
                             sequences=sequences,
                             go_backwards=self.backwards,
                             outputs_info=[hid_init, None],
                             non_sequences=non_seqs,
                             truncate_gradient=self.gradient_steps,
                             strict=True)

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    # hid_out = hid_out[0].dimshuffle(1, 0, 2)
    s_out = out[1]

    # # if scan is backward reverse the output
    # if self.backwards:
    #     out = out[:, ::-1, :]

    return s_out
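The docstring above describes a mask input, but this body never consumes it. Below is a small self-contained sketch (not from the source; all names and shapes are illustrative, and the tensors are already time-major) of the usual masking pattern with theano.scan: the mask is passed as an extra sequence and T.switch carries the previous hidden state through padded time steps.

import numpy as np
import theano
import theano.tensor as T

num_units = 4
X = T.tensor3('X')   # (n_time_steps, n_batch, n_features), here n_features = 3
M = T.matrix('M')    # (n_time_steps, n_batch); 1 = real step, 0 = padding
W = theano.shared(np.random.randn(3, num_units).astype(theano.config.floatX))

def step_masked(x_n, mask_n, hid_previous):
    hid = T.tanh(T.dot(x_n, W) + hid_previous)
    # carry the previous hidden state through padded positions
    return T.switch(mask_n.dimshuffle(0, 'x'), hid, hid_previous)

hid_init = T.zeros((X.shape[1], num_units))
hids, _ = theano.scan(step_masked, sequences=[X, M],
                      outputs_info=[hid_init])
f = theano.function([X, M], hids[-1])   # final hidden state per batch element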