def test_unroll_scan():
    from lasagne.utils import unroll_scan
    k = 2
    a = T.scalar("a")
    result = unroll_scan(fn=lambda step, prior_result, a: prior_result * a,
                         sequences=T.arange(k), outputs_info=[1.],
                         non_sequences=[a], n_steps=k)
    final_result = result[-1]
    power = theano.function(inputs=[a], outputs=final_result)
    assert np.all(power(10) == [10, 100])

    b = T.scalar("b")

    def mul_div(step, previous_mul, previous_div, mul, div):
        return previous_mul * mul, previous_div / div

    result = unroll_scan(fn=mul_div, sequences=T.arange(k),
                         outputs_info=[1., 1.], non_sequences=[a, b],
                         n_steps=k)
    power = theano.function(inputs=[a, b], outputs=result)
    assert np.allclose(power(10, 10), [[10, 100], [.1, .01]])
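
# Illustrative sketch (not part of the test suite): a plain-Python mirror of the
# recurrence the first unroll_scan call above builds, to make explicit that the
# unrolled graph returns the full history of intermediate results, not just the
# final power.
import numpy as np

def unrolled_power(a, k, prior=1.0):
    results = []
    for _ in range(k):
        prior = prior * a
        results.append(prior)
    return np.array(results)

assert np.all(unrolled_power(10, 2) == [10, 100])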
def get_output_for(self, inputs, **kwargs):
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask, hidden state and cell state when they are supplied
    mask = None
    hid_init = None
    cell_init = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]
    if self.cell_init_incoming_index > 0:
        cell_init = inputs[self.cell_init_incoming_index]

    # Treat all dimensions after the second as flattened feature dimensions
    if input.ndim > 3:
        input = T.flatten(input, 3)

    # if self.bn:
    #     input = self.bn.get_output_for(input)

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, 2)
    seq_len, num_batch, _ = input.shape

    # Stack input weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    W_in_stacked = T.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden weight matrices
    W_hid_stacked = T.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack biases into a (4*num_units) vector
    b_stacked = T.concatenate(
        [self.b_ingate, self.b_forgetgate,
         self.b_cell, self.b_outgate], axis=0)

    # Stack second order term gating biases into a (4*num_units) vector
    a_g_stacked = T.concatenate(
        [self.a_g_ingate, self.a_g_forgetgate,
         self.a_g_cell, self.a_g_outgate], axis=0)

    # Stack first order input-to-hidden gating biases into a
    # (4*num_units) vector
    b_g_in_to_hid_stacked = T.concatenate(
        [self.b_g_in_to_hid_ingate, self.b_g_in_to_hid_forgetgate,
         self.b_g_in_to_hid_cell, self.b_g_in_to_hid_outgate], axis=0)

    # Stack first order hidden-to-hidden gating biases into a
    # (4*num_units) vector
    b_g_hid_to_hid_stacked = T.concatenate(
        [self.b_g_hid_to_hid_ingate, self.b_g_hid_to_hid_forgetgate,
         self.b_g_hid_to_hid_cell, self.b_g_hid_to_hid_outgate], axis=0)

    if self.precompute_input:
        # Because the input is given for all time steps, we can
        # precompute the input dot the weight matrices before scanning.
        # W_in_stacked is (n_features, 4*num_units); input is then
        # (n_time_steps, n_batch, 4*num_units).
        input = T.dot(input, W_in_stacked)
        if self.bn:
            print('Using batch normalization')
            input = self.bn.get_output_for(input.dimshuffle(1, 0, 2))
            input = input.dimshuffle(1, 0, 2)

    # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
    # We define a slicing function that extracts the input to each LSTM gate
    def slice_w(x, n):
        s = x[:, n * self.num_units:(n + 1) * self.num_units]
        if self.num_units == 1:
            s = T.addbroadcast(s, 1)  # Theano cannot infer this by itself
        return s

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, cell_previous, hid_previous, *args):
        # Compute the input-to-hidden activation
        if not self.precompute_input:
            input_n = T.dot(input_n, W_in_stacked)

        # Compute the hidden-to-hidden activation
        hid_to_hid = T.dot(hid_previous, W_hid_stacked)

        # Compute the second order term
        second_order_term = a_g_stacked * input_n * hid_to_hid

        # Compute the first order input-to-hidden term
        f_o_in_to_hid = b_g_in_to_hid_stacked * input_n

        # Compute the first order hidden-to-hidden term
        f_o_hid_to_hid = b_g_hid_to_hid_stacked * hid_to_hid

        # Calculate gate pre-activations and slice
        gates = (second_order_term + f_o_in_to_hid +
                 f_o_hid_to_hid + b_stacked)

        # Clip gradients
        if self.grad_clipping:
            gates = theano.gradient.grad_clip(
                gates, -self.grad_clipping, self.grad_clipping)

        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        cell_input = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        cell_input = self.nonlinearity_cell(cell_input)

        # Compute new cell value
        cell = forgetgate * cell_previous + ingate * cell_input
        outgate = self.nonlinearity_outgate(outgate)

        # Compute new hidden unit activation
        hid = outgate * self.nonlinearity(cell)
        return [cell, hid]

    def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
        cell, hid = step(input_n, cell_previous, hid_previous, *args)

        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        cell = T.switch(mask_n, cell, cell_previous)
        hid = T.switch(mask_n, hid, hid_previous)
        return [cell, hid]

    if mask is not None:
        # mask is given as (batch_size, seq_len). Because scan iterates
        # over the first dimension, we dimshuffle to (seq_len, batch_size)
        # and add a broadcastable dimension
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [input, mask]
        step_fun = step_masked
    else:
        sequences = input
        step_fun = step

    ones = T.ones((num_batch, 1))
    if not isinstance(self.cell_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        cell_init = T.dot(ones, self.cell_init)

    if not isinstance(self.hid_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        hid_init = T.dot(ones, self.hid_init)

    # The hidden-to-hidden weight matrix, the gating parameters and the
    # bias are always used in this step
    non_seqs = [W_hid_stacked, a_g_stacked, b_g_in_to_hid_stacked,
                b_g_hid_to_hid_stacked, b_stacked]

    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights to the step function
    if not self.precompute_input:
        non_seqs += [W_in_stacked]

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        cell_out, hid_out = unroll_scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init],
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=input_shape[1])
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        cell_out, hid_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init],
            go_backwards=self.backwards,
            truncate_gradient=self.gradient_steps,
            non_sequences=non_seqs,
            strict=True)[0]

    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        hid_out = hid_out[-1]
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

    return hid_out
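
# Hedged NumPy sketch (illustration only, not library code) of the gate
# pre-activation computed in step() above: a second-order (multiplicative)
# term scaled by a_g plus two first-order terms scaled by the b_g_* gating
# biases, then sliced into the four gate blocks in the same order the weight
# matrices were stacked (ingate, forgetgate, cell, outgate).
import numpy as np

def gate_preactivations(x_proj, h_proj, a_g, b_g_in, b_g_hid, b, num_units):
    # x_proj = x_t . W_in_stacked, h_proj = h_{t-1} . W_hid_stacked,
    # both of shape (batch, 4*num_units)
    gates = a_g * x_proj * h_proj + b_g_in * x_proj + b_g_hid * h_proj + b
    return [gates[:, n * num_units:(n + 1) * num_units] for n in range(4)]

_num_units = 3
_x = np.ones((2, 4 * _num_units))
_h = np.ones((2, 4 * _num_units))
_params = [np.full(4 * _num_units, v) for v in (0.5, 0.1, 0.2, 0.0)]
_ingate, _forgetgate, _cell_in, _outgate = gate_preactivations(
    _x, _h, *_params, num_units=_num_units)
assert _ingate.shape == (2, _num_units)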
def get_output_for(self, inputs, mask=None, **kwargs): """ Compute this layer's output function given a symbolic input variable. Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied hid_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] # Input should be provided as (n_batch, n_time_steps, n_features) # but scan requires the iterable dimension to be first # So, we need to dimshuffle to (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, *range(2, input.ndim)) seq_len, num_batch = input.shape[0], input.shape[1] if self.precompute_input: # Because the input is given for all time steps, we can precompute # the inputs to hidden before scanning. First we need to reshape # from (seq_len, batch_size, trailing dimensions...) to # (seq_len*batch_size, trailing dimensions...) # This strange use of a generator in a tuple was because # input.shape[2:] was raising a Theano error trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim)) input = T.reshape(input, (seq_len*num_batch,) + trailing_dims) input = helper.get_output( self.input_to_hidden, input, **kwargs) # Reshape back to (seq_len, batch_size, trailing dimensions...) trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim)) input = T.reshape(input, (seq_len, num_batch) + trailing_dims) # We will always pass the hidden-to-hidden layer params to step non_seqs = helper.get_all_params(self.hidden_to_hidden) # When we are not precomputing the input, we also need to pass the # input-to-hidden parameters to step if not self.precompute_input: non_seqs += helper.get_all_params(self.input_to_hidden) # Create single recurrent computation step function def step(input_n, hid_previous, *args): # Compute the hidden-to-hidden activation hid_pre = helper.get_output( self.hidden_to_hidden, hid_previous, **kwargs) # If the dot product is precomputed then add it, otherwise # calculate the input_to_hidden values and add them if self.precompute_input: hid_pre += input_n else: hid_pre += helper.get_output( self.input_to_hidden, input_n, **kwargs) # Clip gradients if self.grad_clipping: hid_pre = theano.gradient.grad_clip( hid_pre, -self.grad_clipping, self.grad_clipping) return self.nonlinearity(hid_pre) def step_masked(input_n, mask_n, hid_previous, *args): # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. 
hid = step(input_n, hid_previous, *args) hid_out = T.switch(mask_n, hid, hid_previous) return [hid_out] if mask is not None: mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step if not isinstance(self.hid_init, Layer): # The code below simply repeats self.hid_init num_batch times in # its first dimension. Turns out using a dot product and a # dimshuffle is faster than T.repeat. dot_dims = (list(range(1, self.hid_init.ndim - 1)) + [0, self.hid_init.ndim - 1]) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init.dimshuffle(dot_dims)) if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: # Scan op iterates over first dimension of input and repeatedly # applies the step function hid_out = theano.scan( fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim)) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
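
# Side note (illustration only): the hid_init broadcasting trick used above.
# For a (1, num_units) initial state, dotting a (num_batch, 1) column of ones
# against it tiles it across the batch; the comment in the layer notes this is
# faster as a Theano graph than T.repeat. A minimal 2-D NumPy check:
import numpy as np

_num_batch, _num_units = 4, 3
_hid_init = np.arange(_num_units, dtype=float).reshape(1, _num_units)
_tiled = np.dot(np.ones((_num_batch, 1)), _hid_init)
assert np.array_equal(_tiled, np.repeat(_hid_init, _num_batch, axis=0))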
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # TLSTM: Define new input time_mat = inputs[self.time_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) if self.bn: input = self.bn.get_output_for(input) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) #(n_time_steps, n_batch) time_input = time_mat.dimshuffle(1, 0, 'x') time_seq_len, time_num_batch, _ = time_input.shape seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 5*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_cell, self.W_in_to_outgate, self.W_x2_to_tg2, self.W_x1_to_tg1], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_ingate, self.W_hid_to_cell, self.W_hid_to_outgate], axis=1) # Stack biases into a (5*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_cell, self.b_outgate, self.b2_tg2, self.b1_tg1], axis=0) # W_t1_to_tg1_constraint < 0 W_t1_to_tg1_constraint = T.switch(T.ge(self.W_t1_to_tg1, self.boundary), self.W_t1_to_tg1, self.boundary) # Stack delta time weight matrices into a (num_inputs, 2* num_units) W_t_stacked = T.concatenate([ self.W_to_to_outgate, self.W_t2_to_tg2, W_t1_to_tg1_constraint ], axis=1) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). time_input = T.dot(time_input, W_t_stacked) input = T.dot(input, W_in_stacked) + b_stacked # When theano.scan calls step, input_n will be (n_batch, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, start, stride=1): return x[:, start*self.num_units:(start+stride)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input # todo # insert Tm_n, weight_t_o_n in to mask_n and xell_previous def step(input_n, time_input_n, cell_previous, hid_previous, *args): if not self.precompute_input: time_input_n = T.dot(time_input_n, W_t_stacked) input_n = T.dot(input_n, W_in_stacked) + b_stacked tm_wto_n = slice_w(time_input_n, 0) tm_w2_n = slice_w(time_input_n, 1) tm_w1_n = slice_w(time_input_n, 2) tm_w2_n = self.nonlinearity_inside_tg2(tm_w2_n) tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n) tm2_xwb_n = slice_w(input_n, 3) tm1_xwb_n = slice_w(input_n, 4) timegate2 = self.nonlinearity_outside_tg2(tm_w2_n + tm2_xwb_n) timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n) input_n = slice_w(input_n, 0, 3) # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) cell_input = slice_w(gates, 1) outgate = slice_w(gates, 2) outgate += tm_wto_n if self.peepholes: # Compute peephole connections ingate += cell_previous*self.W_cell_to_ingate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = (1 - ingate)*cell_previous + ingate*timegate2*cell_input tilde_cell = (1 - ingate*timegate1)*cell_previous + ingate*timegate1*cell_input if self.peepholes: outgate += tilde_cell*self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(tilde_cell) return [cell, hid] def step_masked(input_n, time_input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, time_input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, time_input, mask] step_fun = step_masked else: sequences = [input, time_input] step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_outgate] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked, W_t_stacked] else: pass if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
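
# Side note (illustration only): the W_t1_to_tg1_constraint switch above acts
# as an element-wise clamp, replacing entries below self.boundary with the
# boundary value (i.e. an element-wise maximum). NumPy equivalent:
import numpy as np

def clamp_at_boundary(W, boundary):
    return np.where(W >= boundary, W, boundary)

assert np.allclose(clamp_at_boundary(np.array([-0.5, -0.01, 0.2]), -0.1),
                   [-0.1, -0.01, 0.2])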
def get_output_for(self, inputs, deterministic=False, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When the cell state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When both the cell state and the hidden state are being pre-filled `inputs[-2]` is the hidden state, while `inputs[-1]` is the cell state. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # PHASED LSTM: Define new input time_mat = inputs[self.time_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) if self.bn: input = self.bn.get_output_for(input) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) # PHASED LSTM: Get shapes for time input and rearrange for the scan fn time_input = time_mat.dimshuffle(1,0) time_seq_len, time_num_batch = time_input.shape seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) # PHASED LSTM: If test time, off-phase means really shut. if deterministic: print('Using true off for testing.') off_slope = 0.0 else: print('Using {} for off_slope.'.format(self.off_alpha)) off_slope = self.off_alpha # PHASED LSTM: Pregenerate broadcast vars. # Same neuron in different batches has same shift and period. Also, # precalculate the middle (on_mid) and end (on_end) of the open-phase # ramp. 
shift_broadcast = self.shift_timegate.dimshuffle(['x',0]) period_broadcast = T.abs_(self.period_timegate.dimshuffle(['x',0])) on_mid_broadcast = T.abs_(self.on_end_timegate.dimshuffle(['x',0])) * 0.5 * period_broadcast on_end_broadcast = T.abs_(self.on_end_timegate.dimshuffle(['x',0])) * period_broadcast if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). # We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n*self.num_units:(n+1)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, time_input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous*self.W_cell_to_ingate forgetgate += cell_previous*self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Mix in new stuff cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(cell) return [cell, hid] # PHASED LSTM: The actual calculation of the time gate def calc_time_gate(time_input_n): # Broadcast the time across all units t_broadcast = time_input_n.dimshuffle([0,'x']) # Get the time within the period in_cycle_time = T.mod(t_broadcast + shift_broadcast, period_broadcast) # Find the phase is_up_phase = T.le(in_cycle_time, on_mid_broadcast) is_down_phase = T.gt(in_cycle_time, on_mid_broadcast)*T.le(in_cycle_time, on_end_broadcast) # Set the mask sleep_wake_mask = T.switch(is_up_phase, in_cycle_time/on_mid_broadcast, T.switch(is_down_phase, (on_end_broadcast-in_cycle_time)/on_mid_broadcast, off_slope*(in_cycle_time/period_broadcast))) return sleep_wake_mask # PHASED LSTM: Mask the updates based on the time phase def step_masked(input_n, time_input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, time_input_n, cell_previous, hid_previous, *args) # Get time gate openness sleep_wake_mask = calc_time_gate(time_input_n) # Sleep if off, otherwise stay a bit on cell = sleep_wake_mask*cell + (1.-sleep_wake_mask)*cell_previous hid = sleep_wake_mask*hid + (1.-sleep_wake_mask)*hid_previous # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') else: mask = T.ones_like(time_input).dimshuffle(0,1,'x') sequences = [input, time_input, mask] step_fun = step_masked ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) non_seqs = [W_hid_stacked, self.period_timegate, self.shift_timegate, self.on_end_timegate] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
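
# Hedged NumPy sketch (illustration only) of calc_time_gate above: each unit
# has its own period, phase shift and on-ratio; within the "on" window the
# gate ramps linearly up to 1 and back down, and outside it leaks with
# off_slope. Parameter names mirror the timegate parameters of the layer.
import numpy as np

def sleep_wake_mask(t, shift, period, on_end_ratio, off_slope):
    # t: (batch,) timestamps; shift/period/on_end_ratio: (num_units,)
    t = t[:, None]
    period = np.abs(period)[None, :]
    on_end = np.abs(on_end_ratio)[None, :] * period
    on_mid = 0.5 * on_end
    in_cycle = np.mod(t + shift[None, :], period)
    up = in_cycle <= on_mid
    down = (in_cycle > on_mid) & (in_cycle <= on_end)
    return np.where(up, in_cycle / on_mid,
                    np.where(down, (on_end - in_cycle) / on_mid,
                             off_slope * in_cycle / period))

_m = sleep_wake_mask(np.array([1.0]), np.zeros(1), np.array([4.0]),
                     np.array([0.5]), 0.01)
assert np.isclose(_m[0, 0], 1.0)  # fully open at the middle of the on-phase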
def get_output_for(self, inputs, **kwargs): """ Have to re-write LSTMLayer's output construction because we need cell_out, which is not stored in the original """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). # We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n*self.num_units:(n+1)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous*self.W_cell_to_ingate forgetgate += cell_previous*self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(cell) return [cell, hid] def step_masked(input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. not_mask = 1 - mask_n cell = cell*mask_n + cell_previous*not_mask hid = hid*mask_n + hid_previous*not_mask return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if isinstance(self.cell_init, Layer): pass elif isinstance(self.cell_init, T.TensorVariable): cell_init = self.cell_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if isinstance(self.hid_init, Layer): pass elif isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] cell_out = cell_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) cell_out = cell_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] cell_out = cell_out[:, ::-1] return T.concatenate([cell_out, hid_out], axis=2)
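
# Usage note (illustration only): because this variant returns cell and hidden
# states concatenated on the feature axis, a downstream consumer would split
# the (n_batch, n_time_steps, 2*num_units) output like this:
import numpy as np

_num_units = 5
_combined = np.zeros((2, 7, 2 * _num_units))
_cell_out = _combined[:, :, :_num_units]
_hid_out = _combined[:, :, _num_units:]
assert _cell_out.shape == _hid_out.shape == (2, 7, _num_units)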
def get_sessions(self, environment = None, session_length = 10, batch_size = None, initial_env_states = 'zeros', initial_observations = 'zeros', initial_state_variables = 'zeros', **flags ): """returns history of agent interaction with environment for given number of turns: parameters: environment - an environment to interact with (BaseEnvironment instance) session_length - how many turns of interaction shall there be for each batch batch_size - [required parameter] amount of independed sessions [number or symbolic].Irrelevant if you manually set all initial_*. initial_<something> - initial values for all variables at 0-th time step Unless you are doing something nasty, initial policy (qvalues) and actions will not matter at all 'zeros' default means filling variable with zeros Initial values are NOT included in history sequences additional_output_layers - any layers of a network which outputs need to be added to the outputs flags: optional flags to be sent to NN when calling get_output (e.g. deterministic = True) returns: state_seq,observation_seq,hidden_seq,policy_seq,action_seq, [additional_output_0, additional_output_1] for environment state, observation, hidden state, agent policy and chosen actions respectively each of them having dimensions of [batch_i,seq_i,...] time synchronization policy: state_seq,observation_seq correspond to observation BASED ON WHICH agent generated hidden_seq,policy_seq,action_seq """ env = environment #assert that environment is None if and only if there are no observations assert (env is None) == (len(self.observation_layers) == 0) if env is not None: if initial_env_states == 'zeros': initial_env_states = [T.zeros([batch_size,size]) for size in check_list(env.state_size)] else: initial_env_states = check_list(initial_env_states) if initial_observations == 'zeros': initial_observations = [T.zeros((batch_size,)+tuple(obs_layer.shape[1:])) for obs_layer in self.observation_layers] else: initial_observations = check_list(initial_observations) else: initial_env_states = initial_observations = [] if initial_state_variables == 'zeros': initial_state_variables = [] for memory in self.state_variables: state_shape = lasagne.layers.get_output_shape(memory)[1:] #drom batch_i dimension initial_state = T.zeros((batch_size,)+tuple(state_shape)) initial_state_variables.append(initial_state) #recurrent step function #during SCAN, time synchronization is reverse: state_1 came after action_1 based on observation_0 from state_0 def step(time_tick,*args): #slice previous: they contain #[*env_states_if_any, *observations, *state_variables, *prev_actions, *prev_outputs, *rubbish] # we only need env state, prev observation and agent state to iterate on if env is not None: n_env_states = len(check_list(env.state_size)) else: n_env_states = 0 n_observations = len(self.observation_layers) n_memories = len(self.state_variables) env_states,observations,prev_agent_states = unpack_list(args,n_env_states,n_observations,n_memories) prev_states_dict = OrderedDict(zip(self.state_variables.keys(),prev_agent_states)) new_actions,new_agent_states,new_outputs = self.get_agent_reaction(prev_states_dict,observations,**flags) if env is not None: new_env_states,new_observations = env.get_action_results(env_states,new_actions,time_tick) new_env_states = check_list(new_env_states) new_observations = check_list(new_observations) else: new_env_states = new_observations = [] return new_env_states + new_observations + new_agent_states + new_actions + new_outputs #main recurrent loop configuration 
outputs_info = initial_env_states+initial_observations + initial_state_variables+\ [None]*(len(self.action_layers)+len(self.tracked_outputs)) time_ticks = T.arange(session_length) sequences = [time_ticks] history = unroll_scan(step, sequences = sequences, outputs_info = outputs_info, non_sequences = [], n_steps = session_length ) #for the record self.last_history = history #from [time,batch,...] to [batch,time,...] history = [ (var.swapaxes(1,0) if var.ndim >1 else var) for var in history] groups = unpack_list(history, len(initial_env_states),len(self.observation_layers), len(self.state_variables),len(self.action_layers), len(self.tracked_outputs)) env_state_sequences, observation_sequences, agent_state_sequences,\ action_sequences, output_sequences = groups agent_state_dict = OrderedDict(zip(self.state_variables.keys(),agent_state_sequences)) #allign time axes: actions come AFTER states with the same index #add first env turn, crop to session length env_state_sequences = [ T.concatenate([insert_dim(initial_env_state,1), state_seq[:,:-1]],axis=1) for state_seq, initial_env_state in zip(env_state_sequences, initial_env_states) ] observation_seqs = [ T.concatenate([insert_dim(initial_observation,1), observation_seq[:,:-1]],axis=1) for observation_seq, initial_observation in zip(observation_sequences, initial_observations) ] return env_state_sequences, observation_sequences, agent_state_dict,action_sequences, output_sequences
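
# Hedged sketch (illustration only, feature dimensions dropped) of the time
# re-alignment done above for env states and observations: the initial value
# is prepended along the time axis and the last step is cropped, so that
# element t holds the state the agent observed *before* acting at tick t.
import numpy as np

def align_with_initial(initial, seq):
    # initial: (batch,), seq: (batch, time)
    return np.concatenate([initial[:, None], seq[:, :-1]], axis=1)

_seq = np.arange(6).reshape(1, 6)
_init = np.array([-1])
assert np.array_equal(align_with_initial(_init, _seq), [[-1, 0, 1, 2, 3, 4]])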
def get_output_for(self, inputs, accumulate_updates="warn",recurrence_flags={}, **kwargs): """ returns history of agent interaction with environment for given number of turns. parameters: inputs - [state init] + [input_nonsequences] + [input_sequences] Each part is a list of theano expressions for layers in the order they were provided when creating this layer. recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports) e.g. {deterministic=True} returns: [state_sequences] + [output sequences] - a list of all states and all outputs sequences Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...] """ #aliases n_states = len(self.state_variables) n_state_inits = len(self.state_init) n_input_nonseq = len(self.input_nonsequences) n_input_seq = len(self.input_sequences) n_outputs = len(self.tracked_outputs) #slice inputs if self.mask_input is not None: mask,inputs = inputs[0],inputs[1:] initial_states_provided, nonsequences, sequences = unpack_list(inputs, [n_state_inits, n_input_nonseq, n_input_seq]) # infer batch size if self.batch_size is not None: batch_size = self.batch_size elif len(inputs) != 0: batch_size = inputs[0].shape[0] else: raise ValueError("Need to set batch_size explicitly for recurrence") #here we create outputs_info for scan, basically initial values for states and outputs ## initial states that are given as input initial_states_provided = OrderedDict(list(zip(self.state_init, initial_states_provided))) def get_initial_state(layer, batch_size=batch_size): """Pick dedicated initial state or create zeros of appropriate shape and dtype :param layer: layer for new hidden state (key of self.state_variables) :param batch_size: symbolic batch_size """ # if we have a dedicated init, use it if layer in initial_states_provided: initial_state = initial_states_provided[layer] # otherwise initialize with zeros else: assert None not in layer.output_shape[1:],\ "Some of your state layers ({}) has undefined shape along non-batch dimension. (shape: {}) " \ "Therefore, it's initial value can't be inferred. Please set explicit initial value via state_init" \ "".format(layer.name or layer, layer.output_shape) dtype = get_layer_dtype(layer) initial_state = T.zeros((batch_size,) + tuple(layer.output_shape[1:]), dtype=dtype) #disable broadcasting along all axes (lasagne outputs are non-broadcastable) initial_state = T.unbroadcast(initial_state, *range(initial_state.ndim)) return initial_state initial_states = list(map(get_initial_state, self.state_variables)) # dummy initial values for tracked_outputs. # We need to provide them for step_masked to be able to backtrack to them. Also unroll scan requires them. # Initial shapes for outputs are inferred by calling get_one_step and taking shapes from it. # Theano optimizes shape computation without computing get_out_step outputs themselves # the resulting graph would be like (var1.shape[0],var1.shape[2]*3,10) so this operation is zero-cost. 
state_feed_dict = dict(zip(self.state_variables.keys(),initial_states)) input_feed_dict = dict(zip(list(chain(self.input_nonsequences.keys(), self.input_sequences.keys())), list(chain(nonsequences,[seq[:,0] for seq in sequences])))) initial_output_fillers = self.get_one_step(state_feed_dict,input_feed_dict,**recurrence_flags)[1] # disable broadcasting of zeros_like(v) along all axes (since lasagne outputs are non-broadcastable) initial_output_fillers = [T.unbroadcast(T.zeros_like(v),*range(v.ndim)) for v in initial_output_fillers] #/end of that nonsense #stack all initializers together outputs_info = initial_states + initial_output_fillers # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan sequences = [seq.swapaxes(1, 0) for seq in sequences] # recurrent step function def step(*args): sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) # make dicts of prev_states and inputs prev_states_dict = OrderedDict(zip(list(self.state_variables.keys()), prev_states)) input_layers = list(chain(self.input_nonsequences.keys(), self.input_sequences.keys())) assert len(input_layers) == len(nonsequences + sequence_slices) inputs_dict = OrderedDict(zip(input_layers, nonsequences + sequence_slices)) # call one step recurrence new_states, new_outputs = self.get_one_step(prev_states_dict, inputs_dict, **recurrence_flags) #make sure new state variables are of exactly the same type as their initial value state_names = [layer.name or str(layer) for layer in list(self.state_variables.keys())] for i in range(len(state_names)): try: if self.force_cast_types: new_states[i] = new_states[i].astype(prev_states[i].dtype) new_states[i] = cast_to_type(new_states[i],get_type(prev_states[i])) except: raise ValueError("Could not convert new state {}, of type {}, to it's previous/initial state type " "{}. Cast type manually or set force_cast_types=True on creation." "".format(state_names[i],get_type(new_states[i]),get_type(prev_states[i]))) #make sure output variables are of exactly the same type as their initial value output_names = [layer.name or str(layer) for layer in self.tracked_outputs] for i in range(len(output_names)): try: if self.force_cast_types: new_outputs[i] = new_outputs[i].astype(prev_outputs[i].dtype) new_outputs[i] = cast_to_type(new_outputs[i],get_type(prev_outputs[i])) except: raise ValueError("Could not convert output of {}, of type {}, to it's previous/initial state type " "{}. Cast type manually or set force_cast_types=True on creation." 
"".format(output_names[i],get_type(new_outputs[i]),get_type(prev_outputs[i]))) return new_states + new_outputs ###handling mask_input### #a step function that utilizes a mask def step_masked(mask_t,*args): #unpack arrays sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) #get regular step new_states_and_outputs = step(*args) old_states_and_outputs = prev_states+prev_outputs #if mask_t, return new ones, else return old ones def apply_mask(mask_t,new_state,old_state): assert new_state.ndim == old_state.ndim ndim = new_state.ndim #append dims to mask pattern = list(range(mask_t.ndim)) + ['x'] * (ndim - mask_t.ndim) return T.switch(mask_t.dimshuffle(pattern), new_state, old_state) next_states_and_outputs = [apply_mask(mask_t,new_state,old_state) for new_state,old_state in zip(new_states_and_outputs, old_states_and_outputs)] return next_states_and_outputs if self.mask_input is not None: sequences = [mask.swapaxes(1, 0)]+sequences step_function = step_masked else: step_function = step #scan itself if self.unroll_scan: # call scan itself history = unroll_scan(step_function, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps ) #if explicitly asked to reset updates, do so if accumulate_updates == False: self.updates=OrderedUpdates() else: history,updates = theano.scan(step_function, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps ) if accumulate_updates in (True,'warn'): self.updates += updates else:#replace updates self.updates = updates #check if user received last updates if not self._updates_received and accumulate_updates=='warn': warn("You called get_output from recurrence several times without gathering the updates.\n" "(A) If you wanted to get two outputs from recurrence, use NOT\n" ">>>out1 = get_output(rec[layer1])\n" ">>>out2 = get_output(rec[layer2])\n" "but instead:\n" ">>>out1,out2 = get_output((rec[layer1],rec[layer2])) #or rec[layer1,layer2].\n" "(B) If you want to run recurrence several times and accumulate updates from all runs," "use get_output(...,accumulate_updates=True) to silence the warning.\n" "(C) If you want to get rid of old updates, use get_output(...,accumulate_updates=False)\n" ) if len(self.updates) !=0: self._updates_received=False warn("Recurrent loop without unroll_scan got nonempty random state updates list. That happened" " because there is some source of randomness (e.g. dropout) inside recurrent step graph." " To compile such graph, one must either call .get_automatic_updates() right after .get_output" " and pass these updates to a function when compiling theano.function.",verbosity_level=2) # reordering from [time,batch,...] to [batch,time,...] history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in check_list(history)] assert len(history) == n_states+n_outputs state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs]) # handle delayed_states # selectively shift state sequences by 1 tick into the past, padding with their initialisations for i in range(len(state_seqs)): if list(self.state_variables.keys())[i] in self.delayed_states: state_seq = state_seqs[i] state_init = initial_states[i] state_seq = T.concatenate([insert_dim(state_init, 1), state_seq[:, :-1]], axis=1) state_seqs[i] = state_seq #keys corresponding to output sequences. 
Note that we do not use self.keys() to correctly # handle cases where some variable is present in both state_variables and tracked_outputs output_keys = list(self.state_variables.keys()) + list(self.tracked_outputs) output_values = state_seqs + output_seqs assert len(output_keys) == len(output_values) return OrderedDict(zip(output_keys,output_values))
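
# Hedged NumPy sketch (illustration only) of the apply_mask helper used in
# step_masked above: the (batch,) mask is broadcast against states of
# arbitrary rank, keeping the old state wherever the mask is 0.
import numpy as np

def apply_mask(mask_t, new_state, old_state):
    pattern = mask_t.reshape(mask_t.shape + (1,) * (new_state.ndim - mask_t.ndim))
    return np.where(pattern, new_state, old_state)

_mask = np.array([1, 0])
_new, _old = np.ones((2, 3)), np.zeros((2, 3))
assert np.array_equal(apply_mask(_mask, _new, _old), [[1, 1, 1], [0, 0, 0]])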
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] encoder_output = inputs[1] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) encoder_output = T.flatten(encoder_output, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) encoder_output = encoder_output.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1 ) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1 ) # Stack gate biases into a (3*num_units) vector b_stacked = T.concatenate([self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0) if self.precompute_input: # precompute_input inputs*W. W_in is (n_features, 3*num_units). # input is then (n_batch, n_time_steps, 3*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 3*num_units). 
# We define a slicing function that extract the input to each GRU gate def slice_w(x, n): return x[:, n * self.num_units : (n + 1) * self.num_units] # Create single recurrent computation step function # input__n is the n'th vector of the input def step( input_n, hid_previous, encoder_output, W_hid_stacked, W_in_stacked, b_stacked, W_att_enc, W_att_dec, W_att_out, W_out, b_out, ): # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1} hid_input = T.dot(hid_previous, W_hid_stacked) if self.grad_clipping is not False: input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping) hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping) if not self.precompute_input: # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c input_n = T.dot(input_n, W_in_stacked) + b_stacked # Reset and update gates resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) resetgate = self.nonlinearity_resetgate(resetgate) updategate = self.nonlinearity_updategate(updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update_in = slice_w(input_n, 2) hidden_update_hid = slice_w(hid_input, 2) hidden_update = hidden_update_in + resetgate * hidden_update_hid if self.grad_clipping is not False: hidden_update = theano.gradient.grad_clip(hidden_update, -self.grad_clipping, self.grad_clipping) hidden_update = self.nonlinearity_hid(hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t hid = (1 - updategate) * hid_previous + updategate * hidden_update # # Add the attention hid += self.attention(encoder_output, hid_previous, W_att_enc, W_att_dec, W_att_out) # Compute the probas probs = T.nnet.softmax(T.dot(hid, W_out) + b_out) return [hid, probs] sequences = [input] step_fun = step if isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [encoder_output, W_hid_stacked] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [ W_in_stacked, b_stacked, self.W_att_enc, self.W_att_dec, self.W_att_out, self.W_out, self.b_out, ] # theano.scan only allows for positional arguments, so when # self.precompute_input is True, we need to supply fake placeholder # arguments for the input weights and biases. else: non_seqs += [(), (), self.W_att_enc, self.W_att_dec, self.W_att_out, self.W_out, self.b_out] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan out, _ = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1], ) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function out, _ = theano.scan( fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init, None], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True, ) # dimshuffle back to (n_batch, n_time_steps, n_features)) # hid_out = hid_out[0].dimshuffle(1, 0, 2) s_out = out[1] # # if scan is backward reverse the output # if self.backwards: # out = out[:, ::-1, :] return s_out
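
# Hedged NumPy sketch (illustration only) of the per-step readout used in
# step() above, probs = softmax(hid . W_out + b_out), normalised over the
# output vocabulary for each batch row.
import numpy as np

def softmax_readout(hid, W_out, b_out):
    logits = hid.dot(W_out) + b_out
    logits -= logits.max(axis=1, keepdims=True)  # numerical stability
    e = np.exp(logits)
    return e / e.sum(axis=1, keepdims=True)

_probs = softmax_readout(np.random.randn(2, 4), np.random.randn(4, 10),
                         np.zeros(10))
assert np.allclose(_probs.sum(axis=1), 1.0)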
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- input : theano.TensorType Symbolic input variable. mask : theano.TensorType Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. If ``None``, then it is assumed that all sequences are of the same length. If not all sequences are of the same length, then it must be supplied as a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. Returns ------- layer_output : theano.TensorType Symblic output variable. """ input = inputs[0] # Retrieve the mask when it is supplied mask = inputs[1] if len(inputs) > 1 else None # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = input.reshape((input.shape[0], input.shape[1], T.prod(input.shape[2:]))) num_batch = input.shape[0] encode_seqlen = input.shape[1] if mask is None: mask = T.ones((num_batch, encode_seqlen),dtype='float32') # At each call to scan, input_n will be (n_time_steps, 4*num_units). # We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n*self.num_units:(n+1)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(cell_previous, hid_previous, alpha_prev, weighted_hidden_prev, input, mask, hUa, W_align, v_align, W_hid_stacked, W_weightedhid_stacked, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, b_stacked, *args): #compute (unormalized) attetion vector sWa = T.dot(hid_previous, W_align) # (BS, aln_num_units) sWa = sWa.dimshuffle(0, 'x', 1) # (BS, 1, aln_num_units) align_act = sWa + hUa tanh_sWahUa = self.nonlinearity_align(align_act) # (BS, seqlen, num_units_aln) # CALCULATE WEIGHT FOR EACH HIDDEN STATE VECTOR a = T.dot(tanh_sWahUa, v_align) # (BS, Seqlen, 1) a = T.reshape(a, (a.shape[0], a.shape[1])) # # (BS, Seqlen) # # ->(BS, seq_len) a = a*mask - (1-mask)*10000 alpha = self.attention_softmax_function(a) #alpha = T.reshape(alpha, (input.shape[0], input.shape[1])) # input: (BS, Seqlen, num_units) weighted_hidden = input * alpha.dimshuffle(0, 1, 'x') weighted_hidden = T.sum(weighted_hidden, axis=1) #sum seqlen out # Calculate gates pre-activations and slice # (BS, dec_hid) x (dec_hid, dec_hid) gates = T.dot(hid_previous, W_hid_stacked) + b_stacked # (BS, enc_hid) x (enc_hid, dec_hid) gates += T.dot(weighted_hidden, W_weightedhid_stacked) # Clip gradients if self.grad_clipping is not False: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous*W_cell_to_ingate forgetgate += cell_previous*W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) outgate = self.nonlinearity_outgate(outgate) # Compute new cell value cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*W_cell_to_outgate # W_align: (num_units, aln_num_units) # U_align: (num_feats, aln_num_units) # v_align: (aln_num_units, 1) # hUa: (BS, Seqlen, aln_num_units) # hid: (BS, num_units_dec) # input: 
(BS, Seqlen, num_inputs) # Compute new hidden unit activation hid = outgate*self.nonlinearity_out(cell) return [cell, hid, alpha, weighted_hidden] sequences = [] step_fun = step ones = T.ones((num_batch, 1)) if isinstance(self.cell_init, T.TensorVariable): cell_init = self.cell_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) #weighted_hidden_init = T.zeros((num_batch, input.shape[2])) alpha_init = T.zeros((num_batch, encode_seqlen)) weighted_hidden_init = T.zeros((num_batch, self.num_inputs)) # The hidden-to-hidden weight matrix is always used in step hUa = T.dot(input, self.U_align) # (num_batch, seq_len, num_units_aln) non_seqs = [input, mask, hUa, self.W_align, self.v_align, self.W_hid_stacked, self.W_weightedhid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate] # theano.scan only allows for positional arguments, so when # self.peepholes is False, we need to supply fake placeholder arguments # for the three peephole matrices. else: non_seqs += [(), (), ()] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function non_seqs += [self.b_stacked] if self.unroll_scan: # Explicitly unroll the recurrence instead of using scan cell_out, hid_out, alpha_out, weighted_hidden_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, alpha_init, weighted_hidden_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=self.n_decodesteps + self.decode_pre_steps) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out, alpha_out, weighted_hidden_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, alpha_init, weighted_hidden_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, n_steps=self.n_decodesteps + self.decode_pre_steps, strict=True)[0] # dimshuffle back to (n_batch, n_time_steps, n_features)) #a_out - (n_decodesteps, bs, seqlen) #hid_out - (n_decode_steps, bs, num_units) # mask: (BS, encode_seqlen) # a_out; (n_decodesteps, BS, encode_seqlen) cell_out = cell_out.dimshuffle(1, 0, 2) hid_out = hid_out.dimshuffle(1, 0, 2) # (BS, n_decodesteps, encode_seqlen) mask = mask.dimshuffle(0, 'x', 1) alpha_out = alpha_out.dimshuffle(1, 0, 2) # (BS, n_decodesteps, encode_seqlen) weighted_hidden_out = weighted_hidden_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] cell_out = cell_out[:, ::-1] weighted_hidden_out = weighted_hidden_out[:, ::-1] alpha_out = alpha_out[:, ::-1] if self.decode_pre_steps > 0: hid_out = hid_out[:, self.decode_pre_steps:] cell_out = cell_out[:, self.decode_pre_steps:] weighted_hidden_out = weighted_hidden_out[:, self.decode_pre_steps:] alpha_out = alpha_out[:, self.decode_pre_steps:] self.hid_out = hid_out self.cell_out = cell_out self.weighted_hidden_out = weighted_hidden_out self.alpha = alpha_out if self.return_decodehid: return hid_out else: return weighted_hidden_out
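# Illustrative numpy sketch (assumed shapes, hypothetical array names) of the additive
# attention computed in the step function above: scores are v_align . tanh(h_dec W_align
# + h_enc U_align), masked positions are pushed towards -inf before the softmax, and the
# encoder states are averaged with the resulting weights.
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

BS, seqlen, enc_units, dec_units, aln_units = 2, 5, 4, 3, 6
rng = np.random.RandomState(0)
h_enc = rng.randn(BS, seqlen, enc_units)
h_dec = rng.randn(BS, dec_units)
W_align = rng.randn(dec_units, aln_units)
U_align = rng.randn(enc_units, aln_units)
v_align = rng.randn(aln_units)
mask = np.array([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]], dtype=float)

hUa = h_enc @ U_align                            # (BS, seqlen, aln_units)
sWa = (h_dec @ W_align)[:, None, :]              # (BS, 1, aln_units)
scores = np.tanh(sWa + hUa) @ v_align            # (BS, seqlen)
scores = scores * mask - (1 - mask) * 10000      # mask out padded steps
alpha = softmax(scores, axis=1)                  # attention weights
weighted_hidden = (h_enc * alpha[:, :, None]).sum(axis=1)  # (BS, enc_units)
assert weighted_hidden.shape == (BS, enc_units)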
def get_output_for(self, inputs, deterministic=False, **kwargs): input = inputs[0] mask = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] hid_init = None if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] cell_init = None if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] if input.ndim > 3: input = T.flatten(input, 3) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape #### input #### W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate ], axis=1) #### hidden #### W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) #### bias #### b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) #### weight noise #### if self.weight_noise > 0 and deterministic is True: W_in_stacked += self.normal(size=W_in_stacked.shape, std=self.weight_noise) W_hid_stacked += self.normal(size=W_hid_stacked.shape, std=self.weight_noise) def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] #### set dropout mask #### if deterministic: self.using_dropout = False else: self.using_dropout = True cell_mask = self.binomial((num_batch, self.num_units), p=T.constant(1) - self.p, dtype=floatX) input = T.dot(input, W_in_stacked) + b_stacked def step(input_n, cell_previous, hid_previous, *args): gates = input_n + T.dot(hid_previous, W_hid_stacked) ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate if self.grad_clipping: ingate = theano.gradient.grad_clip(ingate, -self.grad_clipping, self.grad_clipping) forgetgate = theano.gradient.grad_clip(forgetgate, -self.grad_clipping, self.grad_clipping) cell_input = theano.gradient.grad_clip(cell_input, -self.grad_clipping, self.grad_clipping) ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value if self.using_dropout == False or self.p == 0: cell_input = cell_input else: one = T.constant(1) retain_prob = one - self.p cell_input /= retain_prob cell_input = cell_input * cell_mask # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate if self.grad_clipping: outgate = theano.gradient.grad_clip(outgate, -self.grad_clipping, self.grad_clipping) cell = theano.gradient.grad_clip(cell, -self.grad_clipping, self.grad_clipping) outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) hid = T.dot(hid, self.W_hid_projection) return [cell, hid] def step_masked(input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): hid_init = T.dot(ones, self.hid_init) non_seqs = [cell_mask, 
W_hid_stacked] non_seqs += [ self.W_hid_projection, ] if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] if self.unroll_scan: input_shape = self.input_shapes[0] cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], non_sequences=non_seqs, n_steps=input_shape[1]) else: cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] if self.only_return_final: hid_out = hid_out[-1] else: hid_out = hid_out.dimshuffle(1, 0, 2) if self.backwards: hid_out = hid_out[:, ::-1] if self.only_return_hidden: return hid_out else: if self.only_return_final: cell_out = cell_out[-1] else: cell_out = cell_out.dimshuffle(1, 0, 2) if self.backwards: cell_out = cell_out[:, ::-1] return T.concatenate([hid_out, cell_out], axis=-1)
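# Illustrative numpy sketch of the recurrent (per-sequence) dropout used above: a single
# binomial mask is drawn once and reused at every time step, and retained units are
# rescaled by 1/retain_prob (inverted dropout). Shapes and the 0.25 drop rate are
# arbitrary stand-ins.
import numpy as np

rng = np.random.RandomState(0)
num_batch, num_units, n_steps, p = 4, 8, 10, 0.25
retain_prob = 1.0 - p
cell_mask = rng.binomial(1, retain_prob, size=(num_batch, num_units)).astype(float)

cell = np.zeros((num_batch, num_units))
for t in range(n_steps):
    cell_input = rng.randn(num_batch, num_units)       # stand-in for the gated candidate
    cell_input = cell_input / retain_prob * cell_mask  # same mask at every step
    cell = 0.5 * cell + 0.5 * cell_input               # toy forget/input gates of 0.5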
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None avg_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # When theano.scan calls step, input_n will be (n_batch, 4*num_units). # We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, avg_previous, *args): x = input_n if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) avg_input = T.dot(x, self.W_avg1) + T.dot(hid, self.W_avg2) + self.b_avg if self.model_type == 1: avg = x * nonlinearities.sigmoid(avg_input) elif self.model_type == 6: avg = nonlinearities.tanh(avg_input) elif self.model_type == 7: avg_input = T.dot(x, self.W_avg1) * T.dot( hid, self.W_avg2) + self.b_avg avg = x * nonlinearities.sigmoid(avg_input) elif self.model_type == 2: avg = hid * nonlinearities.sigmoid(avg_input) elif self.model_type == 3: avg_input2 = T.dot(x, self.W_avg12) + T.dot( hid, self.W_avg22) + self.b_avg2 g1 = 
nonlinearities.sigmoid(avg_input) g2 = nonlinearities.sigmoid(avg_input2) avg = avg_previous * g1 + x * g2 elif self.model_type == 4: avg_input = T.dot( x, self.W_avg1) + T.dot(hid, self.W_avg2) + T.dot( avg_previous, self.W_avg3) + self.b_avg avg_input2 = T.dot( x, self.W_avg12) + T.dot(hid, self.W_avg22) + T.dot( avg_previous, self.W_avg32) + self.b_avg2 g1 = nonlinearities.sigmoid(avg_input) g2 = nonlinearities.sigmoid(avg_input2) avg = avg_previous * g1 + x * g2 elif self.model_type == 5: avg_input2 = T.dot(x, self.W_avg12) + T.dot( hid, self.W_avg22) + self.b_avg2 g1 = nonlinearities.sigmoid(avg_input) g2 = nonlinearities.sigmoid(avg_input2) avg = x * g1 havg = hid * g2 avg = avg + havg return [cell, hid, avg] def step_masked(input_n, mask_n, cell_previous, hid_previous, avg_previous, *args): cell, hid, avg = step(input_n, cell_previous, hid_previous, avg_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) avg = T.switch(mask_n, avg, avg_previous) return [cell, hid, avg] if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, lasagne.layers.Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, lasagne.layers.Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) if not isinstance(self.avg_init, lasagne.layers.Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) avg_init = T.dot(ones, self.avg_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked, self.W_avg1, self.W_avg2, self.b_avg] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.model_type == 3 or self.model_type == 5: non_seqs += [self.W_avg12, self.W_avg22, self.b_avg2] if self.model_type == 4: non_seqs += [ self.W_avg12, self.W_avg22, self.b_avg2, self.W_avg3, self.W_avg32 ] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out, avg_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, avg_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out, avg_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, avg_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: avg_out 
= avg_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) avg_out = avg_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: avg_out = avg_out[:, ::-1] return avg_out
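# Illustrative numpy sketch of the model_type == 1 branch above: the output is the input
# gated element-wise by a sigmoid of an affine function of the input and the hidden
# state. All names and sizes here are hypothetical.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
num_batch, num_feats, num_units = 2, 5, 3
x = rng.randn(num_batch, num_feats)
hid = rng.randn(num_batch, num_units)
W_avg1 = rng.randn(num_feats, num_feats)
W_avg2 = rng.randn(num_units, num_feats)
b_avg = np.zeros(num_feats)

avg = x * sigmoid(x @ W_avg1 + hid @ W_avg2 + b_avg)   # (num_batch, num_feats)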
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = inputs[1] if len(inputs) > 1 else None context = inputs[2] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape context = context.dimshuffle(1, 0, 2) # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update ], axis=1) W_tid_stacked = T.concatenate([ self.W_tid_to_resetgate, self.W_tid_to_updategate, self.W_tid_to_hidden_update ], axis=1) # Stack gate biases into a (3*num_units) vector b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0) if self.precompute_input: # precompute_input inputs*W. W_in is (n_features, 3*num_units). # input is then (n_batch, n_time_steps, 3*num_units). input = T.dot(input, W_in_stacked) + b_stacked context = T.dot(context, W_tid_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 3*num_units). # We define a slicing function that extract the input to each GRU gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input__n is the n'th vector of the input def step(input_n, context_n, hid_previous, W_hid_stacked, W_in_stacked, W_tid_stacked, b_stacked): # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1} hid_input = T.dot(hid_previous, W_hid_stacked) if self.grad_clipping is not False: input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping) context_n = theano.gradient.grad_clip(context_n, -self.grad_clipping, self.grad_clipping) hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping) if not self.precompute_input: # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c input_n = T.dot(input_n, W_in_stacked) + b_stacked context_n = T.dot(context_n, W_tid_stacked) + b_stacked # Reset and update gates resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) + slice_w( context_n, 0) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) + slice_w( context_n, 1) resetgate = self.nonlinearity_resetgate(resetgate) updategate = self.nonlinearity_updategate(updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update_in = slice_w(input_n, 2) hidden_update_hid = slice_w(hid_input, 2) hidden_update_tid = slice_w(context_n, 2) hidden_update = hidden_update_in + resetgate * hidden_update_hid + hidden_update_tid if self.grad_clipping is not False: hidden_update = theano.gradient.grad_clip( hidden_update, -self.grad_clipping, self.grad_clipping) hidden_update = self.nonlinearity_hid(hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t hid = (1 - updategate) * hid_previous + updategate * hidden_update return hid def step_masked(input_n, mask_n, context_n, hid_previous, W_hid_stacked, W_in_stacked, W_tid_stacked, b_stacked): hid = step(input_n, context_n, hid_previous, W_hid_stacked, W_in_stacked, W_tid_stacked, b_stacked) # Skip over any input with mask 0 by copying 
the previous # hidden state; proceed normally for any input with mask 1. not_mask = 1 - mask_n hid = hid * mask_n + hid_previous * not_mask return hid if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask, context] step_fun = step_masked else: sequences = [input, context] step_fun = step if isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, W_tid_stacked, b_stacked] # theano.scan only allows for positional arguments, so when # self.precompute_input is True, we need to supply fake placeholder # arguments for the input weights and biases. else: non_seqs += [(), (), ()] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: # Scan op iterates over first dimension of input and repeatedly # applies the step function hid_out = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1, :] return hid_out
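# Illustrative numpy sketch of the GRU update computed by the step functions above
# (without the extra context term): r and u are the reset/update gates, c is the
# candidate state, and the new state is a convex combination of h_prev and c. Weight
# shapes are hypothetical.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
num_batch, num_inputs, num_units = 2, 4, 3
x = rng.randn(num_batch, num_inputs)
h_prev = rng.randn(num_batch, num_units)
W_xr, W_xu, W_xc = (rng.randn(num_inputs, num_units) for _ in range(3))
W_hr, W_hu, W_hc = (rng.randn(num_units, num_units) for _ in range(3))

r = sigmoid(x @ W_xr + h_prev @ W_hr)         # reset gate
u = sigmoid(x @ W_xu + h_prev @ W_hu)         # update gate
c = np.tanh(x @ W_xc + r * (h_prev @ W_hc))   # candidate, reset applied to recurrent term
h = (1 - u) * h_prev + u * c                  # new hidden state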
def get_sessions(self, environment, session_length = 10, batch_size = None, initial_env_state = 'zeros',initial_observation = 'zeros',initial_hidden = 'zeros', additional_output_layers = [], **flags ): """returns history of agent interaction with environment for given number of turns: parameters: environment - an environment to interact with (BaseEnvironment instance) session_length - how many turns of interaction shall there be for each batch batch_size - [required parameter] amount of independed sessions [number or symbolic].Irrelevant if you manually set all initial_*. initial_<something> - initial values for all variables at 0-th time step Unless you are doing something nasty, initial policy (qvalues) and actions will not matter at all 'zeros' default means filling variable with zeros Initial values are NOT included in history sequences additional_output_layers - any layers of a network which outputs need to be added to the outputs flags: optional flags to be sent to NN when calling get_output (e.g. deterministic = True) returns: state_seq,observation_seq,hidden_seq,policy_seq,action_seq, [additional_output_0, additional_output_1] for environment state, observation, hidden state, agent policy and chosen actions respectively each of them having dimensions of [batch_i,seq_i,...] time synchronization policy: state_seq,observation_seq correspond to observation BASED ON WHICH agent generated hidden_seq,policy_seq,action_seq """ env = environment if initial_env_state == 'zeros': initial_env_state = T.zeros([batch_size,env.state_size]) if initial_observation == 'zeros': initial_observation = T.zeros([batch_size,env.observation_size]) if initial_hidden == 'zeros': memory_state_shape = lasagne.layers.get_output_shape(self.memory)[1:] initial_hidden = T.zeros((batch_size,)+tuple(memory_state_shape)) time_ticks = T.arange(session_length) #recurrent step function #during SCAN, time synchronization is reverse: state_1 came after action_1 based on observation_0 from state_0 def step(time_tick,env_state,observation,last_hidden,last_policy,last_action, *args): hidden,policy,action,additional_outputs = self.get_agent_reaction(last_hidden,observation, additional_output_layers,**flags) new_env_state,new_observation = env.get_action_results(env_state,action,time_tick) return [new_env_state,new_observation,hidden,policy,action]+additional_outputs #main recurrent loop configuration additional_init = [None for i in additional_output_layers] outputs_info = [initial_env_state,initial_observation,initial_hidden,None,None] + additional_init history = unroll_scan(step, sequences = [time_ticks], outputs_info = outputs_info, non_sequences = [], n_steps = session_length ) self.history = history #from [time,batch,...] to [batch,time,...] history = [ (var.swapaxes(1,0) if var.ndim >1 else var) for var in history] #what's inside: state_seq,observation_seq,hidden_seq,policy_seq,action_seq = history[:5] additional_output_sequences = tuple(history[5:]) #allign time axes: actions come AFTER states with the same index #add first env turn, crop to session length state_seq = T.concatenate([insert_dim(initial_env_state,1), state_seq[:,:-1]],axis=1) observation_seq = T.concatenate([insert_dim(initial_observation,1), observation_seq[:,:-1]],axis=1) return (state_seq,observation_seq,hidden_seq,policy_seq,action_seq) + additional_output_sequences
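# Illustrative numpy sketch of the time re-alignment done at the end of get_sessions:
# the recurrence produces "state after the action", so the initial state is prepended
# and the last step is cropped, making state_seq[:, t] the state the agent saw before
# acting at tick t. Shapes are made up for the example.
import numpy as np

batch, time, state_size = 2, 4, 3
state_seq = np.arange(batch * time * state_size, dtype=float).reshape(batch, time, state_size)
initial_state = np.zeros((batch, state_size))

aligned = np.concatenate([initial_state[:, None, :], state_seq[:, :-1]], axis=1)
assert aligned.shape == (batch, time, state_size)
assert np.all(aligned[:, 0] == 0)                    # first entry is the initial state
assert np.all(aligned[:, 1:] == state_seq[:, :-1])   # remaining entries shifted by one tick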
def get_output_for(self, inputs, mask=None, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When the cell state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When both the cell state and the hidden state are being pre-filled `inputs[-2]` is the hidden state, while `inputs[-1]` is the cell state. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n*self.num_units:(n+1)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous*self.W_cell_to_ingate forgetgate += cell_previous*self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(cell) return [cell, hid] def step_masked(input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately 
after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
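# Minimal usage sketch (assuming Theano and Lasagne are installed): wiring a layer like
# the LSTM above into a small graph with a mask input and compiling it. Shapes and
# variable names are arbitrary; the get_output_for variants in this file are all driven
# the same way through lasagne.layers.get_output.
import numpy as np
import theano
import theano.tensor as T
import lasagne

N_BATCH, MAX_LEN, N_FEAT, N_UNITS = 8, 12, 5, 16
l_in = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LEN, N_FEAT))
l_mask = lasagne.layers.InputLayer(shape=(N_BATCH, MAX_LEN))
l_lstm = lasagne.layers.LSTMLayer(l_in, num_units=N_UNITS, mask_input=l_mask,
                                  only_return_final=True)

x_sym = T.tensor3('x')
mask_sym = T.matrix('mask')
out = lasagne.layers.get_output(l_lstm, {l_in: x_sym, l_mask: mask_sym})
f = theano.function([x_sym, mask_sym], out)

x = np.random.rand(N_BATCH, MAX_LEN, N_FEAT).astype(theano.config.floatX)
mask = np.ones((N_BATCH, MAX_LEN), dtype=theano.config.floatX)
print(f(x, mask).shape)   # (N_BATCH, N_UNITS) because only_return_final=True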
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable. Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = inputs[1] if len(inputs) > 1 else None # Input should be provided as (n_batch, n_time_steps, n_features) # but scan requires the iterable dimension to be first # So, we need to dimshuffle to (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, *range(2, input.ndim)) seq_len, num_batch = input.shape[0], input.shape[1] if self.precompute_input: # Because the input is given for all time steps, we can precompute # the inputs to hidden before scanning. First we need to reshape # from (seq_len, batch_size, trailing dimensions...) to # (seq_len*batch_size, trailing dimensions...) # This strange use of a generator in a tuple was because # input.shape[2:] was raising a Theano error trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim)) input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims) input = helper.get_output(self.input_to_hidden, input, **kwargs) # Reshape back to (seq_len, batch_size, trailing dimensions...) trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim)) input = T.reshape(input, (seq_len, num_batch) + trailing_dims) # We will always pass the hidden-to-hidden layer params to step non_seqs = helper.get_all_params(self.hidden_to_hidden) non_seqs += helper.get_all_params(self.output_to_hidden) # When we are not precomputing the input, we also need to pass the # input-to-hidden parameters to step if not self.precompute_input: non_seqs += helper.get_all_params(self.input_to_hidden) # Create single recurrent computation step function def step(input_n, hid_previous, *args): # Compute the hidden-to-hidden activation hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous, **kwargs) # out_layers = helper.get_all_layers(self.output_to_hidden) # out_layers[1].incoming_layer = self.hidden_to_hidden hid_pre += helper.get_output(self.output_to_hidden, hid_previous, **kwargs) # If the dot product is precomputed then add it, otherwise # calculate the input_to_hidden values and add them if self.precompute_input: hid_pre += input_n else: hid_pre += helper.get_output(self.input_to_hidden, input_n, **kwargs) # Clip gradients if self.grad_clipping: hid_pre = theano.gradient.grad_clip(hid_pre, -self.grad_clipping, self.grad_clipping) return self.nonlinearity(hid_pre) def step_masked(input_n, mask_n, hid_previous, *args): # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. 
hid = step(input_n, hid_previous, *args) hid_out = hid * mask_n + hid_previous * (1 - mask_n) return [hid_out] if mask is not None: mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step # When hid_init is provided as a TensorVariable, use it as-is if isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # The code below simply repeats self.hid_init num_batch times in # its first dimension. Turns out using a dot product and a # dimshuffle is faster than T.repeat. dot_dims = (list(range(1, self.hid_init.ndim - 1)) + [0, self.hid_init.ndim - 1]) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init.dimshuffle(dot_dims)) if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: # Scan op iterates over first dimension of input and repeatedly # applies the step function hid_out = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim)) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
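# Illustrative numpy sketch of the recurrence implemented above in its simplest form,
# with plain weight matrices standing in for the input_to_hidden / hidden_to_hidden
# sub-networks and the extra output_to_hidden term omitted. Sizes are hypothetical.
import numpy as np

rng = np.random.RandomState(0)
seq_len, num_batch, num_inputs, num_units = 6, 2, 4, 3
x = rng.randn(seq_len, num_batch, num_inputs)   # already (time, batch, features)
W_in = rng.randn(num_inputs, num_units)
W_hid = rng.randn(num_units, num_units)
hid = np.zeros((num_batch, num_units))

hid_seq = []
for t in range(seq_len):
    hid = np.tanh(x[t] @ W_in + hid @ W_hid)    # step(): input + hidden-to-hidden
    hid_seq.append(hid)
hid_out = np.stack(hid_seq).transpose(1, 0, 2)  # back to (batch, time, units)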
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = inputs[1] if len(inputs) > 1 else None # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell #self.W_in_to_outgate ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell #self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [ self.b_ingate, self.b_forgetgate, self.b_cell #self.b_outgate ], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step( input_n, cell_previous, hid_previous, W_hid_stacked, W_cell_to_ingate, W_cell_to_forgetgate, #W_cell_to_outgate, W_in_stacked, b_stacked): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping is not False: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) #outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * W_cell_to_ingate forgetgate += cell_previous * W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) #outgate = self.nonlinearity_outgate(outgate) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input #if self.peepholes: # outgate += cell*W_cell_to_outgate # Compute new hidden unit activation #hid = outgate*self.nonlinearity(cell) hid = self.nonlinearity(cell) return [cell, hid] def step_masked( input_n, mask_n, cell_previous, hid_previous, W_hid_stacked, W_cell_to_ingate, W_cell_to_forgetgate, #W_cell_to_outgate, W_in_stacked, b_stacked): cell, hid = step( input_n, cell_previous, hid_previous, W_hid_stacked, W_cell_to_ingate, W_cell_to_forgetgate, #W_cell_to_outgate, W_in_stacked, b_stacked) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. not_mask = 1 - mask_n cell = cell * mask_n + cell_previous * not_mask hid = hid * mask_n + hid_previous * not_mask return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if isinstance(self.cell_init, T.TensorVariable): cell_init = self.cell_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate #self.W_cell_to_outgate ] # theano.scan only allows for positional arguments, so when # self.peepholes is False, we need to supply fake placeholder arguments # for the three peephole matrices. 
else: non_seqs += [(), () # () ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] # As above, when we aren't providing these parameters, we need to # supply placehold arguments else: non_seqs += [(), ()] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) cell_out = cell_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] cell_out = cell_out[:, ::-1] return hid_out
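# Illustrative numpy sketch of the reduced LSTM step above, which drops the output gate:
# only the input gate, forget gate and candidate are computed, and the hidden state is
# simply the squashed cell. Sizes are hypothetical.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

rng = np.random.RandomState(0)
num_batch, num_units = 2, 3
cell_prev = rng.randn(num_batch, num_units)
ingate_pre, forgetgate_pre, cell_input_pre = rng.randn(3, num_batch, num_units)

i = sigmoid(ingate_pre)
f = sigmoid(forgetgate_pre)
g = np.tanh(cell_input_pre)
cell = f * cell_prev + i * g
hid = np.tanh(cell)           # no output gate: hid = nonlinearity(cell)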
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] encoder_output = inputs[1] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) encoder_output = T.flatten(encoder_output, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) encoder_output = encoder_output.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update ], axis=1) # Stack gate biases into a (3*num_units) vector b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0) if self.precompute_input: # precompute_input inputs*W. W_in is (n_features, 3*num_units). # input is then (n_batch, n_time_steps, 3*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 3*num_units). 
# We define a slicing function that extract the input to each GRU gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input__n is the n'th vector of the input def step(input_n, hid_previous, encoder_output, W_hid_stacked, W_in_stacked, b_stacked, W_att_enc, W_att_dec, W_att_out, W_out, b_out): # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1} hid_input = T.dot(hid_previous, W_hid_stacked) if self.grad_clipping is not False: input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping) hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping) if not self.precompute_input: # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c input_n = T.dot(input_n, W_in_stacked) + b_stacked # Reset and update gates resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) resetgate = self.nonlinearity_resetgate(resetgate) updategate = self.nonlinearity_updategate(updategate) # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) hidden_update_in = slice_w(input_n, 2) hidden_update_hid = slice_w(hid_input, 2) hidden_update = hidden_update_in + resetgate * hidden_update_hid if self.grad_clipping is not False: hidden_update = theano.gradient.grad_clip( hidden_update, -self.grad_clipping, self.grad_clipping) hidden_update = self.nonlinearity_hid(hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t hid = (1 - updategate) * hid_previous + updategate * hidden_update # # Add the attention hid += self.attention(encoder_output, hid_previous, W_att_enc, W_att_dec, W_att_out) # Compute the probas probs = T.nnet.softmax(T.dot(hid, W_out) + b_out) return [hid, probs] sequences = [input] step_fun = step if isinstance(self.hid_init, T.TensorVariable): hid_init = self.hid_init else: # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [encoder_output, W_hid_stacked] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [ W_in_stacked, b_stacked, self.W_att_enc, self.W_att_dec, self.W_att_out, self.W_out, self.b_out, ] # theano.scan only allows for positional arguments, so when # self.precompute_input is True, we need to supply fake placeholder # arguments for the input weights and biases. else: non_seqs += [(), (), self.W_att_enc, self.W_att_dec, self.W_att_out, self.W_out, self.b_out] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan out, _ = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function out, _ = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init, None], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True) # dimshuffle back to (n_batch, n_time_steps, n_features)) # hid_out = hid_out[0].dimshuffle(1, 0, 2) s_out = out[1] # # if scan is backward reverse the output # if self.backwards: # out = out[:, ::-1, :] return s_out
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input_p = inputs[0] input_q = inputs[1] z_init = inputs[2] mu_p_init = inputs[3] # Retrieve the mask when it is supplied mask = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input_p = input_p.dimshuffle(1, 0, 2) input_q = input_q.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input_p.shape # Create single recurrent computation step function # input__n is the n'th vector of the input def log_sum_exp(a, b): return T.log(T.exp(a) + T.exp(b)) def step(noise_n, input_p_n, input_q_n, z_previous, mu_p_previous, logvar_p_previous, mu_q_previous, logvar_q_previous, *args): input_p = T.concatenate([input_p_n, z_previous], axis=1) mu_p = get_output(self.mu_p_mlp, input_p) logvar_p = get_output(self.logvar_p_mlp, input_p) logvar_p = T.log(T.exp(logvar_p) + self.cons) q_input_n = T.concatenate([input_q_n, z_previous], axis=1) mu_q = get_output(self.q_mu_mlp, q_input_n) if self.use_mu_residual_q: print "Using residuals for mean_q" mu_q += mu_p logvar_q = get_output(self.q_logvar_mlp, q_input_n) # Numerical stability logvar_q = T.log(T.exp(logvar_q) + self.cons) z_n = mu_q + T.exp(0.5 * logvar_q) * noise_n return z_n, mu_p, logvar_p, mu_q, logvar_q def step_masked(noise_n, input_p_n, input_q_n, mask_n, z_previous, mu_p_previous, logvar_p_previous, mu_q_previous, logvar_q_previous, *args): # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. z_n, mu_p, logvar_p, mu_q, logvar_q = step( noise_n, input_p_n, input_q_n, z_previous, mu_p_previous, logvar_p_previous, mu_q_previous, logvar_q_previous, *args) z_n = T.switch(mask_n, z_n, z_previous) mu_p = T.switch(mask_n, mu_p, mu_p_previous) logvar_p = T.switch(mask_n, logvar_p, logvar_p_previous) mu_q = T.switch(mask_n, mu_q, mu_q_previous) logvar_q = T.switch(mask_n, logvar_q, logvar_q_previous) return z_n, mu_p, logvar_p, mu_q, logvar_q eps = self._srng.normal(size=(seq_len, num_batch, self.num_units), avg=0.0, std=1.0) logvar_init = T.zeros((num_batch, self.num_units)) if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [eps, input_p, input_q, mask] step_fun = step_masked else: sequences = [eps, input_p, input_q] step_fun = step # The hidden-to-hidden weight matrix is always used in step non_seqs = helper.get_all_params(self.logvar_p_mlp) non_seqs += helper.get_all_params(self.mu_p_mlp) non_seqs += helper.get_all_params(self.q_mu_mlp) non_seqs += helper.get_all_params(self.q_logvar_mlp) if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan scan_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[ z_init, mu_p_init, logvar_init, mu_p_init, logvar_init ], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function scan_out = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[ z_init, mu_p_init, logvar_init, mu_p_init, logvar_init ], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] z, mu_p, logvar_p, mu_q, logvar_q = scan_out # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: assert False else: # dimshuffle back to (n_batch, n_time_steps, n_features)) z = z.dimshuffle(1, 0, 2) mu_p = mu_p.dimshuffle(1, 0, 2) logvar_p = logvar_p.dimshuffle(1, 0, 2) mu_q = mu_q.dimshuffle(1, 0, 2) logvar_q = logvar_q.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: z = z[:, ::-1] mu_p = mu_p[:, ::-1] logvar_p = logvar_p[:, ::-1] mu_q = mu_q[:, ::-1] logvar_q = logvar_q[:, ::-1] return z, mu_p, logvar_p, mu_q, logvar_q
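# Illustrative numpy sketch of the reparameterisation used in step() above: a sample
# from N(mu, exp(logvar)) is written as mu + exp(0.5*logvar) * eps with eps ~ N(0, 1),
# so gradients can flow through mu and logvar. The numbers below are arbitrary.
import numpy as np

rng = np.random.RandomState(0)
mu, logvar = 1.5, np.log(0.25)             # target mean 1.5, variance 0.25
eps = rng.randn(100000)
z = mu + np.exp(0.5 * logvar) * eps

print(z.mean())    # close to 1.5
print(z.var())     # close to 0.25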
def predict_symbolic(self, mx, Sx, unroll_scan=False): idims = self.D odims = self.E Ms = self.sr.shape[1] sf2M = (self.hyp[:, idims]**2)/tt.cast(Ms, floatX) sn2 = self.hyp[:, idims+1]**2 # TODO this should just fallback to the method from the SSGP class if Sx is None: # first check if we received a vector [D] or a matrix [nxD] if mx.ndim == 1: mx = mx[None, :] srdotx = self.sr.dot(self.X.T).transpose(0,2,1) phi_x = tt.concatenate([tt.sin(srdotx), tt.cos(srdotx)], 2) M = (phi_x*self.beta_ss[:, None, :]).sum(-1) phi_x_L = tt.stack([ solve_lower_triangular(self.Lmm[i], phi_x[i].T) for i in range(odims)]) S = sn2[:, None]*(1 + (sf2M[:, None])*(phi_x_L**2).sum(-2)) + 1e-6 return M, S # precompute some variables srdotx = self.sr.dot(mx) srdotSx = self.sr.dot(Sx) srdotSxdotsr = tt.sum(srdotSx*self.sr, 2) e = tt.exp(-0.5*srdotSxdotsr) cos_srdotx = tt.cos(srdotx) sin_srdotx = tt.sin(srdotx) cos_srdotx_e = cos_srdotx*e sin_srdotx_e = sin_srdotx*e # compute the mean vector mphi = tt.horizontal_stack(sin_srdotx_e, cos_srdotx_e) # E x 2*Ms M = tt.sum(mphi*self.beta_ss, 1) # input output covariance mx_c = mx.dimshuffle(0, 'x') sin_srdotx_e_r = sin_srdotx_e.dimshuffle(0, 'x', 1) cos_srdotx_e_r = cos_srdotx_e.dimshuffle(0, 'x', 1) srdotSx_tr = srdotSx.transpose(0, 2, 1) c = tt.concatenate([mx_c*sin_srdotx_e_r + srdotSx_tr*cos_srdotx_e_r, mx_c*cos_srdotx_e_r - srdotSx_tr*sin_srdotx_e_r], axis=2) # E x D x 2*Ms beta_ss_r = self.beta_ss.dimshuffle(0, 'x', 1) # input output covariance (notice this is not premultiplied by the # input covariance inverse) V = tt.sum(c*beta_ss_r, 2).T - tt.outer(mx, M) srdotSxdotsr_c = srdotSxdotsr.dimshuffle(0, 1, 'x') srdotSxdotsr_r = srdotSxdotsr.dimshuffle(0, 'x', 1) M2 = tt.zeros((odims, odims)) # initialize indices triu_indices = np.triu_indices(odims) indices = [tt.as_index_variable(idx) for idx in triu_indices] def second_moments(i, j, M2, beta, iA, sn2, sf2M, sr, srdotSx, srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx, *args): # compute the second moments of the spectrum feature vectors siSxsj = srdotSx[i].dot(sr[j].T) # Ms x Ms sijSxsij = -0.5*(srdotSxdotsr_c[i] + srdotSxdotsr_r[j]) em = tt.exp(sijSxsij+siSxsj) # MsxMs ep = tt.exp(sijSxsij-siSxsj) # MsxMs si = sin_srdotx[i] # Msx1 ci = cos_srdotx[i] # Msx1 sj = sin_srdotx[j] # Msx1 cj = cos_srdotx[j] # Msx1 sicj = tt.outer(si, cj) # MsxMs cisj = tt.outer(ci, sj) # MsxMs sisj = tt.outer(si, sj) # MsxMs cicj = tt.outer(ci, cj) # MsxMs sm = (sicj-cisj)*em sp = (sicj+cisj)*ep cm = (sisj+cicj)*em cp = (cicj-sisj)*ep # Populate the second moment matrix of the feature vector Q_up = tt.concatenate([cm-cp, sm+sp], axis=1) Q_lo = tt.concatenate([sp-sm, cm+cp], axis=1) Q = tt.concatenate([Q_up, Q_lo], axis=0) # Compute the second moment of the output m2 = 0.5*matrix_dot(beta[i], Q, beta[j].T) m2 = theano.ifelse.ifelse( tt.eq(i, j), m2 + sn2[i]*(1.0 + sf2M[i]*tt.sum(self.iA[i]*Q)) + 1e-6, m2) M2 = tt.set_subtensor(M2[i, j], m2) return M2 nseq = [self.beta_ss, self.iA, sn2, sf2M, self.sr, srdotSx, srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx, self.Lmm] if unroll_scan: from lasagne.utils import unroll_scan [M2_] = unroll_scan(second_moments, indices, [M2], nseq, len(triu_indices[0])) updts = {} else: M2_, updts = theano.scan(fn=second_moments, sequences=indices, outputs_info=[M2], non_sequences=nseq, allow_gc=False, name="%s>M2_scan" % (self.name)) M2 = M2_[-1] M2 = M2 + tt.triu(M2, k=1).T S = M2 - tt.outer(M, M) return M, S, V
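# Illustrative numpy sketch of the feature map behind predict_symbolic: with spectral
# points w ~ N(0, diag(1/l^2)), the features [sin(w.x), cos(w.x)] scaled by sf2/Ms give
# a Monte Carlo approximation of the squared-exponential kernel. All quantities below
# are made up for the check.
import numpy as np

rng = np.random.RandomState(0)
D, Ms = 3, 2000
lengthscales = np.array([1.0, 0.5, 2.0])
sf2 = 1.3
w = rng.randn(Ms, D) / lengthscales        # spectral points for an RBF kernel

x, y = rng.randn(D), rng.randn(D)
phi = lambda v: np.concatenate([np.sin(w @ v), np.cos(w @ v)])
k_approx = (sf2 / Ms) * phi(x) @ phi(y)
k_exact = sf2 * np.exp(-0.5 * np.sum(((x - y) / lengthscales) ** 2))
print(k_approx, k_exact)                   # the two values should be close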
def get_output_for(self, inputs, recurrence_flags={}, **kwargs): """ returns history of agent interaction with environment for given number of turns. parameters: inputs - [state init] + [input_nonsequences] + [input_sequences] Each part is a list of theano expressions for layers in the order they were provided when creating this layer. recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports) e.g. {deterministic=True} returns: [state_sequences] + [output sequences] - a list of all states and all outputs sequences Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...] """ # set batch size if len(inputs) != 0: batch_size = inputs[0].shape[0] else: batch_size = self.batch_size n_states = len(self.state_variables) n_state_inits = len(self.state_init) n_input_nonseq = len(self.input_nonsequences) n_input_seq = len(self.input_sequences) n_outputs = len(self.tracked_outputs) initial_states, nonsequences, sequences = unpack_list(inputs, [n_state_inits, n_input_nonseq, n_input_seq]) # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan sequences = [seq.swapaxes(1, 0) for seq in sequences] # create outputs_info for scan initial_states = OrderedDict(list(zip(self.state_init, initial_states))) def get_initial_state(state_out_layer): """Pick dedicated initial state or create zeros of appropriate shape and dtype""" # if we have a dedicated init, use it if state_out_layer in initial_states: initial_state = initial_states[state_out_layer] # otherwise initialize with zeros else: initial_state = T.zeros((batch_size,) + tuple(state_out_layer.output_shape[1:]), dtype=get_layer_dtype(state_out_layer)) return initial_state initial_state_variables = list(map(get_initial_state, self.state_variables)) outputs_info = initial_state_variables + [None] * len(self.tracked_outputs) # recurrent step function def step(*args): sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) # make dicts of prev_states and inputs prev_states_dict = OrderedDict(zip(list(self.state_variables.keys()), prev_states)) input_layers = list(chain(self.input_nonsequences.keys(), self.input_sequences.keys())) assert len(input_layers) == len(nonsequences + sequence_slices) inputs_dict = OrderedDict(zip(input_layers, nonsequences + sequence_slices)) # call one step recurrence new_states, new_outputs = self.get_one_step(prev_states_dict, inputs_dict, **recurrence_flags) return new_states + new_outputs if self.unroll_scan: # call scan itself history = unroll_scan(step, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps ) self.updates=OrderedDict() else: history,updates = theano.scan(step, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps ) self.updates = updates if len(updates) !=0: warn("Warning: recurrent loop without unroll_scan got nonempty random state updates list. That happened" " because there is some source of randomness (e.g. dropout) inside recurrent step graph." " To compile such graph, one must either call .get_automatic_updates() right after .get_output" " and pass these updates to a function, or use no_defalt_updates=True when compiling theano.function.") # reordering from [time,batch,...] to [batch,time,...] 
history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in history] state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs]) # handle delayed_states # selectively shift state sequences by 1 tick into the past, padding with their initialisations for i in range(len(state_seqs)): if list(self.state_variables.keys())[i] in self.delayed_states: state_seq = state_seqs[i] state_init = initial_state_variables[i] state_seq = T.concatenate([insert_dim(state_init, 1), state_seq[:, :-1]], axis=1) state_seqs[i] = state_seq return OrderedDict(zip(self.keys(),state_seqs + output_seqs))
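# Minimal NumPy illustration (assumed toy shapes) of the "delayed state" shift
# applied above: the state sequence is moved one tick into the past along the
# time axis and the initial state is prepended, mirroring
# T.concatenate([insert_dim(state_init, 1), state_seq[:, :-1]], axis=1).
import numpy as np

batch, time, units = 2, 4, 3
state_seq = np.arange(batch * time * units, dtype=float).reshape(batch, time, units)
state_init = -np.ones((batch, units))

delayed = np.concatenate([state_init[:, None, :], state_seq[:, :-1]], axis=1)
assert delayed.shape == (batch, time, units)
assert np.allclose(delayed[:, 0], state_init)
assert np.allclose(delayed[:, 1:], state_seq[:, :-1])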
def get_loss(self, unroll_scan=False, cache_intermediate=True): utils.print_with_stamp('Building Sparse Spectrum loss', self.name) idims = self.D if self.sr is None: self.sr = self.w/(self.hyp[:, :idims]) self.sr = self.sr.transpose(1, 0, 2) # init variables N = self.X.shape[0].astype(floatX) M = self.sr.shape[1].astype(floatX) Mi = 2*self.sr.shape[1] EyeM = tt.eye(Mi) sf2 = self.hyp[:, idims]**2 sf2M = (sf2/M).dimshuffle(0, 'x', 'x') sn2 = (self.hyp[:, idims+1]**2).dimshuffle(0, 'x', 'x') srdotX = self.sr.dot(self.X.T) phi_f = tt.concatenate([tt.sin(srdotX), tt.cos(srdotX)], axis=1) Phi_f = tt.batched_dot(phi_f, phi_f.transpose(0, 2, 1)) A = sf2M*Phi_f A += (sn2 + 1e-6)*EyeM phi_f_dotY = tt.batched_dot(phi_f, self.Y.T) def nlml(A, phidotY, EyeM): Lmm = Cholesky()(A) rhs = tt.concatenate([EyeM, phidotY[:, None]], axis=1) sol = solve_upper_triangular( Lmm.T, solve_lower_triangular(Lmm, rhs)) iA = sol[:, :-1] beta_ss = sol[:, -1] return iA, Lmm, beta_ss seq = [A, phi_f_dotY] nseq = [EyeM] if unroll_scan: from lasagne.utils import unroll_scan [iA, Lmm, beta_ss] = unroll_scan(nlml, seq, [], nseq, self.E) updts = {} else: (iA, Lmm, beta_ss), updts = theano.scan( fn=nlml, sequences=seq, non_sequences=nseq, allow_gc=False, return_list=True, name='%s>logL_ss' % (self.name)) # scale beta_ss beta_ss *= sf2M[:, :, 0] # And finally, the negative log marginal likelihood YdotY = tt.sum(self.Y**2, 0) Ydotphidotbeta = tt.sum(phi_f_dotY*beta_ss, -1) loss_ss = 0.5*(YdotY - Ydotphidotbeta)/sn2 idx = [theano.tensor.arange(Lmm.shape[i]) for i in [1, 2]] loss_ss += tt.sum(tt.log(Lmm[:, idx[0], idx[1]]), 1) loss_ss += (0.5*N - M)*tt.log(sn2) loss_ss += 0.5*N*np.log(2*np.pi, dtype=floatX) if cache_intermediate: # we are going to save the intermediate results in the following # shared variables, so we can use them during prediction without # having to recompute them kk = 2*self.n_inducing N, E = self.N, self.E if type(self.iA) is not tt.sharedvar.SharedVariable: self.iA = S(np.tile(np.eye(kk, dtype=floatX), (E, 1, 1)), name="%s>iA" % (self.name)) if type(self.Lmm) is not tt.sharedvar.SharedVariable: self.Lmm = S(np.tile(np.eye(kk, dtype=floatX), (E, 1, 1)), name="%s>Lmm" % (self.name)) if type(self.beta_ss) is not tt.sharedvar.SharedVariable: self.beta_ss = S(np.ones((E, kk), dtype=floatX), name="%s>beta_ss" % (self.name)) updts = [(self.iA, iA), (self.Lmm, Lmm), (self.beta_ss, beta_ss)] else: self.iA, self.Lmm, self.beta_ss = iA, Lmm, beta_ss updts = None # we add some penalty to avoid having parameters that are too large if self.snr_penalty is not None: penalty_params = {'log_snr': np.log(1000, dtype=floatX), 'log_ls': np.log(100, dtype=floatX), 'log_std': tt.log(self.X.std(0)*(N/(N-1.0))), 'p': 30} loss_ss += self.snr_penalty(tt.log(self.hyp), **penalty_params) # add a penalty for high frequencies freq_penalty = tt.square(self.w).sum(-1).mean(0) loss_ss = loss_ss + freq_penalty inps = [] self.state_changed = True # for saving return loss_ss.sum(), inps, updts
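# NumPy sketch (toy sizes, no claim about the real hyperparameters) of the trick
# used inside nlml above: solving against the stacked right-hand side
# [I, phi.dot(y)] with a single Cholesky factorisation yields both A^{-1} and
# beta = A^{-1} phi y in one pass.
import numpy as np

rng = np.random.RandomState(0)
M = 4
B = rng.randn(M, M)
A = B.dot(B.T) + M * np.eye(M)          # symmetric positive definite stand-in
phidotY = rng.randn(M)

L = np.linalg.cholesky(A)
rhs = np.concatenate([np.eye(M), phidotY[:, None]], axis=1)
sol = np.linalg.solve(L.T, np.linalg.solve(L, rhs))
iA, beta = sol[:, :-1], sol[:, -1]

assert np.allclose(iA, np.linalg.inv(A))
assert np.allclose(beta, np.linalg.solve(A, phidotY))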
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # TLSTM: Define new input time_mat = inputs[self.time_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) if self.bn: input = self.bn.get_output_for(input) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) #(n_time_steps, n_batch) time_input = time_mat.dimshuffle(1, 0, 'x') time_seq_len, time_num_batch, _ = time_input.shape seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate, self.W_x2_to_tg2, self.W_x1_to_tg1 ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate([ self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate, self.b2_tg2, self.b1_tg1 ], axis=0) # W_t1_to_tg1_constraint < 0 W_t1_to_tg1_constraint = T.switch( T.ge(self.W_t1_to_tg1, self.boundary), self.W_t1_to_tg1, self.boundary) # Stack delta time weight matrices into a (num_inputs, 2* num_units) W_t_stacked = T.concatenate( [self.W_to_to_outgate, self.W_t2_to_tg2, W_t1_to_tg1_constraint], axis=1) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). time_input = T.dot(time_input, W_t_stacked) input = T.dot(input, W_in_stacked) + b_stacked # When theano.scan calls step, input_n will be (n_batch, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, start, stride=1): return x[:, start * self.num_units:(start + stride) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input # todo # insert Tm_n, weight_t_o_n in to mask_n and xell_previous def step(input_n, time_input_n, cell_previous, hid_previous, *args): if not self.precompute_input: time_input_n = T.dot(time_input_n, W_t_stacked) input_n = T.dot(input_n, W_in_stacked) + b_stacked tm_wto_n = slice_w(time_input_n, 0) tm_w2_n = slice_w(time_input_n, 1) tm_w1_n = slice_w(time_input_n, 2) tm_w2_n = self.nonlinearity_inside_tg2(tm_w2_n) tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n) tm2_xwb_n = slice_w(input_n, 4) tm1_xwb_n = slice_w(input_n, 5) timegate2 = self.nonlinearity_outside_tg2(tm_w2_n + tm2_xwb_n) timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n) input_n = slice_w(input_n, 0, 4) # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) outgate += tm_wto_n if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * timegate2 * cell_input tilde_cell = forgetgate * cell_previous + ingate * timegate1 * cell_input if self.peepholes: outgate += tilde_cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(tilde_cell) return [cell, hid] def step_masked(input_n, time_input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, time_input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, time_input, mask] step_fun = step_masked else: sequences = [input, time_input] step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked, W_t_stacked] else: pass if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
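# Small NumPy check (hypothetical sizes) of the stacked-gate slicing used by
# slice_w above: after the per-gate weight matrices are concatenated along
# axis 1, slice k of the precomputed activations equals the product with the
# k-th original weight matrix.
import numpy as np

rng = np.random.RandomState(1)
num_inputs, num_units, n_gates = 5, 3, 4
Ws = [rng.randn(num_inputs, num_units) for _ in range(n_gates)]
W_stacked = np.concatenate(Ws, axis=1)          # (num_inputs, 4*num_units)
x = rng.randn(2, num_inputs)                    # a batch of 2 inputs
stacked = x.dot(W_stacked)                      # (2, 4*num_units)

def slice_w(a, start, stride=1):
    return a[:, start * num_units:(start + stride) * num_units]

for k in range(n_gates):
    assert np.allclose(slice_w(stacked, k), x.dot(Ws[k]))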
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When the cell state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When both the cell state and the hidden state are being pre-filled `inputs[-2]` is the hidden state, while `inputs[-1]` is the cell state. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] gate = inputs[1] cell_init = None if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] mask = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input[:, :, :, 0].dimshuffle(2, 0, 1) gate = gate[:, :, :, 0].dimshuffle(2, 0, 1) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation # We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, gate_n, hid_previous, *args): hid = input_n # temp=rectify( gate_pos_n*rectify(hid_previous) ) # temp+=neg_rectify_neg( gate_neg_n*neg_rectify_neg(hid_previous) ) # temp = T.nnet.hard_sigmoid(gate_n)*hid_previous temp = gate_n * hid_previous hid += temp return hid, temp def step_masked(input_n, gate_n, mask_n, hid_previous, *args): hid, temp = step(input_n, gate_n, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. hid = T.switch(mask_n, hid, hid_previous) return hid, temp if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, gate, mask] step_fun = step_masked else: sequences = [input, gate] step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) outputs_info = [cell_init, None] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan outs = unroll_scan(fn=step_fun, sequences=sequences, non_sequences=[], outputs_info=outputs_info, go_backwards=self.backwards, n_steps=self.seq_len) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function outs = theano.scan( fn=step_fun, sequences=sequences, outputs_info=outputs_info, go_backwards=self.backwards, # truncate_gradient=self.gradient_steps, strict=True)[0] if self.only_return_final: return outs[-1] # dimshuffle back to (n_batch, n_time_steps, n_features)) cell_out = outs[0].dimshuffle(1, 0, 2) temp_out = outs[1].dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: cell_out = cell_out[:, ::-1] return cell_out, temp_out
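# NumPy sketch of the masking rule used in step_masked above (shapes are
# illustrative): where the mask is 0 the previous hidden state is carried
# forward unchanged, where it is 1 the freshly computed state is used.
import numpy as np

hid_previous = np.array([[1.0, 1.0], [2.0, 2.0]])
hid_new      = np.array([[5.0, 5.0], [6.0, 6.0]])
mask_n       = np.array([[1.0], [0.0]])          # (batch, 1), broadcastable

hid = np.where(mask_n, hid_new, hid_previous)    # same effect as T.switch(mask_n, ...)
assert np.allclose(hid, [[5.0, 5.0], [2.0, 2.0]])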
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None visual_input = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] if self.visual_input_index > 0: visual_input = inputs[self.visual_input_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate, self.W_in_to_ggate], axis=1 ) # Same for hidden weight matrices # pdb.set_trace() W_hid_stacked = T.concatenate( [self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate, self.W_hid_to_ggate], axis=1 ) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate, self.b_ggate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # When theano.scan calls step, input_n will be (n_batch, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n*self.num_units:(n+1)*self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step( input_n, cell_previous, hid_previous, visual, W_hid_stacked, W_in_stacked, b_stacked, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate, W_p ): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip( gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) ggate = slice_w(gates, 4) if self.peepholes: # Compute peephole connections ingate += cell_previous*W_cell_to_ingate forgetgate += cell_previous*W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # ggate gt ggate = self.nonlinearity_ggate(ggate) # Compute new cell value cell = forgetgate*cell_previous + ingate*cell_input if self.peepholes: outgate += cell*W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate*self.nonlinearity(cell) st = ggate*self.nonlinearity(cell) # zt = T.dot( # self.nonlinearity( # T.dot(visual, W_v_to_attenGate) + # T.dot( # T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'), # T.ones((1, self.video_len)) # ) # ), # W_h_to_attenGate # )[:, :, 0] # to avoid optimization failure of Tenseor 3D dot vector, we should transform # e = A.dot(B) to e = A*B.dimshuffle('x', 'x', 0), e=e.sum(axis=2) zt_dot_A = self.nonlinearity( T.dot(visual, W_v_to_attenGate) + T.dot( T.dot(hid, W_g_to_attenGate).dimshuffle(0, 1, 'x'), T.ones((1, self.video_len)) ) ) zt = zt_dot_A*W_h_to_attenGate.dimshuffle('x', 'x', 0) zt = zt.sum(axis=2) # vt = T.dot( # self.nonlinearity( # T.dot( # st, W_s_to_attenGate # ) + # T.dot( # hid, W_g_to_attenGate # ) # ), # W_h_to_attenGate # ) vt_dot_A = self.nonlinearity( T.dot( st, W_s_to_attenGate ) + T.dot( hid, W_g_to_attenGate ) ) vt = vt_dot_A*W_h_to_attenGate.dimshuffle('x', 0) vt = vt.sum(axis=1) vt = vt.dimshuffle(0, 'x') alpha_hat_t = self.nonlinearity_attenGate(T.concatenate( [zt, vt], axis=-1 )) feature = T.concatenate( [visual_input, st.dimshuffle(0, 'x', 1)], axis=1 ).dimshuffle(2, 0, 1) c_hat_t = T.sum(alpha_hat_t*feature, axis=-1) It = T.dot( (c_hat_t.T+hid), W_p ) return [cell, hid, It] def step_masked( input_n, mask_n, cell_previous, hid_previous, It_previous, visual, W_hid_stacked, W_in_stacked, b_stacked, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate, W_p ): cell, hid, It = step( input_n, cell_previous, hid_previous, visual, W_hid_stacked, W_in_stacked, b_stacked, W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate, W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate, W_p ) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. 
cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) It = T.switch(mask_n, It, It_previous) # theano.printing.Print('It')(It) return [cell, hid, It] if mask is not None: # mask is given as (batch_size, seq_len). Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) It_init = T.dot(ones, self.It_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [visual_input, W_hid_stacked] if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] else: non_seqs += [(), ()] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate] else: non_seqs += [(), (), ()] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function non_seqs += [self.W_h_to_attenGate, self.W_g_to_attenGate, self.W_v_to_attenGate, self.W_s_to_attenGate, self.W_p] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out, It = unroll_scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, It_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out, It = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init, It_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] It = It.dimshuffle(1, 0, 2) if self.backwards: It = It[:, ::-1] return It
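# NumPy verification (random toy data) of the rewrite used in step above, where
# a Tensor3-dot-vector is replaced by a broadcasted multiply followed by a sum
# to avoid the reported optimization failure: for a 3-D array A and vector w,
# (A * w[None, None, :]).sum(axis=2) equals A.dot(w).
import numpy as np

rng = np.random.RandomState(2)
A = rng.randn(3, 4, 5)
w = rng.randn(5)
assert np.allclose((A * w[None, None, :]).sum(axis=2), A.dot(w))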
def get_output_for(self, inputs, **kwargs):
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask when it is supplied
    mask = None
    hid_init = None
    cell_init = None
    # Pull out the extra inputs that were stored in `inputs` earlier
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]
    if self.cell_init_incoming_index > 0:
        cell_init = inputs[self.cell_init_incoming_index]
    time_mat = inputs[self.time_incoming_index]
    # addv
    duration_mat = inputs[self.duration_incoming_index]
    # If ndim > 3, collapse the trailing dimensions of the input
    '''
    For example, if we flatten a tensor of shape (2, 3, 4, 5) with
    flatten(x, outdim=2), we keep the same (2 - 1 = 1) leading dimensions (2,)
    and the remaining dimensions are collapsed, so the output in this example
    has shape (2, 60).
    '''
    if input.ndim > 3:
        input = T.flatten(input, 3)
    # Batch normalization
    if self.bn:
        input = self.bn.get_output_for(input)
    # Swap the first two dimensions
    input = input.dimshuffle(1, 0, 2)
    seq_len, num_batch, _ = input.shape
    # (n_time_steps, n_batch)
    # add
    time_input = time_mat.dimshuffle(1, 0, 'x')
    time_seq_len, time_num_batch, _ = time_input.shape
    # addv
    duration_input = duration_mat.dimshuffle(1, 0, 'x')
    duration_seq_len, duration_num_batch, _ = duration_input.shape
    # Stack into a (num_features, num_units*6) matrix,
    # adding an extra weight matrix for the input (x)
    W_in_stacked = T.concatenate(
        [
            self.W_in_to_ingate,
            self.W_in_to_cell,
            self.W_in_to_outgate,
            self.W_x1_to_tg1,  # add
            self.W_x2_to_dg2,  # addv
        ], axis=1)
    W_hid_stacked = T.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_cell, self.W_hid_to_outgate],
        axis=1)
    # Stack into a (6*num_units) vector
    b_stacked = T.concatenate(
        [
            self.b_ingate,
            self.b_cell,
            self.b_outgate,
            self.b1_tg1,  # add bias for the time gate
            self.b2_dg2,  # addv
        ], axis=0)
    # add2 used to constrain Wt1: use W_t1_to_tg1_constraint instead of W_t1_to_tg1
    # W_t1_to_tg1_constraint < 0
    # W_t1_to_tg1_constraint = self.W_t1_to_tg1
    # W_t1_to_tg1_constraint = T.switch(T.ge(self.W_t1_to_tg1, self.boundary), self.W_t1_to_tg1, self.boundary)
    # add t is only multiplied with two weight matrices (t)
    # Stack delta time weight matrices into a (1, 2*num_units)
    W_t_stacked = T.concatenate(
        [
            self.W_to_to_outgate,
            self.W_t1_to_tg1  # change
        ], axis=1)
    # addv
    W_d_stacked = T.concatenate([
        self.W_d2_to_dg2,
    ], axis=1)
    if self.precompute_input:
        # Because the input is given for all time steps, we can
        # precompute_input the inputs dot weight matrices before scanning.
        # W_in_stacked is (n_features, 4*num_units). input is then
        # (n_time_steps, n_batch, 4*num_units).
        # add precompute the inputs
        time_input = T.dot(time_input, W_t_stacked)
        input = T.dot(input, W_in_stacked) + b_stacked
        # addv
        duration_input = T.dot(duration_input, W_d_stacked)
    # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
    # We define a slicing function that extracts the input to each LSTM gate
    # change
    def slice_w(x, start, stride=1):
        return x[:, start * self.num_units:(start + stride) * self.num_units]

    # Create single recurrent computation step function
    # input_n is the n'th vector of the input
    def step(input_n, time_input_n, duration_input_n, cell_previous,
             hid_previous, hid1_pre, hid2_pre, n_pre, *args):
        # Not used when the input was already precomputed above
        # (original note: unclear what this is for; possibly the
        #  per-step computation happens here)
        if not self.precompute_input:
            # add
            # time_input_n is one element of the time sequence
            # previously time_input_n was (n_batch, 'x')
            # time_input_n(n_time_steps, n_batch, 'x')
            time_input_n = T.dot(time_input_n, W_t_stacked)
            # previously input_n was (n_batch, n_features)
            # input_n(n_time_steps, n_batch, num_units)
            input_n = T.dot(input_n, W_in_stacked) + b_stacked
            # addv
            duration_input_n = T.dot(duration_input_n, W_d_stacked)
        # The slicing function splits the input data into several parts,
        # each part corresponding to one block,
        # e.g. tm_wto_n is the block where t is multiplied by W_to
        # add
        tm_wto_n = slice_w(time_input_n, 0)
        tm_w1_n = slice_w(time_input_n, 1)
        tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n)
        # addv
        dm_w2_n = slice_w(duration_input_n, 0)
        dm_w2_n = self.nonlinearity_dg2(dm_w2_n)
        tm1_xwb_n = slice_w(input_n, 3)
        timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n)
        dm2_xwb_n = slice_w(input_n, 4)
        duration_gate2 = self.nonlinearity_outside_dg2(dm_w2_n + dm2_xwb_n)
        input_n = slice_w(input_n, 0, 3)
        # Calculate gates pre-activations and slice
        gates = input_n + T.dot(hid_previous, W_hid_stacked)
        # Clip gradients
        if self.grad_clipping:
            gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                              self.grad_clipping)
        # Extract the pre-activation gate values
        ingate = slice_w(gates, 0)
        cell_input = slice_w(gates, 1)
        outgate = slice_w(gates, 2)
        # add a tm term is added to the outgate before its nonlinearity
        outgate += tm_wto_n
        if self.peepholes:
            # Compute peephole connections
            ingate += cell_previous * self.W_cell_to_ingate
        # Apply nonlinearities
        ingate = self.nonlinearity_ingate(ingate)
        cell_input = self.nonlinearity_cell(cell_input)
        # Compute new cell value
        # add an extra multiplicative term is added to the cell update
        # cell = forgetgate * cell_previous + ingate * timegate1 * cell_input
        # add2
        # addv
        cell = ((1 - ingate * timegate1) * cell_previous +
                duration_gate2 * ingate * cell_input)
        # tilde_cell = (1 - ingate) * cell_previous + ingate * timegate1 * cell_input
        tilde_cell = cell + ingate * timegate1 * cell_input
        if self.peepholes:
            outgate += cell * self.W_cell_to_outgate
        outgate = self.nonlinearity_outgate(outgate)
        hid = outgate * self.nonlinearity(tilde_cell)
        # addv22_6
        hid1 = self.nonlinearity(T.dot(hid, self.W_x_wg) + self.b_wg)
        n = n_pre
        hid2 = hid1
        return [cell, hid, hid1, hid2, n]

    def step_masked(input_n,
                    time_input_n,  # add the time input
                    duration_input_n,  # addv
                    mask_n, cell_previous, hid_previous, hid1_pre, hid2_pre,
                    n_pre, *args):
        cell, hid, hid1, hid2, n = step(
            input_n,
            time_input_n,  # add the time input
            duration_input_n,  # addv
            cell_previous, hid_previous, hid1_pre, hid2_pre, n_pre, *args)
        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        cell = T.switch(mask_n, cell, cell_previous)
        hid = T.switch(mask_n, hid, hid_previous)
        hid1 = T.switch(mask_n, hid1, hid1_pre)
        hid2 = T.switch(mask_n, hid2, hid2_pre)
        return [cell, hid, hid1, hid2, n]

    if mask is not None:
        # mask is given as (batch_size, seq_len). Because scan iterates
        # over first dimension, we dimshuffle to (seq_len, batch_size) and
        # add a broadcastable dimension
        # (1, 0, 'x') -> AxB to BxAx(broadcastable dimension)
        mask = mask.dimshuffle(1, 0, 'x')
        # input(seq_len, batch_size, n_feature), mask(seq_len, batch_size, broadcastable)
        # add sequences is set up here, presumably for the scan below
        # addv
        sequences = [input, time_input, duration_input, mask]
        step_fun = step_masked
    else:
        # add
        # addv
        sequences = [input, time_input, duration_input]
        step_fun = step
    # (original author's note: not entirely sure about what follows)
    ones = T.ones((num_batch, 1))
    if not isinstance(self.cell_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        # ones(num_batch, 1), self.cell(1, num_units)
        cell_init = T.dot(ones, self.cell_init)
    if not isinstance(self.hid_init, Layer):
        # Dot against a 1s vector to repeat to shape (num_batch, num_units)
        # ones(num_batch, 1), self.hid(1, num_units)
        hid_init = T.dot(ones, self.hid_init)
    # addv22_6
    zeros = T.zeros((num_batch, 1))
    hid1_init = T.dot(zeros, self.hid1_init)
    hid2_init = T.dot(zeros, self.hid2_init)
    # The hidden-to-hidden weight matrix is always used in step
    # (the weights are constant quantities, i.e. non-sequences)
    non_seqs = [W_hid_stacked]
    # The "peephole" weight matrices are only used when self.peepholes=True
    if self.peepholes:
        non_seqs += [self.W_cell_to_ingate, self.W_cell_to_outgate]
    # When we aren't precomputing the input outside of scan, we need to
    # provide the input weights and biases to the step function
    # (only needed when the input was not precomputed earlier)
    if not self.precompute_input:
        non_seqs += [W_in_stacked, b_stacked]
    # addv22_6
    non_seqs += [self.W_x_wg, self.b_wg]
    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        cell_out, hid_out = unroll_scan(fn=step_fun,
                                        sequences=sequences,
                                        outputs_info=[cell_init, hid_init],
                                        go_backwards=self.backwards,
                                        non_sequences=non_seqs,
                                        n_steps=input_shape[1])
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        cell_out, hid_out2, hid1_out, hid_out, n = theano.scan(
            fn=step_fun,
            sequences=sequences,  # [input, time_input, mask]
            outputs_info=[cell_init, hid_init, hid1_init, hid2_init, 0],
            go_backwards=self.backwards,
            truncate_gradient=self.gradient_steps,
            non_sequences=non_seqs,
            strict=True)[0]
    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        hid_out = hid_out[-1]
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, 2)
        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]
    return hid_out
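# NumPy sketch of what the (here commented-out) W_t1_to_tg1_constraint
# expression computes in these time-gated variants:
# T.switch(T.ge(W, boundary), W, boundary) keeps entries that are at least
# `boundary` and replaces the rest, i.e. an elementwise maximum with `boundary`.
# Values are illustrative only.
import numpy as np

boundary = 0.0
W = np.array([[-0.5, 0.2], [1.5, -2.0]])
W_constrained = np.where(W >= boundary, W, boundary)
assert np.allclose(W_constrained, np.maximum(W, boundary))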
def get_sessions(self, session_length=10, batch_size=None,
                 recorded_sequence=None, initial_hidden='zeros',
                 initial_policy='zeros', initial_actions='zeros',
                 additional_output_layers=[], **flags):
    """Returns the history of agent-generated sequences for the given number of turns.

    parameters:
        session_length - how many turns of interaction there will be for each batch
        batch_size - [required parameter] number of independent sessions
            [number or symbolic]. Irrelevant if you manually set all initial_*.
        recorded_sequence - if None, the generator actually generates output.
            If a tensor [batch_i, time_tick, ...] is passed instead, the generator
            observes this sequence instead of its own output.
        initial_<something> - initial values for all variables at the 0-th time step.
            Unless you are doing something nasty, initial policy and actions will not
            matter at all. The default 'zeros' means filling the variable with zeros.
            Initial values are NOT included in the history sequences.
        additional_output_layers - any layers of the network whose outputs need to be
            added to the outputs.
        flags: optional flags to be sent to NN when calling get_output
            (e.g. deterministic=True)

    returns:
        hidden_seq, policy_seq, action_seq, [additional_output_0, additional_output_1]
        for the hidden state, agent policy and chosen actions respectively,
        each of them having dimensions of [batch_i, seq_i, ...]
    """
    if initial_hidden == 'zeros':
        memory_state_shape = lasagne.layers.get_output_shape(self.memory)[1:]
        initial_hidden = T.zeros((batch_size,) + tuple(memory_state_shape))
    if initial_actions == 'zeros':
        initial_actions = T.zeros([batch_size], dtype='int32')

    time_ticks = T.arange(session_length)

    # recurrent step functions
    def step_active(time_tick, last_hidden, last_policy, last_action, *args):
        """A recurrent step function where the generator actually generates the sequence."""
        hidden, policy, action, additional_outputs = self.get_agent_reaction(
            last_hidden, last_action, additional_output_layers, **flags)
        return [hidden, policy, action] + additional_outputs

    def step_passive(time_tick, current_observation, last_hidden, last_policy,
                     last_action, *args):
        """A recurrent step function where the generator observes a recorded sequence
        of actions and generates possible next steps for the recorded sequence
        prefixes. Used for passive training (like a language model)."""
        hidden, policy, action, additional_outputs = self.get_agent_reaction(
            last_hidden, current_observation, additional_output_layers, **flags)
        return [hidden, policy, action] + additional_outputs

    # main recurrence loop
    # state 0 values
    additional_init = [None for i in additional_output_layers]
    outputs_info = [initial_hidden, None, initial_actions] + additional_init

    # time ticks and [optional] transposed recorded sequence [tick, batch, ...]
    sequences = [time_ticks]
    if recorded_sequence is not None:
        sequences.append(recorded_sequence.swapaxes(1, 0))

    step = step_active if recorded_sequence is None else step_passive

    history = unroll_scan(step,
                          sequences=sequences,
                          outputs_info=outputs_info,
                          non_sequences=[],
                          n_steps=session_length)
    self.history = history

    # from [time, batch, ...] to [batch, time, ...]
    history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in history]

    # what's inside:
    hidden_seq, policy_seq, action_seq = history[:3]
    additional_output_sequences = tuple(history[3:])

    return (hidden_seq, policy_seq, action_seq) + additional_output_sequences
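# Conceptual NumPy analogue (not the Lasagne implementation) of what unroll_scan
# does for the session loop above: the step function is applied a fixed number
# of times in an ordinary Python loop and the per-tick results are stacked along
# a new leading time axis. The toy step below is a hypothetical stand-in for
# get_agent_reaction.
import numpy as np

def step(tick, hidden, action):
    hidden = np.tanh(hidden + action[:, None])   # stand-in for the agent update
    action = (tick % 2) * np.ones_like(action)   # arbitrary toy policy
    return hidden, action

batch, units, session_length = 2, 3, 5
hidden = np.zeros((batch, units))
action = np.zeros(batch)
hidden_seq, action_seq = [], []
for tick in range(session_length):
    hidden, action = step(tick, hidden, action)
    hidden_seq.append(hidden)
    action_seq.append(action)
# [time, batch, ...] -> [batch, time, ...], as done after the recurrence
hidden_seq = np.stack(hidden_seq).swapaxes(1, 0)
action_seq = np.stack(action_seq).swapaxes(1, 0)
assert hidden_seq.shape == (batch, session_length, units)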
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable. Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] # Input should be provided as (n_batch, n_time_steps, n_features) # but scan requires the iterable dimension to be first # So, we need to dimshuffle to (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, *range(2, input.ndim)) seq_len, num_batch = input.shape[0], input.shape[1] if self.precompute_input: # Because the input is given for all time steps, we can precompute # the inputs to hidden before scanning. First we need to reshape # from (seq_len, batch_size, trailing dimensions...) to # (seq_len*batch_size, trailing dimensions...) # This strange use of a generator in a tuple was because # input.shape[2:] was raising a Theano error trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim)) input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims) input = helper.get_output(self.input_to_hidden, input, **kwargs) # Reshape back to (seq_len, batch_size, trailing dimensions...) 
trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim)) input = T.reshape(input, (seq_len, num_batch) + trailing_dims) # We will always pass the hidden-to-hidden layer params to step non_seqs = helper.get_all_params(self.hidden_to_hidden) non_seqs += self._get_mi_params() # When we are not precomputing the input, we also need to pass the # input-to-hidden parameters to step if not self.precompute_input: non_seqs += helper.get_all_params(self.input_to_hidden) # Create single recurrent computation step function def step(input_n, hid_previous, *args): # Compute the hidden-to-hidden activation hid_to_hid = helper.get_output(self.hidden_to_hidden, hid_previous, **kwargs) # Compute the input-to-hidden activation if self.precompute_input: # if the input is precomputed in_to_hid = input_n else: # compute the input in_to_hid = helper.get_output(self.input_to_hidden, input_n, **kwargs) # Compute the second order term if self.a_g is not None: second_order_term = (self.a_g * in_to_hid * hid_to_hid) # second_order_term = in_to_hid * hid_to_hid else: second_order_term = 0 # Compute the first order hidden-to-hidden term if self.b_g_hid_to_hid is not None: f_o_hid_to_hid = self.b_g_hid_to_hid * hid_to_hid else: f_o_hid_to_hid = 0 # Compute first order input to hidden term if self.b_g_in_to_hid is not None: f_o_in_to_hid = self.b_g_in_to_hid * in_to_hid else: # if all else is None, it will output zeros of the right size f_o_in_to_hid = T.zeros_like(in_to_hid) hid_pre = second_order_term + f_o_in_to_hid + f_o_hid_to_hid if self.b is not None: hid_pre = hid_pre + self.b return self.nonlinearity(hid_pre) def step_masked(input_n, mask_n, hid_previous, *args): # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. hid = step(input_n, hid_previous, *args) hid_out = T.switch(mask_n, hid, hid_previous) return [hid_out] if mask is not None: mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step if not isinstance(self.hid_init, Layer): # The code below simply repeats self.hid_init num_batch times in # its first dimension. Turns out using a dot product and a # dimshuffle is faster than T.repeat. dot_dims = (list(range(1, self.hid_init.ndim - 1)) + [0, self.hid_init.ndim - 1]) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init.dimshuffle(dot_dims)) if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: # Scan op iterates over first dimension of input and repeatedly # applies the step function hid_out = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim)) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
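# NumPy sketch (toy shapes; the gains are assumed to be per-unit vectors) of the
# pre-activation combination used in step above: a second-order multiplicative
# term plus two gated first-order terms and a bias, as in
# multiplicative-integration style RNNs.
import numpy as np

rng = np.random.RandomState(3)
units = 4
in_to_hid  = rng.randn(2, units)      # stand-in for W x_t (possibly precomputed)
hid_to_hid = rng.randn(2, units)      # stand-in for U h_{t-1}
a_g            = 0.5 * np.ones(units)
b_g_in_to_hid  = np.ones(units)
b_g_hid_to_hid = np.ones(units)
b              = np.zeros(units)

hid_pre = (a_g * in_to_hid * hid_to_hid
           + b_g_in_to_hid * in_to_hid
           + b_g_hid_to_hid * hid_to_hid
           + b)
hid = np.tanh(hid_pre)
assert hid.shape == (2, units)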
def get_loss(self, unroll_scan=False, cache_intermediate=True): msg = 'Building full GP loss' utils.print_with_stamp(msg, self.name) idims = self.D N = self.X.shape[0].astype(floatX) def nlml(Y, hyp, i, X, EyeN, nigp=None, y_var=None): # initialise the (before compilation) kernel function hyps = (hyp[:idims + 1], hyp[idims + 1]) kernel_func = partial(cov.Sum, hyps, self.covs) # We initialise the kernel matrices (one for each output dimension) K = kernel_func(X) # add the contribution from the input noise if nigp: K += tt.diag(nigp[i]) # add the contribution from the output uncertainty (acts as weight) if y_var: K += tt.diag(y_var[i]) # compute chol(K) L = Cholesky()(K) # compute K^-1 and (K^-1)dot(y) rhs = tt.concatenate([EyeN, Y[:, None]], axis=1) sol = solve_upper_triangular(L.T, solve_lower_triangular(L, rhs)) iK = sol[:, :-1] beta = sol[:, -1] return iK, L, beta nseq = [self.X, tt.eye(self.X.shape[0])] if self.nigp: nseq.append(self.nigp) if self.Y_var: nseq.append(self.Y_var.T) seq = [self.Y.T, self.hyp, tt.arange(self.X.shape[0])] if unroll_scan: from lasagne.utils import unroll_scan [iK, L, beta] = unroll_scan(nlml, seq, [], nseq, self.E) updts = {} else: (iK, L, beta), updts = theano.scan(fn=nlml, sequences=seq, non_sequences=nseq, allow_gc=False, strict=True, return_list=True, name="%s>logL_scan" % (self.name)) # And finally, the negative log marginal likelihood loss = 0.5 * tt.sum(self.Y.T * beta, 1) idx = [theano.tensor.arange(L.shape[i]) for i in [1, 2]] loss += tt.sum(tt.log(L[:, idx[0], idx[1]]), 1) loss += 0.5 * N * tt.log(2 * np.pi) if cache_intermediate: # we are going to save the intermediate results in the following # shared variables, so we can use them during prediction without # having to recompute them N, E = self.N, self.E if type(self.iK) is not tt.sharedvar.SharedVariable: self.iK = S(np.tile(np.eye(N, dtype=floatX), (E, 1, 1)), name="%s>iK" % (self.name)) if type(self.L) is not tt.sharedvar.SharedVariable: self.L = S(np.tile(np.eye(N, dtype=floatX), (E, 1, 1)), name="%s>L" % (self.name)) if type(self.beta) is not tt.sharedvar.SharedVariable: self.beta = S(np.ones((E, N), dtype=floatX), name="%s>beta" % (self.name)) updts = [(self.iK, iK), (self.L, L), (self.beta, beta)] else: # save intermediate graphs (in case we require grads wrt params) self.iK, self.L, self.beta = iK, L, beta updts = None # we add some penalty to avoid having parameters that are too large if self.snr_penalty is not None: penalty_params = { 'log_snr': np.log(1000, dtype=floatX), 'log_ls': np.log(100, dtype=floatX), 'log_std': tt.log(self.X.std(0) * (N / (N - 1.0))), 'p': 30 } loss += self.snr_penalty(tt.log(self.hyp), **penalty_params) inps = [] self.state_changed = True # for saving return loss.sum(), inps, updts
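# NumPy sketch of the per-output negative log marginal likelihood assembled
# above (toy data; the hyperparameters and the Sum kernel are replaced by a
# fixed positive definite K):
# 0.5*y^T K^{-1} y + sum(log(diag(chol(K)))) + 0.5*N*log(2*pi).
import numpy as np

rng = np.random.RandomState(4)
N = 6
B = rng.randn(N, N)
K = B.dot(B.T) + N * np.eye(N)
y = rng.randn(N)

L = np.linalg.cholesky(K)
beta = np.linalg.solve(L.T, np.linalg.solve(L, y))     # K^{-1} y
nlml = 0.5 * y.dot(beta) + np.log(np.diag(L)).sum() + 0.5 * N * np.log(2 * np.pi)

# cross-check against the direct formula 0.5*(y^T K^{-1} y + log det K + N log 2 pi)
sign, logdet = np.linalg.slogdet(K)
assert np.allclose(nlml, 0.5 * (y.dot(np.linalg.solve(K, y)) + logdet
                                + N * np.log(2 * np.pi)))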
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) if self.bn: input = self.bn.get_output_for(input) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update ], axis=1) # Stack gate biases into a (3*num_units) vector b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0) # Stack second order gating biases into a (3*num_units) vector a_g_stacked = T.concatenate( [self.a_g_resetgate, self.a_g_updategate, self.a_g_hidden_update], axis=0) # Stack second order gating biases into a (3*num_units) vector b_g_in_to_hid_stacked = T.concatenate([ self.b_g_in_to_hid_resetgate, self.b_g_in_to_hid_updategate, self.b_g_in_to_hid_hidden_update ], axis=0) # Stack second order gating biases into a (3*num_units) vector b_g_hid_to_hid_stacked = T.concatenate([ self.b_g_hid_to_hid_resetgate, self.b_g_hid_to_hid_updategate, self.b_g_hid_to_hid_hidden_update ], axis=0) if self.precompute_input: # precompute_input inputs*W. W_in is (n_features, 3*num_units). # input is then (n_batch, n_time_steps, 3*num_units). input = T.dot(input, W_in_stacked) # When theano.scan calls step, input_n will be (n_batch, 3*num_units). 
# We define a slicing function that extract the input to each GRU gate def slice_w(x, n): s = x[:, n * self.num_units:(n + 1) * self.num_units] if self.num_units == 1: s = T.addbroadcast(s, 1) # Theano cannot infer this by itself return s # Create single recurrent computation step function # input__n is the n'th vector of the input def step(input_n, hid_previous, *args): # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1} hid_input = T.dot(hid_previous, W_hid_stacked) if self.grad_clipping: input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping) hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping) if not self.precompute_input: # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c input_n = T.dot(input_n, W_in_stacked) # Compute the second_order_term second_order_term = a_g_stacked * input_n * hid_input # Compute the first order input-to-hidden term f_o_input = b_g_in_to_hid_stacked * input_n + b_stacked # Compute the first order hidden-to-hidden term f_o_hid_input = b_g_hid_to_hid_stacked * hid_input # Reset and update gates resetgate = (slice_w(second_order_term, 0) + slice_w(f_o_hid_input, 0) + slice_w(f_o_input, 0)) updategate = (slice_w(second_order_term, 1) + slice_w(f_o_hid_input, 1) + slice_w(f_o_input, 1)) resetgate = self.nonlinearity_resetgate(resetgate) updategate = self.nonlinearity_updategate(updategate) # Compute # (W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) + # r_t \odot (W_{xc}x_t * W_{hc} h_{t-1})) # This is different from the paper, but follows the # formulation used in Lasagne hidden_update_in = slice_w(f_o_hid_input, 2) hidden_update_hid = slice_w(f_o_hid_input, 2) hidden_update_s_o = slice_w(second_order_term, 2) hidden_update = (hidden_update_in + resetgate * (hidden_update_hid + hidden_update_s_o)) if self.grad_clipping: hidden_update = theano.gradient.grad_clip( hidden_update, -self.grad_clipping, self.grad_clipping) hidden_update = self.nonlinearity_hidden_update(hidden_update) # Compute (1 - u_t)h_{t - 1} + u_t c_t hid = (1 - updategate) * hid_previous + updategate * hidden_update return hid def step_masked(input_n, mask_n, hid_previous, *args): hid = step(input_n, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. hid = T.switch(mask_n, hid, hid_previous) return hid if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = [input] step_fun = step if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [ W_hid_stacked, a_g_stacked, b_g_in_to_hid_stacked, b_g_hid_to_hid_stacked, b_stacked ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1])[0] else: # Scan op iterates over first dimension of input and repeatedly # applies the step function hid_out = theano.scan(fn=step_fun, sequences=sequences, go_backwards=self.backwards, outputs_info=[hid_init], non_sequences=non_seqs, truncate_gradient=self.gradient_steps, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
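# NumPy sketch of the final GRU-style interpolation in step above (updategate
# and hidden_update are random stand-ins for the gated quantities):
# hid_t = (1 - u_t) * h_{t-1} + u_t * c_t keeps the new state a convex
# combination of the previous state and the candidate update.
import numpy as np

rng = np.random.RandomState(5)
h_prev = rng.randn(2, 3)
u = 1.0 / (1.0 + np.exp(-rng.randn(2, 3)))   # sigmoid-squashed update gate
c = np.tanh(rng.randn(2, 3))                  # candidate hidden update
h = (1 - u) * h_prev + u * c
assert h.shape == h_prev.shape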
def predict_symbolic(self, mx, Sx, unroll_scan=False): idims = self.D odims = self.E # centralize inputs zeta = self.X - mx # initialize some variables sf2 = self.hyp[:, idims]**2 eyeE = tt.tile(tt.eye(idims), (odims, 1, 1)) lscales = self.hyp[:, :idims] iL = eyeE / lscales.dimshuffle(0, 1, 'x') # predictive mean inp = iL.dot(zeta.T).transpose(0, 2, 1) iLdotSx = iL.dot(Sx) # TODO vectorize this B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims) t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)]) c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)])) l = tt.exp(-0.5 * tt.sum(inp * t, 2)) lb = l * self.beta # E x N dot E x N M = tt.sum(lb, 1) * c # input output covariance tiL = (t[:, :, None, :] * iL[:, None, :, :]).sum(-1) # tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)]) V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c # predictive covariance logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2) logk_r = logk.dimshuffle(0, 'x', 1) logk_c = logk.dimshuffle(0, 1, 'x') Lambda = tt.square(iL) LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2) R = tt.dot(LL, Sx).transpose(0, 1, 3, 2) + tt.eye(idims) z_ = Lambda.dot(zeta.T).transpose(0, 2, 1) M2 = tt.zeros((odims, odims)) # initialize indices triu_indices = np.triu_indices(odims) indices = [tt.as_index_variable(idx) for idx in triu_indices] def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r, z_, Sx, *args): # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 ) Rij = R[i, j] n2 = logk_c[i] + logk_r[j] n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx)) Q = tt.exp(n2) / tt.sqrt(det(Rij)) # Eq 2.55 m2 = matrix_dot(beta[i], Q, beta[j]) m2 = theano.ifelse.ifelse(tt.eq(i, j), m2 - tt.sum(iK[i] * Q) + sf2[i], m2) M2 = tt.set_subtensor(M2[i, j], m2) return M2 nseq = [self.beta, self.iK, sf2, R, logk_c, logk_r, z_, Sx, self.L] if unroll_scan: from lasagne.utils import unroll_scan [M2_] = unroll_scan(second_moments, indices, [M2], nseq, len(triu_indices[0])) updts = {} else: M2_, updts = theano.scan(fn=second_moments, sequences=indices, outputs_info=[M2], non_sequences=nseq, allow_gc=False, strict=True, name="%s>M2_scan" % (self.name)) M2 = M2_[-1] M2 = M2 + tt.triu(M2, k=1).T S = M2 - tt.outer(M, M) return M, S, V
def get_output_for(self, inputs, **kwargs): # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate ], axis=0) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=0) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0).dimshuffle('x', 0, 'x', 'x') # border_mode = (self.num_units // 2, self.num_units // 2) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (batch_size, 4*num_units, num_rows, num_columns) # (n_batch, 4*num_units, height, width). input = T.nnet.conv2d(input, W_in_stacked, None, None, subsample=(1, 1), border_mode='half', filter_flip=False) + b_stacked # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) # (height, n_batch, 4*num_units, width) # (n_batch, num_units, width) input = input.dimshuffle(2, 0, 1, 3) seq_len, num_batch = input.shape[:2] # When theano.scan calls step, input_n will be (n_batch, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.nnet.conv2d(input_n, W_in_stacked, None, None, subsample=(1, 1), border_mode='half', filter_flip=False) + b_stacked # Calculate gates pre-activations and slice hid_previous = pad(hid_previous, [(1, 0)], 0, 2) gates = input_n + conv1d_mc1(hid_previous, W_hid_stacked, None, None, subsample=(1, ), border_mode='valid', filter_flip=False) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) return [cell, hid] def step_masked(input_n, mask_n, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(mask_n, cell, cell_previous) hid = T.switch(mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') sequences = [input, mask] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) # cell_init = T.dot(ones, self.cell_init) cell_init = T.tensordot(ones, T.unbroadcast(self.cell_init, 0), axes=[1, 0]) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) # hid_init = T.dot(ones, self.hid_init) hid_init = T.tensordot(ones, T.unbroadcast(self.hid_init, 0), axes=[1, 0]) # print(self.cell_init.ndim, self.cell_init.broadcastable) # print(cell_init.ndim, cell_init.broadcastable) # print(self.hid_init.ndim, self.hid_init.broadcastable) # print(hid_init.ndim, hid_init.broadcastable) # print(self.cell_init.get_value(True).shape) # print(self.hid_init.get_value(True).shape) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, # input outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) # hid_out = hid_out.dimshuffle(1, 0, 2) hid_out = hid_out.dimshuffle(1, 2, 0, 3) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, :, ::-1] return hid_out
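# NumPy check of the initial-state tiling used above for the convolutional
# LSTM: a tensordot of a (num_batch, 1) column of ones against an init of shape
# (1, num_units, width) repeats that init across the batch dimension, which is
# what T.tensordot(ones, T.unbroadcast(self.cell_init, 0), axes=[1, 0]) does.
import numpy as np

num_batch, num_units, width = 3, 2, 5
ones = np.ones((num_batch, 1))
cell_init = np.arange(num_units * width, dtype=float).reshape(1, num_units, width)

tiled = np.tensordot(ones, cell_init, axes=[1, 0])   # (num_batch, num_units, width)
assert tiled.shape == (num_batch, num_units, width)
assert np.allclose(tiled, np.repeat(cell_init, num_batch, axis=0))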
def predict_symbolic(self, mx, Sx=None, unroll_scan=False): idims = self.D odims = self.E # initialize some variables sf2 = self.hyp[:, idims]**2 eyeE = tt.tile(tt.eye(idims), (odims, 1, 1)) lscales = self.hyp[:, :idims] iL = eyeE / lscales.dimshuffle(0, 1, 'x') if Sx is None: # first check if we received a vector [D] or a matrix [nxD] if mx.ndim == 1: mx = mx[None, :] # centralize inputs zeta = self.X[:, None, :] - mx[None, :, :] # predictive mean ( we don't need to do the rest ) inp = (iL[:, None, :, None, :] * zeta[:, None, :, :]).sum(2) l = tt.exp(-0.5 * tt.sum(inp**2, -1)) lb = l * self.beta[:, :, None] # E x N M = tt.sum(lb, 1).T * sf2 # apply saturating function to the output if available if self.sat_func is not None: # saturate the output M = self.sat_func(M) return M # centralize inputs zeta = self.X - mx # predictive mean inp = iL.dot(zeta.T).transpose(0, 2, 1) iLdotSx = iL.dot(Sx) B = (iLdotSx[:, :, None, :] * iL[:, None, :, :]).sum(-1) + tt.eye(idims) t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)]) c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)])) l = tt.exp(-0.5 * tt.sum(inp * t, 2)) lb = l * self.beta M = tt.sum(lb, 1) * c # input output covariance tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)]) V = tt.stack([tiL[i].T.dot(lb[i]) for i in range(odims)]).T * c # predictive covariance logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2) logk_r = logk.dimshuffle(0, 'x', 1) logk_c = logk.dimshuffle(0, 1, 'x') Lambda = tt.square(iL) LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2) R = tt.dot(LL, Sx).transpose(0, 1, 3, 2) + tt.eye(idims) z_ = Lambda.dot(zeta.T).transpose(0, 2, 1) M2 = tt.zeros((odims, odims)) # initialize indices triu_indices = np.triu_indices(odims) indices = [tt.as_index_variable(idx) for idx in triu_indices] def second_moments(i, j, M2, beta, R, logk_c, logk_r, z_, Sx, *args): # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 ) Rij = R[i, j] n2 = logk_c[i] + logk_r[j] n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx)) Q = tt.exp(n2) / tt.sqrt(det(Rij)) # Eq 2.55 m2 = matrix_dot(beta[i], Q, beta[j]) m2 = theano.ifelse.ifelse(tt.eq(i, j), m2 + 1e-6, m2) M2 = tt.set_subtensor(M2[i, j], m2) return M2 nseq = [self.beta, R, logk_c, logk_r, z_, Sx, self.iK, self.L] if unroll_scan: from lasagne.utils import unroll_scan [M2_] = unroll_scan(second_moments, indices, [M2], nseq, len(triu_indices[0])) updts = {} else: M2_, updts = theano.scan(fn=second_moments, sequences=indices, outputs_info=[M2], non_sequences=nseq, allow_gc=False, strict=True, name="%s>M2_scan" % (self.name)) M2 = M2_[-1] M2 = M2 + tt.triu(M2, k=1).T S = M2 - tt.outer(M, M) # apply saturating function to the output if available if self.sat_func is not None: # saturate the output M, S, U = self.sat_func(M, S) # compute the joint input output covariance V = V.dot(U) return M, S, V
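When Sx is None, the branch above reduces to the predictive mean of a squared-exponential GP for each of the E output dimensions, with self.beta playing the role of the precomputed K^{-1} y. A minimal NumPy sketch under that reading; the function name and toy shapes are illustrative, not the class API, and the hyp layout mirrors the snippet (lengthscales in hyp[:, :D], signal standard deviation at hyp[:, D]):

import numpy as np

def gp_predictive_mean(mx, X, beta, hyp):
    """mx: (n, D) test inputs, X: (N, D) training inputs, beta: (E, N),
    hyp: (E, D+1). Returns M: (n, E)."""
    D = X.shape[1]
    lengthscales = hyp[:, :D]              # (E, D)
    sf2 = hyp[:, D] ** 2                   # (E,)
    # scaled differences between every training and test point: (E, N, n, D)
    zeta = (X[None, :, None, :] - mx[None, None, :, :]) / lengthscales[:, None, None, :]
    l = np.exp(-0.5 * np.sum(zeta ** 2, axis=-1))   # unscaled kernel, (E, N, n)
    M = np.einsum('en,enm->me', beta, l) * sf2      # (n, E)
    return M

# usage with toy shapes
rng = np.random.RandomState(0)
N, D, E, n = 5, 3, 2, 4
X, mx = rng.randn(N, D), rng.randn(n, D)
beta = rng.randn(E, N)
hyp = np.abs(rng.randn(E, D + 1)) + 0.5
print(gp_predictive_mean(mx, X, beta, hyp).shape)   # (4, 2)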
def get_output_for(self, inputs, accumulate_updates="warn", recurrence_flags={}, **kwargs): """ returns history of agent interaction with environment for given number of turns. parameters: inputs - [state init] + [input_nonsequences] + [input_sequences] Each part is a list of theano expressions for layers in the order they were provided when creating this layer. recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports) e.g. {deterministic=True} returns: [state_sequences] + [output sequences] - a list of all states and all outputs sequences Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...] """ #aliases n_states = len(self.state_variables) n_state_inits = len(self.state_init) n_input_nonseq = len(self.input_nonsequences) n_input_seq = len(self.input_sequences) n_outputs = len(self.tracked_outputs) #slice inputs if self.mask_input is not None: mask, inputs = inputs[0], inputs[1:] initial_states_provided, nonsequences, sequences = unpack_list( inputs, [n_state_inits, n_input_nonseq, n_input_seq]) # infer batch size if self.batch_size is not None: batch_size = self.batch_size elif len(inputs) != 0: batch_size = inputs[0].shape[0] else: raise ValueError( "Need to set batch_size explicitly for recurrence") # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan sequences = [seq.swapaxes(1, 0) for seq in sequences] #here we create outputs_info for scan ## initial states that are given as input initial_states_provided = OrderedDict( list(zip(self.state_init, initial_states_provided))) def get_initial_state(state_out_layer, batch_size=batch_size): """Pick dedicated initial state or create zeros of appropriate shape and dtype""" # if we have a dedicated init, use it if state_out_layer in initial_states_provided: initial_state = initial_states_provided[state_out_layer] # otherwise initialize with zeros else: dtype = get_layer_dtype(state_out_layer) initial_state = T.zeros( (batch_size, ) + tuple(state_out_layer.output_shape[1:]), dtype=dtype) #cast to non-broadcastable tensortype t_state = T.TensorType(dtype, (False, ) * initial_state.ndim) initial_state = t_state.convert_variable(initial_state) assert initial_state is not None #if None, conversion failed. report ASAP return initial_state initial_states = list(map(get_initial_state, self.state_variables)) #dummy values for initial outputs. They have no role in computation, but if nonsequences are present, # AND scan is not unrolled, the step function will not receive prev outputs as parameters, while # if unroll_scan, these parameters are present. we forcibly initialize outputs to prevent # complications during parameter parsing in step function below. 
initial_output_fillers = list( map(get_initial_state, self.tracked_outputs)) outputs_info = initial_states + initial_output_fillers # recurrent step function def step(*args): sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) # make dicts of prev_states and inputs prev_states_dict = OrderedDict( zip(list(self.state_variables.keys()), prev_states)) input_layers = list( chain(self.input_nonsequences.keys(), self.input_sequences.keys())) assert len(input_layers) == len(nonsequences + sequence_slices) inputs_dict = OrderedDict( zip(input_layers, nonsequences + sequence_slices)) # call one step recurrence new_states, new_outputs = self.get_one_step( prev_states_dict, inputs_dict, **recurrence_flags) #make sure output variable is of exactly the same type as corresponding input get_type = lambda tensor: T.TensorType( tensor.dtype, tensor.broadcastable, sparse_grad=getattr(tensor.type, "sparse_grad", False)) new_states = [ get_type(prev_state).convert_variable( state.astype(prev_state.dtype)) for (prev_state, state) in zip(prev_states, new_states) ] assert None not in new_states, "Some state variables has different dtype/shape from init ." new_outputs = [ get_type(prev_out).convert_variable(out.astype(prev_out.dtype)) for (prev_out, out) in zip(prev_outputs, new_outputs) ] assert None not in new_outputs, "Some of the tracked outputs has shape/dtype changing over time. Please report this." return new_states + new_outputs ###handling mask_input### #a step function that utilizes a mask def step_masked(mask_t, *args): #unpack arrays sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) #get regular step new_states_and_outputs = step(*args) old_states_and_outputs = prev_states + prev_outputs #if mask_t, return new ones, else return old ones def apply_mask(mask_t, new_state, old_state): assert new_state.ndim == old_state.ndim ndim = new_state.ndim #append dims to mask pattern = list(range( mask_t.ndim)) + ['x'] * (ndim - mask_t.ndim) return T.switch(mask_t.dimshuffle(pattern), new_state, old_state) next_states_and_outputs = [ apply_mask(mask_t, new_state, old_state) for new_state, old_state in zip( new_states_and_outputs, old_states_and_outputs) ] return next_states_and_outputs if self.mask_input is not None: sequences = [mask.swapaxes(1, 0)] + sequences step_function = step_masked else: step_function = step #scan itself if self.unroll_scan: # call scan itself history = unroll_scan(step_function, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps) #if explicitly asked to reset updates, do so if accumulate_updates == False: self.updates = OrderedUpdates() else: history, updates = theano.scan(step_function, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps) if accumulate_updates in (True, 'warn'): self.updates += updates else: #replace updates self.updates = updates #check if user received last updates if not self._updates_received and accumulate_updates == 'warn': warn( "You called get_output from recurrence several times without gathering the updates.\n" "(A) If you wanted to get two outputs from recurrence, use NOT\n" ">>>out1 = get_output(rec[layer1])\n" ">>>out2 = get_output(rec[layer2])\n" "but instead:\n" ">>>out1,out2 = get_output((rec[layer1],rec[layer2])) #or rec[layer1,layer2].\n" "(B) If you want to run recurrence several times and accumulate 
updates from all runs," "use get_output(...,accumulate_updates=True) to silence the warning.\n" "(C) If you want to get rid of old updates, use get_output(...,accumulate_updates=False)\n" ) if len(self.updates) != 0: self._updates_received = False warn( "Recurrent loop without unroll_scan got nonempty random state updates list. That happened" " because there is some source of randomness (e.g. dropout) inside recurrent step graph." " To compile such graph, one must either call .get_automatic_updates() right after .get_output" " and pass these updates to a function when compiling theano.function.", verbosity_level=2) # reordering from [time,batch,...] to [batch,time,...] history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in check_list(history)] assert len(history) == n_states + n_outputs state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs]) # handle delayed_states # selectively shift state sequences by 1 tick into the past, padding with their initialisations for i in range(len(state_seqs)): if list(self.state_variables.keys())[i] in self.delayed_states: state_seq = state_seqs[i] state_init = initial_states[i] state_seq = T.concatenate( [insert_dim(state_init, 1), state_seq[:, :-1]], axis=1) state_seqs[i] = state_seq #keys corresponding to output sequences. Note that we do not use self.keys() to correctly # handle cases where some variable is present in both state_variables and tracked_outputs output_keys = list(self.state_variables.keys()) + list( self.tracked_outputs) output_values = state_seqs + output_seqs assert len(output_keys) == len(output_values) return OrderedDict(zip(output_keys, output_values))
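The delayed_states handling above shifts a state sequence one tick into the past, padding the front with its initialisation and dropping the final tick. A minimal NumPy sketch of just that shift (names are illustrative):

import numpy as np

def delay_state_sequence(state_seq, state_init):
    """state_seq: (batch, n_ticks, ...), state_init: (batch, ...)."""
    # insert the init at tick 0 and drop the last tick
    return np.concatenate([state_init[:, None], state_seq[:, :-1]], axis=1)

# usage: ticks [0..4] become [init, 0, 1, 2, 3]
seq = np.arange(10).reshape(2, 5)          # (batch=2, ticks=5)
init = np.array([-1, -1])
print(delay_state_sequence(seq, init))
# [[-1  0  1  2  3]
#  [-1  5  6  7  8]]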
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cov_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cov_init_incoming_index > 0: cov_init = inputs[self.cov_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 3*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate( [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate( [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1) # Stack gate biases into a (3*num_units) vector b_stacked = T.concatenate( [self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0) if self.precompute_input: # precompute_input inputs*W. W_in is (n_features, 3*num_units). # input is then (n_batch, n_time_steps, 3*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 3*num_units). 
        # We define a slicing function that extracts the input to each GRU gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, hid_previous, cov_previous, *args):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, W_hid_stacked)

            if self.grad_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n, -self.grad_clipping, self.grad_clipping)
                hid_input = theano.gradient.grad_clip(
                    hid_input, -self.grad_clipping, self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate*hidden_update_hid
            if self.grad_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hid(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate)*hid_previous + updategate*hidden_update
            # Accumulate the per-example outer product of the hidden state
            cov = (cov_previous +
                   hid.dimshuffle((0, 'x', 1)) * hid.dimshuffle((0, 1, 'x')))
            return hid, cov

        def step_masked(input_n, mask_n, hid_previous, cov_previous, *args):
            hid, cov = step(input_n, hid_previous, cov_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state and covariance; proceed normally for any input
            # with mask 1.
            hid = T.switch(mask_n, hid, hid_previous)
            cov = T.switch(mask_n, cov, cov_previous)
            return hid, cov

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = [input]
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan.
            # unroll_scan returns the list of outputs directly, so no [0]
            # indexing is needed here (unlike theano.scan below).
            hid_out, cov_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init, cov_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out, cov_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init, cov_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            cov_out = cov_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, 2)
            cov_out = cov_out.dimshuffle(1, 0, 2, 3)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]
                cov_out = cov_out[:, ::-1]

        return hid_out, cov_out
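The step function above accumulates a running outer product of the hidden state for every batch element, written in Theano with two dimshuffles and broadcasting. A minimal NumPy sketch of the same accumulation (names are illustrative):

import numpy as np

def accumulate_cov(cov_prev, hid):
    """cov_prev: (batch, n_units, n_units), hid: (batch, n_units)."""
    # (batch, 1, n_units) * (batch, n_units, 1) -> per-example outer product
    outer = hid[:, None, :] * hid[:, :, None]
    return cov_prev + outer

# usage
rng = np.random.RandomState(0)
hid = rng.randn(3, 4)
cov = accumulate_cov(np.zeros((3, 4, 4)), hid)
assert np.allclose(cov, np.einsum('bi,bj->bij', hid, hid))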
def get_output_for(self, inputs, **kwargs): """ Compute this layer's output function given a symbolic input variable Parameters ---------- inputs : list of theano.TensorType `inputs[0]` should always be the symbolic input variable. When this layer has a mask input (i.e. was instantiated with `mask_input != None`, indicating that the lengths of sequences in each batch vary), `inputs` should have length 2, where `inputs[1]` is the `mask`. The `mask` should be supplied as a Theano variable denoting whether each time step in each sequence in the batch is part of the sequence or not. `mask` should be a matrix of shape ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length of sequence i)``. When the hidden state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When the cell state of this layer is to be pre-filled (i.e. was set to a :class:`Layer` instance) `inputs` should have length at least 2, and `inputs[-1]` is the hidden state to prefill with. When both the cell state and the hidden state are being pre-filled `inputs[-2]` is the hidden state, while `inputs[-1]` is the cell state. Returns ------- layer_output : theano.TensorType Symbolic output variable. """ # Retrieve the layer input input = inputs[0] # Retrieve the mask when it is supplied mask = None hid_init = None cell_init = None if self.mask_incoming_index > 0: mask = inputs[self.mask_incoming_index] if self.hid_init_incoming_index > 0: hid_init = inputs[self.hid_init_incoming_index] if self.cell_init_incoming_index > 0: cell_init = inputs[self.cell_init_incoming_index] # Treat all dimensions after the second as flattened feature dimensions if input.ndim > 3: input = T.flatten(input, 3) if self.bn: input = self.bn.get_output_for(input) # Because scan iterates over the first dimension we dimshuffle to # (n_time_steps, n_batch, n_features) input = input.dimshuffle(1, 0, 2) seq_len, num_batch, _ = input.shape # Stack input weight matrices into a (num_inputs, 4*num_units) # matrix, which speeds up computation W_in_stacked = T.concatenate([ self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate ], axis=1) # Same for hidden weight matrices W_hid_stacked = T.concatenate([ self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate ], axis=1) # Stack biases into a (4*num_units) vector b_stacked = T.concatenate( [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0) if self.precompute_input: # Because the input is given for all time steps, we can # precompute_input the inputs dot weight matrices before scanning. # W_in_stacked is (n_features, 4*num_units). input is then # (n_time_steps, n_batch, 4*num_units). input = T.dot(input, W_in_stacked) + b_stacked # At each call to scan, input_n will be (n_time_steps, 4*num_units). 
# We define a slicing function that extract the input to each LSTM gate def slice_w(x, n): return x[:, n * self.num_units:(n + 1) * self.num_units] # Create single recurrent computation step function # input_n is the n'th vector of the input def step(input_n, cell_previous, hid_previous, *args): if not self.precompute_input: input_n = T.dot(input_n, W_in_stacked) + b_stacked # Calculate gates pre-activations and slice gates = input_n + T.dot(hid_previous, W_hid_stacked) # Clip gradients if self.grad_clipping: gates = theano.gradient.grad_clip(gates, -self.grad_clipping, self.grad_clipping) # Extract the pre-activation gate values ingate = slice_w(gates, 0) forgetgate = slice_w(gates, 1) cell_input = slice_w(gates, 2) outgate = slice_w(gates, 3) if self.peepholes: # Compute peephole connections ingate += cell_previous * self.W_cell_to_ingate forgetgate += cell_previous * self.W_cell_to_forgetgate # Apply nonlinearities ingate = self.nonlinearity_ingate(ingate) forgetgate = self.nonlinearity_forgetgate(forgetgate) cell_input = self.nonlinearity_cell(cell_input) # Compute new cell value cell = forgetgate * cell_previous + ingate * cell_input if self.peepholes: outgate += cell * self.W_cell_to_outgate outgate = self.nonlinearity_outgate(outgate) # Compute new hidden unit activation hid = outgate * self.nonlinearity(cell) return [cell, hid] def step_masked(input_n, mask_n, iter_idx, cell_previous, hid_previous, *args): cell, hid = step(input_n, cell_previous, hid_previous, *args) # if self.sleepy: # sleep_mask = T.eq(T.mod(iter_idx, (T.arange(cell.shape[-1])+1)),0) # final_mask_n = T.switch(sleep_mask, mask_n, 0) # else: final_mask_n = mask_n # Skip over any input with mask 0 by copying the previous # hidden state; proceed normally for any input with mask 1. cell = T.switch(final_mask_n, cell, cell_previous) hid = T.switch(final_mask_n, hid, hid_previous) return [cell, hid] if mask is not None: # mask is given as (batch_size, seq_len). 
Because scan iterates # over first dimension, we dimshuffle to (seq_len, batch_size) and # add a broadcastable dimension mask = mask.dimshuffle(1, 0, 'x') iter_range = T.arange(mask.shape[0]).astype('int32') sequences = [input, mask, iter_range] step_fun = step_masked else: sequences = input step_fun = step ones = T.ones((num_batch, 1)) if not isinstance(self.cell_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) cell_init = T.dot(ones, self.cell_init) if not isinstance(self.hid_init, Layer): # Dot against a 1s vector to repeat to shape (num_batch, num_units) hid_init = T.dot(ones, self.hid_init) # The hidden-to-hidden weight matrix is always used in step non_seqs = [W_hid_stacked] # The "peephole" weight matrices are only used when self.peepholes=True if self.peepholes: non_seqs += [ self.W_cell_to_ingate, self.W_cell_to_forgetgate, self.W_cell_to_outgate ] # When we aren't precomputing the input outside of scan, we need to # provide the input weights and biases to the step function if not self.precompute_input: non_seqs += [W_in_stacked, b_stacked] if self.unroll_scan: # Retrieve the dimensionality of the incoming layer input_shape = self.input_shapes[0] # Explicitly unroll the recurrence instead of using scan cell_out, hid_out = unroll_scan(fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, non_sequences=non_seqs, n_steps=input_shape[1]) else: # Scan op iterates over first dimension of input and repeatedly # applies the step function cell_out, hid_out = theano.scan( fn=step_fun, sequences=sequences, outputs_info=[cell_init, hid_init], go_backwards=self.backwards, truncate_gradient=self.gradient_steps, non_sequences=non_seqs, strict=True)[0] # When it is requested that we only return the final sequence step, # we need to slice it out immediately after scan is applied if self.only_return_final: hid_out = hid_out[-1] else: # dimshuffle back to (n_batch, n_time_steps, n_features)) hid_out = hid_out.dimshuffle(1, 0, 2) # if scan is backward reverse the output if self.backwards: hid_out = hid_out[:, ::-1] return hid_out
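Both this layer and the earlier variants broadcast a learned (1, num_units) initial state to the whole batch by dotting it with a column of ones, which repeats it across a batch size that is only known symbolically in the graph. A minimal NumPy sketch of that idiom (values are illustrative):

import numpy as np

ones = np.ones((5, 1))                   # num_batch = 5
hid_init = np.array([[0.1, -0.2, 0.3]])  # (1, num_units) learned init
hid0 = ones.dot(hid_init)                # (5, 3): every row equals hid_init
assert np.allclose(hid0, np.tile(hid_init, (5, 1)))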
def get_output_for(self, inputs, recurrence_flags={}, **kwargs): """ returns history of agent interaction with environment for given number of turns. parameters: inputs - [state init] + [input_nonsequences] + [input_sequences] Each part is a list of theano expressions for layers in the order they were provided when creating this layer. recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports) e.g. {deterministic=True} returns: [state_sequences] + [output sequences] - a list of all states and all outputs sequences Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...] """ #aliases n_states = len(self.state_variables) n_state_inits = len(self.state_init) n_input_nonseq = len(self.input_nonsequences) n_input_seq = len(self.input_sequences) n_outputs = len(self.tracked_outputs) #slice inputs if self.mask_input is not None: mask, inputs = inputs[0], inputs[1:] initial_states_provided, nonsequences, sequences = unpack_list( inputs, [n_state_inits, n_input_nonseq, n_input_seq]) # infer batch size if self.batch_size is not None: batch_size = self.batch_size elif len(inputs) != 0: batch_size = inputs[0].shape[0] else: raise ValueError( "Need to set batch_size explicitly for recurrence") # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan sequences = [seq.swapaxes(1, 0) for seq in sequences] #here we create outputs_info for scan ## initial states that are given as input initial_states_provided = OrderedDict( list(zip(self.state_init, initial_states_provided))) def get_initial_state(state_out_layer, batch_size=batch_size): """Pick dedicated initial state or create zeros of appropriate shape and dtype""" # if we have a dedicated init, use it if state_out_layer in initial_states_provided: initial_state = initial_states_provided[state_out_layer] # otherwise initialize with zeros else: #constant batch_size==1 causes T.zeros to get broadcastable, which results in an error #TODO(jheuristic) investigate a better way to do so. if (type(batch_size) is int) and (batch_size == 1): batch_size = theano.shared(batch_size) initial_state = T.zeros( (batch_size, ) + tuple(state_out_layer.output_shape[1:]), dtype=get_layer_dtype(state_out_layer)) return initial_state initial_states = list(map(get_initial_state, self.state_variables)) #dummy values for initial outputs. They have no role in computation, but if nonsequences are present, # AND scan is not unrolled, the step function will not receive prev outputs as parameters, while # if unroll_scan, these parameters are present. we forcibly initialize outputs to prevent # complications during parameter parsing in step function below. 
initial_output_fillers = list( map(get_initial_state, self.tracked_outputs)) outputs_info = initial_states + initial_output_fillers # recurrent step function def step(*args): sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) # make dicts of prev_states and inputs prev_states_dict = OrderedDict( zip(list(self.state_variables.keys()), prev_states)) input_layers = list( chain(self.input_nonsequences.keys(), self.input_sequences.keys())) assert len(input_layers) == len(nonsequences + sequence_slices) inputs_dict = OrderedDict( zip(input_layers, nonsequences + sequence_slices)) # call one step recurrence new_states, new_outputs = self.get_one_step( prev_states_dict, inputs_dict, **recurrence_flags) return new_states + new_outputs ###handling mask_input### #a step function that utilizes a mask def step_masked(mask_t, *args): #unpack arrays sequence_slices, prev_states, prev_outputs, nonsequences = \ unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq]) #get regular step new_states_and_outputs = step(*args) old_states_and_outputs = prev_states + prev_outputs #if mask_t, return new ones, else return old ones def apply_mask(mask_t, new_state, old_state): assert new_state.ndim == old_state.ndim ndim = new_state.ndim #append dims to mask pattern = list(range( mask_t.ndim)) + ['x'] * (ndim - mask_t.ndim) return T.switch(mask_t.dimshuffle(pattern), new_state, old_state) next_states_and_outputs = [ apply_mask(mask_t, new_state, old_state) for new_state, old_state in zip( new_states_and_outputs, old_states_and_outputs) ] return next_states_and_outputs if self.mask_input is not None: sequences = [mask.swapaxes(1, 0)] + sequences step_function = step_masked else: step_function = step #scan itself if self.unroll_scan: # call scan itself history = unroll_scan(step_function, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps) self.updates = OrderedDict() else: history, updates = theano.scan(step_function, sequences=sequences, outputs_info=outputs_info, non_sequences=nonsequences, n_steps=self.n_steps) self.updates = updates if len(updates) != 0: warn( "Warning: recurrent loop without unroll_scan got nonempty random state updates list. That happened" " because there is some source of randomness (e.g. dropout) inside recurrent step graph." " To compile such graph, one must either call .get_automatic_updates() right after .get_output" " and pass these updates to a function, or use no_defalt_updates=True when compiling theano.function." ) # reordering from [time,batch,...] to [batch,time,...] history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in check_list(history)] assert len(history) == n_states + n_outputs state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs]) # handle delayed_states # selectively shift state sequences by 1 tick into the past, padding with their initialisations for i in range(len(state_seqs)): if list(self.state_variables.keys())[i] in self.delayed_states: state_seq = state_seqs[i] state_init = initial_states[i] state_seq = T.concatenate( [insert_dim(state_init, 1), state_seq[:, :-1]], axis=1) state_seqs[i] = state_seq #keys corresponding to output sequences. 
Note that we do not use self.keys() to correctly # handle cases where some variable is present in both state_variables and tracked_outputs output_keys = list(self.state_variables.keys()) + list( self.tracked_outputs) output_values = state_seqs + output_seqs assert len(output_keys) == len(output_values) return OrderedDict(zip(output_keys, output_values))
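The apply_mask helper inside step_masked above right-pads the mask with broadcastable dimensions (the 'x' entries in the dimshuffle pattern) before switching between the new and the old state. A minimal NumPy sketch of the same broadcasting (names are illustrative):

import numpy as np

def apply_mask(mask_t, new_state, old_state):
    """mask_t: (batch, ...), states: same leading dims plus extra trailing dims."""
    # append singleton axes so the mask broadcasts against the state shape
    pattern = mask_t.shape + (1,) * (new_state.ndim - mask_t.ndim)
    return np.where(mask_t.reshape(pattern), new_state, old_state)

# usage: batch of 2, the second element is masked out for this tick
mask_t = np.array([1, 0])
new_state = np.full((2, 3, 4), 7.0)
old_state = np.zeros((2, 3, 4))
out = apply_mask(mask_t, new_state, old_state)
assert np.allclose(out[0], 7.0) and np.allclose(out[1], 0.0)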