def get_output_for(self, input, *args, **kwargs):
    if input.ndim > 2:
        # If the input has more than two dimensions, flatten it into a
        # batch of feature vectors.
        input = input.flatten(2)

    num_inputs = int(np.prod(self.input_shape[1:]))
    activations = []
    # The input is assumed to hold two teams of ppt players each, laid out
    # contiguously along the feature axis; ppt (players per team) must be
    # defined in the enclosing scope.
    for team in range(2):
        for i in range(ppt):
            # Each player contributes num_inputs // 2 // ppt features
            feat = num_inputs // 2 // ppt
            mid = feat*i + (num_inputs // 2)*team
            # The same weight matrix W is applied to every player slice and
            # the per-player activations are summed within each team
            if i == 0:
                activation = T.dot(input[:, mid:mid + feat], self.W)
            else:
                activation += T.dot(input[:, mid:mid + feat], self.W)
        activations.append(activation)

    if self.b is not None:
        activations[0] = activations[0] + self.b.dimshuffle('x', 0)
        activations[1] = activations[1] + self.b.dimshuffle('x', 0)
    return self.nonlinearity(concatenate(activations, axis=1))
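# A runnable NumPy sketch of the slicing logic above, under assumed sizes
# (2 teams, ppt = 3 players per team, 4 features per player, 5 hidden
# units).  It shows that one W is shared across all player slices and the
# per-player activations are summed within each team, giving one
# activation block per team.
import numpy as np

ppt = 3                                   # players per team (assumed)
num_inputs, num_units = 2*ppt*4, 5        # 4 features per player
x = np.random.randn(8, num_inputs)        # batch of 8 examples
W = np.random.randn(num_inputs // 2 // ppt, num_units)

activations = []
for team in range(2):
    feat = num_inputs // 2 // ppt
    team_offset = (num_inputs // 2)*team
    acts = sum(x[:, team_offset + feat*i:team_offset + feat*(i + 1)].dot(W)
               for i in range(ppt))
    activations.append(acts)
out = np.concatenate(activations, axis=1)  # shape (8, 2*num_units) = (8, 10)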
def get_output_for(self, input_fwd, mask=None, blstm_hooks=None,
                   *args, **kwargs):
    '''
    Compute this layer's output function given a symbolic input variable.

    :parameters:
        - input_fwd : theano.TensorType
            Symbolic input variable
        - mask : theano.TensorType
            Theano variable denoting whether each time step in each
            sequence in the batch is part of the sequence or not.  This is
            needed when scanning backwards.  If all sequences are of the
            same length, it should be all 1s.

    :returns:
        - layer_output : theano.TensorType
            Symbolic output variable
    '''
    mask_fwd = mask
    assert mask_fwd is not None, "Mask must be given for bidirectional layer"

    # Treat all dimensions after the second as flattened feature dimensions
    if input_fwd.ndim > 3:
        input_fwd = input_fwd.reshape((input_fwd.shape[0],
                                       input_fwd.shape[1],
                                       T.prod(input_fwd.shape[2:])))

    # Input is provided as (n_batch, n_time_steps, n_features).  Because
    # scan iterates over the first dimension, dimshuffle to
    # (n_time_steps, n_batch, n_features).  The backward input is the
    # forward input flipped along the time axis.
    input_fwd = input_fwd.dimshuffle(1, 0, 2)
    input_bck = input_fwd[::-1, :, :]

    # Precompute input*W + b for both directions.  W_in_to_gates is
    # (n_features, 4*num_units), so input dot W is
    # (n_time_steps, n_batch, 4*num_units).
    input_dot_W_fwd = T.dot(input_fwd, self.W_in_to_gates[0]) + self.b_gates[0]
    input_dot_W_bck = T.dot(input_bck, self.W_in_to_gates[1]) + self.b_gates[1]

    # Mask is given as (batch_size, seq_len) or (batch_size, seq_len, 1).
    # Because scan iterates over the first dimension, a 2d mask is
    # dimshuffled to (seq_len, batch_size) and given a broadcastable third
    # dimension.  If the mask is 3d, its third dimension must already be
    # broadcastable.
    if mask_fwd.ndim == 2:
        mask_fwd = mask_fwd.dimshuffle(1, 0, 'x')
    else:
        assert mask_fwd.broadcastable == (False, False, True), \
            "When mask is 3d the last dimension must be broadcastable"
        mask_fwd = mask_fwd.dimshuffle(1, 0, 2)
    mask_bck = mask_fwd[::-1, :]  # flip along the time axis

    # input_dot_W is (n_time_steps, n_batch, 4*num_units).  slice_w
    # extracts the input to each LSTM gate; slice_c does the same for the
    # peephole weight vectors.
    def slice_w(x, n):
        return x[:, n*self.num_units:(n + 1)*self.num_units]

    def slice_c(x, n):
        return x[n*self.num_units:(n + 1)*self.num_units]

    # Create the single recurrent computation step function, which
    # calculates both the forward and the backward pass.
    # The step function calculates the following:
    #
    # i_t = \sigma(W_{xi}x_t + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
    # f_t = \sigma(W_{xf}x_t + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
    # c_t = f_t c_{t-1} + i_t \tanh(W_{xc}x_t + W_{hc}h_{t-1} + b_c)
    # o_t = \sigma(W_{xo}x_t + W_{ho}h_{t-1} + W_{co}c_t + b_o)
    # h_t = o_t \tanh(c_t)
    #
    # Gate names are taken from http://arxiv.org/abs/1409.2329, figure 1.
    def dostep(input_dot_W_n, cell_previous, hid_previous,
               W_hid_to_gates, W_cell_to_gates):
        # Calculate the gate pre-activations and slice out each gate
        gates = input_dot_W_n + T.dot(hid_previous, W_hid_to_gates)
        ingate = slice_w(gates, 0)
        forgetgate = slice_w(gates, 1)
        modulationgate = slice_w(gates, 2)
        outgate = slice_w(gates, 3)

        # Peephole connections see the previous cell state
        if self.peepholes:
            ingate += cell_previous*slice_c(W_cell_to_gates, 0)
            forgetgate += cell_previous*slice_c(W_cell_to_gates, 1)

        ingate = self.nonlinearity_ingate(ingate)
        forgetgate = self.nonlinearity_forgetgate(forgetgate)
        modulationgate = self.nonlinearity_modulationgate(modulationgate)

        cell = forgetgate*cell_previous + ingate*modulationgate
        # The output gate peephole sees the updated cell state
        if self.peepholes:
            outgate += cell*slice_c(W_cell_to_gates, 2)
        outgate = self.nonlinearity_outgate(outgate)
        hid = outgate*self.nonlinearity_out(cell)
        return cell, hid

    # The order of the recurrent state arguments must match the order of
    # outputs_info and of the values returned by step
    def step(input_dot_W_fwd_n, input_dot_W_bck_n, mask_fwd_n, mask_bck_n,
             cell_previous_fwd, cell_previous_bck,
             hid_previous_fwd, hid_previous_bck,
             W_hid_to_gates, W_cell_to_gates):
        # Forward pass
        cell_fwd, hid_fwd = dostep(
            input_dot_W_fwd_n, cell_previous_fwd, hid_previous_fwd,
            W_hid_to_gates[0], W_cell_to_gates[0])
        # Backward pass
        cell_bck, hid_bck = dostep(
            input_dot_W_bck_n, cell_previous_bck, hid_previous_bck,
            W_hid_to_gates[1], W_cell_to_gates[1])

        # If mask is 0, keep the previous state until mask = 1 is found.
        # This propagates the layer's initial state when moving backwards
        # until the end of the sequence is found.
        not_mask_bck_n = 1 - mask_bck_n
        not_mask_fwd_n = 1 - mask_fwd_n
        cell_bck = cell_bck*mask_bck_n + cell_previous_bck*not_mask_bck_n
        cell_fwd = cell_fwd*mask_fwd_n + cell_previous_fwd*not_mask_fwd_n
        hid_bck = hid_bck*mask_bck_n + hid_previous_bck*not_mask_bck_n
        hid_fwd = hid_fwd*mask_fwd_n + hid_previous_fwd*not_mask_fwd_n

        return [cell_fwd, cell_bck, hid_fwd, hid_bck]

    sequences = [input_dot_W_fwd, input_dot_W_bck, mask_fwd, mask_bck]
    init = [self.cell_init_fwd, self.cell_init_bck,
            self.hid_init_fwd, self.hid_init_bck]

    # The scan op iterates over the first dimension of each sequence and
    # repeatedly applies the step function
    nonseqs = [self.W_hid_to_gates, self.W_cell_to_gates]
    scan_out = theano.scan(step, sequences=sequences, outputs_info=init,
                           non_sequences=nonseqs)

    # Each output is (n_time_steps, n_batch, n_units)
    output_hid_fwd = scan_out[0][2]
    output_hid_bck = scan_out[0][3]

    # Reverse the backward output so both directions run from x_1 to x_n,
    # then concatenate the forward and backward outputs
    output_hid_bck = output_hid_bck[::-1, :, :]
    output_hid = utils.concatenate([output_hid_fwd, output_hid_bck], axis=2)
    self.output_hid = output_hid
    self.output_hid.name = "BidirectionalLSTMLayer: output_hid"

    # Now, dimshuffle back to (n_batch, n_time_steps, 2*n_units)
    output_hid = output_hid.dimshuffle(1, 0, 2)

    if self.returncell:
        output_cell_fwd = scan_out[0][0]
        output_cell_bck = scan_out[0][1]
        output_cell_bck = output_cell_bck[::-1, :, :]
        output_cell = utils.concatenate(
            [output_cell_fwd, output_cell_bck], axis=2)
        output_cell = output_cell.dimshuffle(1, 0, 2)
        self.output_cell = output_cell
        self.output_cell.name = "BidirectionalLSTMLayer: output_cell"
        return output_cell, output_hid
    else:
        return output_hid
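# A runnable NumPy sketch of the masking rule used in step() above: where
# the mask is 0, the previous state is carried through unchanged, so padded
# timesteps never alter the recurrent state.  Sizes are illustrative.
import numpy as np

num_units = 4
hid_previous = np.ones((2, num_units))    # state from the previous step
hid_candidate = np.zeros((2, num_units))  # state proposed by the LSTM step
mask_n = np.array([[1.], [0.]])           # sample 2 is padding at this step

hid = hid_candidate*mask_n + hid_previous*(1 - mask_n)
# Row 0 takes the new state; row 1 keeps its previous state:
assert (hid[0] == 0).all() and (hid[1] == 1).all()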
def __init__(self, input_layer, num_units,
             W_in_to_ingate=init.Normal(0.1),
             W_hid_to_ingate=init.Normal(0.1),
             W_cell_to_ingate=init.Normal(0.1),
             b_ingate=init.Normal(0.1),
             nonlinearity_ingate=nonlinearities.sigmoid,
             W_in_to_forgetgate=init.Normal(0.1),
             W_hid_to_forgetgate=init.Normal(0.1),
             W_cell_to_forgetgate=init.Normal(0.1),
             b_forgetgate=init.Normal(0.1),
             nonlinearity_forgetgate=nonlinearities.sigmoid,
             W_in_to_cell=init.Normal(0.1),
             W_hid_to_cell=init.Normal(0.1),
             b_cell=init.Normal(0.1),
             nonlinearity_cell=nonlinearities.tanh,
             W_in_to_outgate=init.Normal(0.1),
             W_hid_to_outgate=init.Normal(0.1),
             W_cell_to_outgate=init.Normal(0.1),
             b_outgate=init.Normal(0.1),
             nonlinearity_outgate=nonlinearities.sigmoid,
             nonlinearity_out=nonlinearities.tanh,
             cell_init=init.Constant(0.),
             hid_init=init.Constant(0.),
             backwards=False,
             learn_init=False,
             peepholes=True,
             gradient_steps=-1):
    r"""
    Initialize an LSTM layer.  For details on what the parameters mean,
    see (7-11) from [#graves2014generating]_.

    :parameters:
        - input_layer : layers.Layer
            Input to this recurrent layer
        - num_units : int
            Number of hidden units
        - W_in_to_ingate : function or np.ndarray or theano.shared
            :math:`W_{xi}`
        - W_hid_to_ingate : function or np.ndarray or theano.shared
            :math:`W_{hi}`
        - W_cell_to_ingate : function or np.ndarray or theano.shared
            :math:`W_{ci}`
        - b_ingate : function or np.ndarray or theano.shared
            :math:`b_i`
        - nonlinearity_ingate : function
            :math:`\sigma`
        - W_in_to_forgetgate : function or np.ndarray or theano.shared
            :math:`W_{xf}`
        - W_hid_to_forgetgate : function or np.ndarray or theano.shared
            :math:`W_{hf}`
        - W_cell_to_forgetgate : function or np.ndarray or theano.shared
            :math:`W_{cf}`
        - b_forgetgate : function or np.ndarray or theano.shared
            :math:`b_f`
        - nonlinearity_forgetgate : function
            :math:`\sigma`
        - W_in_to_cell : function or np.ndarray or theano.shared
            :math:`W_{xc}`
        - W_hid_to_cell : function or np.ndarray or theano.shared
            :math:`W_{hc}`
        - b_cell : function or np.ndarray or theano.shared
            :math:`b_c`
        - nonlinearity_cell : function
            :math:`\tanh`
        - W_in_to_outgate : function or np.ndarray or theano.shared
            :math:`W_{xo}`
        - W_hid_to_outgate : function or np.ndarray or theano.shared
            :math:`W_{ho}`
        - W_cell_to_outgate : function or np.ndarray or theano.shared
            :math:`W_{co}`
        - b_outgate : function or np.ndarray or theano.shared
            :math:`b_o`
        - nonlinearity_outgate : function
            :math:`\sigma`
        - nonlinearity_out : function
            :math:`\tanh`
        - cell_init : function or np.ndarray or theano.shared
            :math:`c_0`
        - hid_init : function or np.ndarray or theano.shared
            :math:`h_0`
        - backwards : boolean
            If True, process the sequence backwards and then reverse the
            output again such that the output from the layer is always
            from x_1 to x_n.
        - learn_init : boolean
            If True, initial hidden values are learned
        - peepholes : boolean
            If True, the LSTM uses peephole connections.  When False,
            W_cell_to_ingate, W_cell_to_forgetgate and W_cell_to_outgate
            are ignored.
        - gradient_steps : int
            Number of timesteps to include in the backpropagated gradient.
            If -1, backpropagate through the entire sequence.
    """
    # Initialize parent layer
    super(LSTMLayer, self).__init__(input_layer)

    # For any of the nonlinearities, if None is supplied, use identity
    if nonlinearity_ingate is None:
        self.nonlinearity_ingate = nonlinearities.identity
    else:
        self.nonlinearity_ingate = nonlinearity_ingate

    if nonlinearity_forgetgate is None:
        self.nonlinearity_forgetgate = nonlinearities.identity
    else:
        self.nonlinearity_forgetgate = nonlinearity_forgetgate

    if nonlinearity_cell is None:
        self.nonlinearity_cell = nonlinearities.identity
    else:
        self.nonlinearity_cell = nonlinearity_cell

    if nonlinearity_outgate is None:
        self.nonlinearity_outgate = nonlinearities.identity
    else:
        self.nonlinearity_outgate = nonlinearity_outgate

    if nonlinearity_out is None:
        self.nonlinearity_out = nonlinearities.identity
    else:
        self.nonlinearity_out = nonlinearity_out

    self.learn_init = learn_init
    self.num_units = num_units
    self.backwards = backwards
    self.peepholes = peepholes
    self.gradient_steps = gradient_steps

    # Input dimensionality is the output dimensionality of the input layer
    (num_batch, _, num_inputs) = self.input_layer.get_output_shape()

    # Initialize parameters using the supplied args
    self.W_in_to_ingate = self.create_param(
        W_in_to_ingate, (num_inputs, num_units), name="W_in_to_ingate")
    self.W_hid_to_ingate = self.create_param(
        W_hid_to_ingate, (num_units, num_units), name="W_hid_to_ingate")
    self.b_ingate = self.create_param(
        b_ingate, (num_units,), name="b_ingate")

    self.W_in_to_forgetgate = self.create_param(
        W_in_to_forgetgate, (num_inputs, num_units),
        name="W_in_to_forgetgate")
    self.W_hid_to_forgetgate = self.create_param(
        W_hid_to_forgetgate, (num_units, num_units),
        name="W_hid_to_forgetgate")
    self.b_forgetgate = self.create_param(
        b_forgetgate, (num_units,), name="b_forgetgate")

    self.W_in_to_cell = self.create_param(
        W_in_to_cell, (num_inputs, num_units), name="W_in_to_cell")
    self.W_hid_to_cell = self.create_param(
        W_hid_to_cell, (num_units, num_units), name="W_hid_to_cell")
    self.b_cell = self.create_param(
        b_cell, (num_units,), name="b_cell")

    self.W_in_to_outgate = self.create_param(
        W_in_to_outgate, (num_inputs, num_units), name="W_in_to_outgate")
    self.W_hid_to_outgate = self.create_param(
        W_hid_to_outgate, (num_units, num_units), name="W_hid_to_outgate")
    self.b_outgate = self.create_param(
        b_outgate, (num_units,), name="b_outgate")

    # Stack input-to-gate weight matrices into a (num_inputs, 4*num_units)
    # matrix, which speeds up computation
    self.W_in_to_gates = utils.concatenate(
        [self.W_in_to_ingate, self.W_in_to_forgetgate,
         self.W_in_to_cell, self.W_in_to_outgate], axis=1)

    # Same for hidden-to-gate weight matrices
    self.W_hid_to_gates = utils.concatenate(
        [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
         self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

    # Stack gate biases into a (4*num_units,) vector
    self.b_gates = utils.concatenate(
        [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],
        axis=0)

    # Initialize peephole (cell to gate) connections.  These are
    # elementwise products with the cell state, so they are represented as
    # vectors.
    if self.peepholes:
        self.W_cell_to_ingate = self.create_param(
            W_cell_to_ingate, (num_units,), name="W_cell_to_ingate")
        self.W_cell_to_forgetgate = self.create_param(
            W_cell_to_forgetgate, (num_units,), name="W_cell_to_forgetgate")
        self.W_cell_to_outgate = self.create_param(
            W_cell_to_outgate, (num_units,), name="W_cell_to_outgate")

    # Set up initial values for the cell and the hidden units
    self.cell_init = self.create_param(
        cell_init, (num_batch, num_units), name="cell_init")
    self.hid_init = self.create_param(
        hid_init, (num_batch, num_units), name="hid_init")
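# A runnable NumPy sketch of the gate-stacking trick used in __init__
# above: the four (num_inputs, num_units) input-to-gate matrices are
# concatenated into one (num_inputs, 4*num_units) matrix so that a single
# dot product computes all four gate pre-activations, which are then
# recovered by slicing (the same indexing as the slice_w helper above).
# Sizes are illustrative.
import numpy as np

num_inputs, num_units = 6, 3
Ws = [np.random.randn(num_inputs, num_units) for _ in range(4)]  # i, f, c, o
W_in_to_gates = np.concatenate(Ws, axis=1)  # shape (6, 12)

x = np.random.randn(5, num_inputs)          # batch of 5 inputs
gates = x.dot(W_in_to_gates)                # one dot for all four gates
ingate = gates[:, 0*num_units:1*num_units]
assert np.allclose(ingate, x.dot(Ws[0]))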