Example No. 1
    def get_output_for(self, input, *args, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        num_inputs = int(np.prod(self.input_shape[1:]))
        activations = []
        for team in range(2):
            for i in range(ppt):
                # ppt (players per team) is assumed to be defined elsewhere
                feat = num_inputs // 2 // ppt
                mid = feat * i + num_inputs // 2 * team
                if i == 0:
                    activation = T.dot(input[:,mid:mid+feat], self.W)
                else:
                    activation += T.dot(input[:,mid:mid+feat], self.W)
            activations.append(activation)
        if self.b is not None:
            activations[0] = activations[0] + self.b.dimshuffle('x', 0)
            activations[1] = activations[1] + self.b.dimshuffle('x', 0)
        return self.nonlinearity(concatenate(activations, axis=1))
Example No. 2
    def get_output_for(self, input, *args, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)

        num_inputs = int(np.prod(self.input_shape[1:]))
        activations = []
        for team in range(2):
            for i in range(ppt):
                # ppt (players per team) is assumed to be defined elsewhere
                feat = num_inputs // 2 // ppt
                mid = feat * i + num_inputs // 2 * team
                if i == 0:
                    activation = T.dot(input[:, mid:mid + feat], self.W)
                else:
                    activation += T.dot(input[:, mid:mid + feat], self.W)
            activations.append(activation)
        if self.b is not None:
            activations[0] = activations[0] + self.b.dimshuffle('x', 0)
            activations[1] = activations[1] + self.b.dimshuffle('x', 0)
        return self.nonlinearity(concatenate(activations, axis=1))
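
Both examples above apply one shared weight matrix W to every player's feature block: the flattened input is split into 2 teams of ppt players, each block of feat features is multiplied by the same W, and the per-player activations are summed within a team before the two team activations are concatenated. A minimal numpy sketch of that slicing arithmetic, with hypothetical sizes (batch, ppt, feat, num_units are made up for illustration):

import numpy as np

batch, ppt, feat, num_units = 4, 3, 5, 8
num_inputs = 2 * ppt * feat                      # 2 teams * ppt players * feat features
X = np.random.randn(batch, num_inputs)
W = np.random.randn(feat, num_units)             # one weight matrix shared by every player

activations = []
for team in range(2):
    activation = np.zeros((batch, num_units))
    for i in range(ppt):
        mid = feat * i + (num_inputs // 2) * team
        activation += X[:, mid:mid + feat] @ W   # same W for every player slice
    activations.append(activation)

out = np.concatenate(activations, axis=1)        # (batch, 2 * num_units)
print(out.shape)                                 # (4, 16)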
Example No. 3
    def get_output_for(self, input_fwd, mask=None, blstm_hooks=None,
                       *args, **kwargs):
        '''
        Compute this layer's output function given a symbolic input variable

        :parameters:
            - input_fwd : theano.TensorType
                Symbolic input variable
            - mask : theano.TensorType
                Theano variable denoting whether each time step in each
                sequence in the batch is part of the sequence or not.  This is
                needed when scanning backwards.  If all sequences are of the
                same length, it should be all 1s.

        :returns:
            - layer_output : theano.TensorType
                Symbolic output variable
        '''
        mask_fwd = mask
        assert mask_fwd is not None, "Mask must be given for bidirectional layer"

        # Treat all dimensions after the second as flattened feature dimensions
        if input_fwd.ndim > 3:
            input_fwd = input_fwd.reshape((input_fwd.shape[0], input_fwd.shape[1],
                                           T.prod(input_fwd.shape[2:])))

        # Precompute input*W.  The input is provided as
        # (n_batch, n_time_steps, n_features) and W_in_to_gates is
        # (n_features, 4*num_units), so input dot W is
        # (n_batch, n_time_steps, 4*num_units).  Because scan iterates over
        # the first dimension, we dimshuffle to (n_time_steps, n_batch,
        # n_features) and flip the input (and mask) for the backward pass.
        input_fwd = input_fwd.dimshuffle(1, 0, 2)
        input_bck = input_fwd[::-1, :, :]

        input_dot_W_fwd = T.dot(
            input_fwd, self.W_in_to_gates[0])
        input_dot_W_bck = T.dot(
            input_bck, self.W_in_to_gates[1])
        input_dot_W_fwd += self.b_gates[0]
        input_dot_W_bck += self.b_gates[1]


        # The mask is given as (batch_size, seq_len) or (batch_size, seq_len, 1).
        # Because scan iterates over the first dimension, if the mask is 2d we
        # dimshuffle to (seq_len, batch_size) and add a broadcastable
        # dimension.  If it is 3d we assume the third dimension is
        # broadcastable.

        if mask_fwd.ndim == 2:
            mask_fwd = mask_fwd.dimshuffle(1, 0, 'x')
        else:
            assert mask_fwd.broadcastable == (False, False, True), \
                "When mask is 3d the last dimension must be broadcastable"
            mask_fwd = mask_fwd.dimshuffle(1, 0, 2)
        mask_bck = mask_fwd[::-1, :]   # reverse


        # input_dot_W is (n_time_steps, n_batch, 4*num_units).  We define a
        # slicing function that extracts the input to each LSTM gate.
        # slice_c is similar but for the peephole weights.
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]
        def slice_c(x, n):
            return x[n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function.
        # Calculates both the forward and the backward pass.
        # The step function calculates the following:
        #
        # i_t = \sigma(W_{xi}x_t + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
        # f_t = \sigma(W_{xf}x_t + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
        # c_t = f_t c_{t-1} + i_t \tanh(W_{xc}x_t + W_{hc}h_{t-1} + b_c)
        # o_t = \sigma(W_{xo}x_t + W_{ho}h_{t-1} + W_{co}c_t + b_o)
        # h_t = o_t \tanh(c_t)
        #
        # Gate names are taken from http://arxiv.org/abs/1409.2329 figure 1
        def dostep(input_dot_W_n, cell_previous, hid_previous,
                    W_hid_to_gates, W_cell_to_gates):

            # calculate gates pre-activations and slice
            gates = input_dot_W_n + T.dot(hid_previous, W_hid_to_gates)
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            modulationgate = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                ingate += cell_previous*slice_c(W_cell_to_gates, 0)
                forgetgate += cell_previous*slice_c(W_cell_to_gates, 1)

            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            modulationgate = self.nonlinearity_modulationgate(modulationgate)

            cell = forgetgate*cell_previous + ingate*modulationgate
            if self.peepholes:
                outgate += cell*slice_c(W_cell_to_gates, 2)
            outgate = self.nonlinearity_outgate(outgate)

            hid = outgate*self.nonlinearity_out(cell)
            return cell, hid


        def step(input_dot_W_fwd, input_dot_W_bck, mask_fwd_n, mask_bck_n,
                cell_previous_fwd, hid_previous_fwd,
                cell_previous_bck, hid_previous_bck,
                W_hid_to_gates, W_cell_to_gates):

            #forward
            cell_fwd, hid_fwd = dostep(
                input_dot_W_fwd, cell_previous_fwd, hid_previous_fwd,
                       W_hid_to_gates[0], W_cell_to_gates[0])
            # backward
            cell_bck, hid_bck = dostep(
                input_dot_W_bck, cell_previous_bck, hid_previous_bck,
                W_hid_to_gates[1], W_cell_to_gates[1])

            # If mask is 0, use previous state until mask = 1 is found.
            # This propagates the layer initial state when moving backwards
            # until the end of the sequence is found.
            not_mask_bck_n = 1 - mask_bck_n
            not_mask_fwd_n = 1 - mask_fwd_n
            cell_bck = cell_bck*mask_bck_n + cell_previous_bck*not_mask_bck_n
            cell_fwd = cell_fwd*mask_fwd_n + cell_previous_fwd*not_mask_fwd_n
            hid_bck = hid_bck*mask_bck_n + hid_previous_bck*not_mask_bck_n
            hid_fwd = hid_fwd*mask_fwd_n + hid_previous_fwd*not_mask_fwd_n

            return [cell_fwd, cell_bck, hid_fwd, hid_bck] 

        sequences = [input_dot_W_fwd, input_dot_W_bck, mask_fwd, mask_bck]
        init = [self.cell_init_fwd, self.cell_init_bck,
                self.hid_init_fwd, self.hid_init_bck]

        # The scan op iterates over the first dimension of the input and
        # repeatedly applies the step function
        nonseqs = [self.W_hid_to_gates, self.W_cell_to_gates]
        scan_out = theano.scan(step, sequences=sequences, outputs_info=init,
                               non_sequences=nonseqs)

        # output is (n_time_steps, n_batch, n_units)
        output_hid_fwd = scan_out[0][2]
        output_hid_bck = scan_out[0][3]

        # reverse bck output
        output_hid_bck = output_hid_bck[::-1, :, :]

        # concatenate fwd and bck
        output_hid = utils.concatenate([output_hid_fwd, output_hid_bck], axis=2)
        self.output_hid = output_hid
        self.output_hid.name = "BidirectionalLSTMLayer: output_hid"

        # Now, dimshuffle back to (n_batch, n_time_steps, 2*n_units)
        output_hid = output_hid.dimshuffle(1, 0, 2)

        if self.returncell:
            output_cell_fwd = scan_out[0][0]
            output_cell_bck = scan_out[0][1]
            output_cell_bck = output_cell_bck[::-1, :, :]
            output_cell = utils.concatenate(
                [output_cell_fwd, output_cell_bck], axis=2)
            output_cell = output_cell.dimshuffle(1, 0, 2)
            self.output_cell = output_cell
            self.output_cell.name = "BidirectionalLSTMLayer: output_cell"
            return output_cell, output_hid
        else:
            return output_hid
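
The tail of the method follows the standard bidirectional pattern: the backward scan runs on a time-reversed input, so its hidden states are reversed back before being concatenated with the forward states along the unit axis, and the result is dimshuffled back to batch-major order. A minimal numpy sketch of that reverse-and-concatenate step (shapes are hypothetical; the layer does this on symbolic Theano tensors):

import numpy as np

n_time, n_batch, n_units = 6, 2, 4
hid_fwd = np.random.randn(n_time, n_batch, n_units)   # scan output on the forward input
hid_bck = np.random.randn(n_time, n_batch, n_units)   # scan output on the time-reversed input

hid_bck = hid_bck[::-1, :, :]                          # undo the time reversal
output = np.concatenate([hid_fwd, hid_bck], axis=2)    # (n_time, n_batch, 2*n_units)
output = output.transpose(1, 0, 2)                     # back to (n_batch, n_time, 2*n_units)
print(output.shape)                                    # (2, 6, 8)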
Example No. 4
    def __init__(
        self,
        input_layer,
        num_units,
        W_in_to_ingate=init.Normal(0.1),
        W_hid_to_ingate=init.Normal(0.1),
        W_cell_to_ingate=init.Normal(0.1),
        b_ingate=init.Normal(0.1),
        nonlinearity_ingate=nonlinearities.sigmoid,
        W_in_to_forgetgate=init.Normal(0.1),
        W_hid_to_forgetgate=init.Normal(0.1),
        W_cell_to_forgetgate=init.Normal(0.1),
        b_forgetgate=init.Normal(0.1),
        nonlinearity_forgetgate=nonlinearities.sigmoid,
        W_in_to_cell=init.Normal(0.1),
        W_hid_to_cell=init.Normal(0.1),
        b_cell=init.Normal(0.1),
        nonlinearity_cell=nonlinearities.tanh,
        W_in_to_outgate=init.Normal(0.1),
        W_hid_to_outgate=init.Normal(0.1),
        W_cell_to_outgate=init.Normal(0.1),
        b_outgate=init.Normal(0.1),
        nonlinearity_outgate=nonlinearities.sigmoid,
        nonlinearity_out=nonlinearities.tanh,
        cell_init=init.Constant(0.0),
        hid_init=init.Constant(0.0),
        backwards=False,
        learn_init=False,
        peepholes=True,
        gradient_steps=-1,
    ):
        """
        Initialize an LSTM layer.  For details on what the parameters mean, see
        (7-11) from [#graves2014generating]_.

        :parameters:
            - input_layer : layers.Layer
                Input to this recurrent layer
            - num_units : int
                Number of hidden units
            - W_in_to_ingate : function or np.ndarray or theano.shared
                :math:`W_{xi}`
            - W_hid_to_ingate : function or np.ndarray or theano.shared
                :math:`W_{hi}`
            - W_cell_to_ingate : function or np.ndarray or theano.shared
                :math:`W_{ci}`
            - b_ingate : function or np.ndarray or theano.shared
                :math:`b_i`
            - nonlinearity_ingate : function
                :math:`\sigma`
            - W_in_to_forgetgate : function or np.ndarray or theano.shared
                :math:`W_{xf}`
            - W_hid_to_forgetgate : function or np.ndarray or theano.shared
                :math:`W_{hf}`
            - W_cell_to_forgetgate : function or np.ndarray or theano.shared
                :math:`W_{cf}`
            - b_forgetgate : function or np.ndarray or theano.shared
                :math:`b_f`
            - nonlinearity_forgetgate : function
                :math:`\sigma`
            - W_in_to_cell : function or np.ndarray or theano.shared
                :math:`W_{xc}`
            - W_hid_to_cell : function or np.ndarray or theano.shared
                :math:`W_{hc}`
            - b_cell : function or np.ndarray or theano.shared
                :math:`b_c`
            - nonlinearity_cell : function
                :math:`\tanh`
            - W_in_to_outgate : function or np.ndarray or theano.shared
                :math:`W_{xo}`
            - W_hid_to_outgate : function or np.ndarray or theano.shared
                :math:`W_{ho}`
            - W_cell_to_outgate : function or np.ndarray or theano.shared
                :math:`W_{co}`
            - b_outgate : function or np.ndarray or theano.shared
                :math:`b_o`
            - nonlinearity_outgate : function
                :math:`\sigma`
            - nonlinearity_out : function
                :math:`\tanh`
            - cell_init : function or np.ndarray or theano.shared
                :math:`c_0`
            - hid_init : function or np.ndarray or theano.shared
                :math:`h_0`
            - backwards : boolean
                If True, process the sequence backwards and then reverse the
                output again such that the output from the layer is always
                from x_1 to x_n.
            - learn_init : boolean
                If True, initial hidden values are learned
            - peepholes : boolean
                If True, the LSTM uses peephole connections.
                When False, W_cell_to_ingate, W_cell_to_forgetgate and
                W_cell_to_outgate are ignored.
            - gradient_steps : int
                Number of timesteps to include in the backpropagated gradient.
                If -1, backpropagate through the entire sequence.
        """

        # Initialize parent layer
        super(LSTMLayer, self).__init__(input_layer)

        # For any of the nonlinearities, if None is supplied, use identity
        if nonlinearity_ingate is None:
            self.nonlinearity_ingate = nonlinearities.identity
        else:
            self.nonlinearity_ingate = nonlinearity_ingate

        if nonlinearity_forgetgate is None:
            self.nonlinearity_forgetgate = nonlinearities.identity
        else:
            self.nonlinearity_forgetgate = nonlinearity_forgetgate

        if nonlinearity_cell is None:
            self.nonlinearity_cell = nonlinearities.identity
        else:
            self.nonlinearity_cell = nonlinearity_cell

        if nonlinearity_outgate is None:
            self.nonlinearity_outgate = nonlinearities.identity
        else:
            self.nonlinearity_outgate = nonlinearity_outgate

        if nonlinearity_out is None:
            self.nonlinearity_out = nonlinearities.identity
        else:
            self.nonlinearity_out = nonlinearity_out

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps

        # Input dimensionality is the output dimensionality of the input layer
        (num_batch, _, num_inputs) = self.input_layer.get_output_shape()

        # Initialize parameters using the supplied args
        self.W_in_to_ingate = self.create_param(W_in_to_ingate, (num_inputs, num_units), name="W_in_to_ingate")

        self.W_hid_to_ingate = self.create_param(W_hid_to_ingate, (num_units, num_units), name="W_hid_to_ingate")

        self.b_ingate = self.create_param(b_ingate, (num_units,), name="b_ingate")

        self.W_in_to_forgetgate = self.create_param(
            W_in_to_forgetgate, (num_inputs, num_units), name="W_in_to_forgetgate"
        )

        self.W_hid_to_forgetgate = self.create_param(
            W_hid_to_forgetgate, (num_units, num_units), name="W_hid_to_forgetgate"
        )

        self.b_forgetgate = self.create_param(b_forgetgate, (num_units,), name="b_forgetgate")

        self.W_in_to_cell = self.create_param(W_in_to_cell, (num_inputs, num_units), name="W_in_to_cell")

        self.W_hid_to_cell = self.create_param(W_hid_to_cell, (num_units, num_units), name="W_hid_to_cell")

        self.b_cell = self.create_param(b_cell, (num_units,), name="b_cell")

        self.W_in_to_outgate = self.create_param(W_in_to_outgate, (num_inputs, num_units), name="W_in_to_outgate")

        self.W_hid_to_outgate = self.create_param(W_hid_to_outgate, (num_units, num_units), name="W_hid_to_outgate")

        self.b_outgate = self.create_param(b_outgate, (num_units,), name="b_outgate")

        # Stack input to gate weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        self.W_in_to_gates = utils.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell, self.W_in_to_outgate], axis=1
        )

        # Same for hidden to gate weight matrices
        self.W_hid_to_gates = utils.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell, self.W_hid_to_outgate], axis=1
        )

        # Stack gate biases into a (4*num_units) vector
        self.b_gates = utils.concatenate([self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate], axis=0)

        # Initialize peephole (cell to gate) connections.  These are
        # elementwise products with the cell state, so they are represented as
        # vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.create_param(W_cell_to_ingate, (num_units,), name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.create_param(
                W_cell_to_forgetgate, (num_units,), name="W_cell_to_forgetgate"
            )

            self.W_cell_to_outgate = self.create_param(W_cell_to_outgate, (num_units,), name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        self.cell_init = self.create_param(cell_init, (num_batch, num_units), name="cell_init")
        self.hid_init = self.create_param(hid_init, (num_batch, num_units), name="hid_init")
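
The constructor stacks the four per-gate input weight matrices into one (num_inputs, 4*num_units) matrix so a single dot product computes all gate pre-activations at once; the recurrent step then slices that result back out per gate (as slice_w does in Example No. 3). A minimal numpy sketch of the stack-then-slice convention, with hypothetical sizes:

import numpy as np

num_inputs, num_units, n_batch = 10, 5, 3
W_i, W_f, W_c, W_o = (np.random.randn(num_inputs, num_units) for _ in range(4))
W_in_to_gates = np.concatenate([W_i, W_f, W_c, W_o], axis=1)   # (num_inputs, 4*num_units)

x = np.random.randn(n_batch, num_inputs)
gates = x @ W_in_to_gates                                       # one dot covers all four gates

def slice_w(g, n):                                              # recover gate n's pre-activation
    return g[:, n * num_units:(n + 1) * num_units]

assert np.allclose(slice_w(gates, 0), x @ W_i)                  # ingate
assert np.allclose(slice_w(gates, 3), x @ W_o)                  # outgate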
Example No. 5
    def __init__(self,
                 input_layer,
                 num_units,
                 W_in_to_ingate=init.Normal(0.1),
                 W_hid_to_ingate=init.Normal(0.1),
                 W_cell_to_ingate=init.Normal(0.1),
                 b_ingate=init.Normal(0.1),
                 nonlinearity_ingate=nonlinearities.sigmoid,
                 W_in_to_forgetgate=init.Normal(0.1),
                 W_hid_to_forgetgate=init.Normal(0.1),
                 W_cell_to_forgetgate=init.Normal(0.1),
                 b_forgetgate=init.Normal(0.1),
                 nonlinearity_forgetgate=nonlinearities.sigmoid,
                 W_in_to_cell=init.Normal(0.1),
                 W_hid_to_cell=init.Normal(0.1),
                 b_cell=init.Normal(0.1),
                 nonlinearity_cell=nonlinearities.tanh,
                 W_in_to_outgate=init.Normal(0.1),
                 W_hid_to_outgate=init.Normal(0.1),
                 W_cell_to_outgate=init.Normal(0.1),
                 b_outgate=init.Normal(0.1),
                 nonlinearity_outgate=nonlinearities.sigmoid,
                 nonlinearity_out=nonlinearities.tanh,
                 cell_init=init.Constant(0.),
                 hid_init=init.Constant(0.),
                 backwards=False,
                 learn_init=False,
                 peepholes=True,
                 gradient_steps=-1):
        r'''
        Initialize an LSTM layer.  For details on what the parameters mean, see
        (7-11) from [#graves2014generating]_.

        :parameters:
            - input_layer : layers.Layer
                Input to this recurrent layer
            - num_units : int
                Number of hidden units
            - W_in_to_ingate : function or np.ndarray or theano.shared
                :math:`W_{xi}`
            - W_hid_to_ingate : function or np.ndarray or theano.shared
                :math:`W_{hi}`
            - W_cell_to_ingate : function or np.ndarray or theano.shared
                :math:`W_{ci}`
            - b_ingate : function or np.ndarray or theano.shared
                :math:`b_i`
            - nonlinearity_ingate : function
                :math:`\sigma`
            - W_in_to_forgetgate : function or np.ndarray or theano.shared
                :math:`W_{xf}`
            - W_hid_to_forgetgate : function or np.ndarray or theano.shared
                :math:`W_{hf}`
            - W_cell_to_forgetgate : function or np.ndarray or theano.shared
                :math:`W_{cf}`
            - b_forgetgate : function or np.ndarray or theano.shared
                :math:`b_f`
            - nonlinearity_forgetgate : function
                :math:`\sigma`
            - W_in_to_cell : function or np.ndarray or theano.shared
                :math:`W_{xc}`
            - W_hid_to_cell : function or np.ndarray or theano.shared
                :math:`W_{hc}`
            - b_cell : function or np.ndarray or theano.shared
                :math:`b_c`
            - nonlinearity_cell : function
                :math:`\tanh`
            - W_in_to_outgate : function or np.ndarray or theano.shared
                :math:`W_{xo}`
            - W_hid_to_outgate : function or np.ndarray or theano.shared
                :math:`W_{ho}`
            - W_cell_to_outgate : function or np.ndarray or theano.shared
                :math:`W_{co}`
            - b_outgate : function or np.ndarray or theano.shared
                :math:`b_o`
            - nonlinearity_outgate : function
                :math:`\sigma`
            - nonlinearity_out : function
                :math:`\tanh`
            - cell_init : function or np.ndarray or theano.shared
                :math:`c_0`
            - hid_init : function or np.ndarray or theano.shared
                :math:`h_0`
            - backwards : boolean
                If True, process the sequence backwards and then reverse the
                output again such that the output from the layer is always
                from x_1 to x_n.
            - learn_init : boolean
                If True, initial hidden values are learned
            - peepholes : boolean
                If True, the LSTM uses peephole connections.
                When False, W_cell_to_ingate, W_cell_to_forgetgate and
                W_cell_to_outgate are ignored.
            - gradient_steps : int
                Number of timesteps to include in the backpropagated gradient.
                If -1, backpropagate through the entire sequence.
        '''

        # Initialize parent layer
        super(LSTMLayer, self).__init__(input_layer)

        # For any of the nonlinearities, if None is supplied, use identity
        if nonlinearity_ingate is None:
            self.nonlinearity_ingate = nonlinearities.identity
        else:
            self.nonlinearity_ingate = nonlinearity_ingate

        if nonlinearity_forgetgate is None:
            self.nonlinearity_forgetgate = nonlinearities.identity
        else:
            self.nonlinearity_forgetgate = nonlinearity_forgetgate

        if nonlinearity_cell is None:
            self.nonlinearity_cell = nonlinearities.identity
        else:
            self.nonlinearity_cell = nonlinearity_cell

        if nonlinearity_outgate is None:
            self.nonlinearity_outgate = nonlinearities.identity
        else:
            self.nonlinearity_outgate = nonlinearity_outgate

        if nonlinearity_out is None:
            self.nonlinearity_out = nonlinearities.identity
        else:
            self.nonlinearity_out = nonlinearity_out

        self.learn_init = learn_init
        self.num_units = num_units
        self.backwards = backwards
        self.peepholes = peepholes
        self.gradient_steps = gradient_steps

        # Input dimensionality is the output dimensionality of the input layer
        (num_batch, _, num_inputs) = self.input_layer.get_output_shape()

        # Initialize parameters using the supplied args
        self.W_in_to_ingate = self.create_param(W_in_to_ingate,
                                                (num_inputs, num_units),
                                                name="W_in_to_ingate")

        self.W_hid_to_ingate = self.create_param(W_hid_to_ingate,
                                                 (num_units, num_units),
                                                 name="W_hid_to_ingate")

        self.b_ingate = self.create_param(b_ingate, (num_units,),
                                          name="b_ingate")

        self.W_in_to_forgetgate = self.create_param(W_in_to_forgetgate,
                                                    (num_inputs, num_units),
                                                    name="W_in_to_forgetgate")

        self.W_hid_to_forgetgate = self.create_param(
            W_hid_to_forgetgate, (num_units, num_units),
            name="W_hid_to_forgetgate")

        self.b_forgetgate = self.create_param(b_forgetgate, (num_units, ),
                                              name="b_forgetgate")

        self.W_in_to_cell = self.create_param(W_in_to_cell,
                                              (num_inputs, num_units),
                                              name="W_in_to_cell")

        self.W_hid_to_cell = self.create_param(W_hid_to_cell,
                                               (num_units, num_units),
                                               name="W_hid_to_cell")

        self.b_cell = self.create_param(b_cell, (num_units, ), name="b_cell")

        self.W_in_to_outgate = self.create_param(W_in_to_outgate,
                                                 (num_inputs, num_units),
                                                 name="W_in_to_outgate")

        self.W_hid_to_outgate = self.create_param(W_hid_to_outgate,
                                                  (num_units, num_units),
                                                  name="W_hid_to_outgate")

        self.b_outgate = self.create_param(b_outgate, (num_units, ),
                                           name="b_outgate")

        # Stack input to gate weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        self.W_in_to_gates = utils.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden to gate weight matrices
        self.W_hid_to_gates = utils.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack gate biases into a (4*num_units) vector
        self.b_gates = utils.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],
            axis=0)

        # Initialize peephole (cell to gate) connections.  These are
        # elementwise products with the cell state, so they are represented as
        # vectors.
        if self.peepholes:
            self.W_cell_to_ingate = self.create_param(W_cell_to_ingate,
                                                      (num_units,),
                                                      name="W_cell_to_ingate")

            self.W_cell_to_forgetgate = self.create_param(
                W_cell_to_forgetgate, (num_units,),
                name="W_cell_to_forgetgate")

            self.W_cell_to_outgate = self.create_param(
                W_cell_to_outgate, (num_units,), name="W_cell_to_outgate")

        # Setup initial values for the cell and the hidden units
        self.cell_init = self.create_param(cell_init, (num_batch, num_units),
                                           name="cell_init")
        self.hid_init = self.create_param(hid_init, (num_batch, num_units),
                                          name="hid_init")