Esempio n. 1
def test_unroll_scan():
    """Exercise ``lasagne.utils.unroll_scan`` with one and two outputs.

    The first recurrence multiplies a running value (seeded with 1) by the
    scalar ``a`` at each of ``k`` steps, and the compiled function is
    expected to yield ``[10, 100]`` for ``a = 10``.  The second recurrence
    carries a product and a quotient side by side and returns both output
    sequences.
    """
    from lasagne.utils import unroll_scan
    k = 2
    a = T.scalar("a")

    # Single-output recurrence: prior_result * a, unrolled for k steps.
    result = unroll_scan(fn=lambda step, prior_result, a: prior_result * a,
                         sequences=T.arange(k),
                         outputs_info=[1.],
                         non_sequences=[a],
                         n_steps=k)
    final_result = result[-1]
    power = theano.function(inputs=[a], outputs=final_result)

    assert np.all(power(10) == [10, 100])

    b = T.scalar("b")

    def mul_div(step, previous_mul, previous_div, mul, div):
        # Two carried values: a running product and a running quotient.
        return previous_mul * mul, previous_div / div

    # Two-output recurrence: both output sequences are handed to Theano.
    result = unroll_scan(fn=mul_div,
                         sequences=T.arange(k),
                         outputs_info=[1., 1.],
                         non_sequences=[a, b],
                         n_steps=k)
    power = theano.function(inputs=[a, b], outputs=result)
    assert np.allclose(power(10, 10), [[10, 100], [.1, .01]])
Esempio n. 2
def test_unroll_scan():
    """Check ``unroll_scan`` on a one-output and a two-output recurrence.

    With a base of 10 and two unrolled steps, the multiplicative recurrence
    must produce ``[10, 100]``; the combined multiply/divide recurrence must
    produce ``[[10, 100], [.1, .01]]``.
    """
    from lasagne.utils import unroll_scan

    num_steps = 2
    base = T.scalar("a")

    def multiply(step, accumulated, factor):
        # The carried value is scaled by the non-sequence at every step.
        return accumulated * factor

    outputs = unroll_scan(
        fn=multiply,
        sequences=T.arange(num_steps),
        outputs_info=[1.],
        non_sequences=[base],
        n_steps=num_steps)
    last_output = outputs[-1]
    compiled_power = theano.function(inputs=[base], outputs=last_output)

    assert np.all(compiled_power(10) == [10, 100])

    divisor = T.scalar("b")

    def mul_div(step, previous_mul, previous_div, mul, div):
        # Advance the product and the quotient in lockstep.
        product = previous_mul * mul
        quotient = previous_div / div
        return product, quotient

    outputs = unroll_scan(
        fn=mul_div,
        sequences=T.arange(num_steps),
        outputs_info=[1., 1.],
        non_sequences=[base, divisor],
        n_steps=num_steps)
    compiled_mul_div = theano.function(
        inputs=[base, divisor], outputs=outputs)
    assert np.allclose(compiled_mul_div(10, 10), [[10, 100], [.1, .01]])
Esempio n. 3
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output given its symbolic inputs.

        The gate pre-activations are the sum of a second-order term
        ``a_g * (x W_in) * (h W_hid)``, a gated input-to-hidden term
        ``b_g_in_to_hid * (x W_in)``, a gated hidden-to-hidden term
        ``b_g_hid_to_hid * (h W_hid)`` and the bias; see ``step`` below.

        NOTE(review): several expressions of this method were truncated in
        the text it was recovered from.  The stacking, dot-product and scan
        calls were restored following the stock Lasagne ``LSTMLayer``
        (including ``self.gradient_steps``); the batch-normalisation guard
        assumes an attribute ``self.bn`` -- confirm against the constructor.

        Parameters
        ----------
        inputs : list of theano.TensorType
            ``inputs[0]`` is the layer input.  The mask, the initial hidden
            state and the initial cell state are read from
            ``inputs[self.mask_incoming_index]``,
            ``inputs[self.hid_init_incoming_index]`` and
            ``inputs[self.cell_init_incoming_index]`` whenever the
            corresponding index is positive.

        Returns
        -------
        layer_output : theano.TensorType
            The hidden state of the last scan step when
            ``self.only_return_final`` is set; otherwise the whole hidden
            sequence, dimshuffled back to batch-major order and reversed
            along the time axis when ``self.backwards`` is set.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Optional inputs are only present when their index is positive
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        num_batch = input.shape[1]

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,
            self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,
            self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],
            axis=0)

        # Stack second order term gating weights into a (4*num_units) vector
        a_g_stacked = T.concatenate([
            self.a_g_ingate, self.a_g_forgetgate, self.a_g_cell,
            self.a_g_outgate], axis=0)

        # Stack input-to-hidden gating biases into a (4*num_units) vector
        b_g_in_to_hid_stacked = T.concatenate([
            self.b_g_in_to_hid_ingate, self.b_g_in_to_hid_forgetgate,
            self.b_g_in_to_hid_cell, self.b_g_in_to_hid_outgate
        ], axis=0)

        # Stack hidden-to-hidden gating biases into a (4*num_units) vector
        b_g_hid_to_hid_stacked = T.concatenate([
            self.b_g_hid_to_hid_ingate, self.b_g_hid_to_hid_forgetgate,
            self.b_g_hid_to_hid_cell, self.b_g_hid_to_hid_outgate
        ], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).  The bias is not added
            # here: it is added to the gates inside `step`.
            input = T.dot(input, W_in_stacked)

            # NOTE(review): guard and call reconstructed; `self.bn` is
            # assumed to be falsy or an object exposing `get_output_for`
            # that expects batch-major input -- confirm.
            if self.bn:
                print('Using batch normalization')
                input = self.bn.get_output_for(input.dimshuffle(1, 0, 2))
                input = input.dimshuffle(1, 0, 2)

        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            s = x[:, n * self.num_units:(n + 1) * self.num_units]
            if self.num_units == 1:
                s = T.addbroadcast(s, 1)  # Theano cannot infer this by itself
            return s

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, *args):

            # Compute the input-to-hidden activation
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked)

            # Compute the hidden-to-hidden activation
            hid_to_hid = T.dot(hid_previous, W_hid_stacked)

            # Compute the second order term
            second_order_term = a_g_stacked * input_n * hid_to_hid

            # Compute the first order input-to-hidden term
            f_o_in_to_hid = b_g_in_to_hid_stacked * input_n

            # Compute the first order hidden-to-hidden term
            f_o_hid_to_hid = b_g_hid_to_hid_stacked * hid_to_hid

            # Calculate gates pre-activations
            gates = (second_order_term + f_o_in_to_hid + f_o_hid_to_hid +
                     b_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)
            return [cell, hid]

        def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
            cell, hid = step(input_n, cell_previous, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix, the gating terms and the bias
        # are always used in step
        non_seqs = [
            W_hid_stacked, a_g_stacked, b_g_in_to_hid_stacked,
            b_g_hid_to_hid_stacked, b_stacked
        ]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            sequences=sequences,
                                            outputs_info=[cell_init, hid_init],
                                            go_backwards=self.backwards,
                                            non_sequences=non_seqs,
                                            n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 4
File: Progetto: lpigou/ijcv16
    def get_output_for(self, inputs, mask=None, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), the mask is read from
            `inputs[self.mask_incoming_index]`.  The `mask` should be supplied
            as a Theano variable denoting whether each time step in each
            sequence in the batch is part of the sequence or not.  `mask`
            should be a matrix of shape ``(n_batch, n_time_steps)`` where
            ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
            ``mask[i, j] = 0`` when ``j > (length of sequence i)``.  When the
            hidden state of this layer is to be pre-filled (i.e. was set to a
            :class:`Layer` instance) it is read from
            `inputs[self.hid_init_incoming_index]`.
        mask : theano.TensorType or None
            Mask to use when the layer has no mask input of its own; it is
            replaced by `inputs[self.mask_incoming_index]` whenever that
            index is positive.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask and the initial hidden state when supplied
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        if self.precompute_input:
            # Because the input is given for all time steps, we can precompute
            # the inputs to hidden before scanning. First we need to reshape
            # from (seq_len, batch_size, trailing dimensions...) to
            # (seq_len*batch_size, trailing dimensions...)
            # This strange use of a generator in a tuple was because
            # input.shape[2:] was raising a Theano error
            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
            input = T.reshape(input, (seq_len*num_batch,) + trailing_dims)
            input = helper.get_output(
                self.input_to_hidden, input, **kwargs)

            # Reshape back to (seq_len, batch_size, trailing dimensions...)
            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        # When we are not precomputing the input, we also need to pass the
        # input-to-hidden parameters to step
        if not self.precompute_input:
            non_seqs += helper.get_all_params(self.input_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)

            # If the dot product is precomputed then add it, otherwise
            # calculate the input_to_hidden values and add them
            if self.precompute_input:
                hid_pre += input_n
            else:
                hid_pre += helper.get_output(
                    self.input_to_hidden, input_n, **kwargs)

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            # Move the time axis first to match `input` and append a
            # broadcastable axis so the mask applies to every feature
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        # NOTE(review): the keyword arguments of both calls below were
        # truncated in the recovered text and follow the stock Lasagne
        # CustomRecurrentLayer; `self.gradient_steps` is assumed to exist.
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 5
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this TLSTM layer's output given its symbolic inputs.

        Besides the usual layer input, the layer consumes a per-step time
        input that drives two time gates (``tg1`` and ``tg2``) and adds a
        time-dependent term to the output gate; see ``step`` below.

        NOTE(review): several expressions of this method were truncated in
        the text it was recovered from.  The first element of each stacked
        list, the dot products and the scan calls were restored following
        the stock Lasagne ``LSTMLayer`` (including ``self.gradient_steps``);
        the batch-normalisation guard (``self.bn``) and the comparison used
        to constrain ``W_t1_to_tg1`` are assumptions -- confirm both.

        Parameters
        ----------
        inputs : list of theano.TensorType
            ``inputs[0]`` is the layer input and
            ``inputs[self.time_incoming_index]`` the time input.  The mask,
            the initial hidden state and the initial cell state are read
            from ``inputs`` at ``self.mask_incoming_index``,
            ``self.hid_init_incoming_index`` and
            ``self.cell_init_incoming_index`` whenever the corresponding
            index is positive.

        Returns
        -------
        layer_output : theano.TensorType
            The hidden state of the last scan step when
            ``self.only_return_final`` is set; otherwise the whole hidden
            sequence, dimshuffled back to batch-major order and reversed
            along the time axis when ``self.backwards`` is set.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Optional inputs are only present when their index is positive
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # TLSTM: Define new input
        time_mat = inputs[self.time_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # NOTE(review): guard and call reconstructed; `self.bn` is assumed
        # to be falsy or an object exposing `get_output_for` -- confirm.
        if self.bn:
            input = self.bn.get_output_for(input)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        # The time input becomes (n_time_steps, n_batch, 1)
        time_input = time_mat.dimshuffle(1, 0, 'x')
        num_batch = input.shape[1]

        # Stack input weight matrices into a (num_inputs, 5*num_units)
        # matrix, which speeds up computation.  The order (ingate, cell,
        # outgate, tg2, tg1) must match the slicing done in `step`.
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate,
             self.W_in_to_cell, self.W_in_to_outgate,
             self.W_x2_to_tg2, self.W_x1_to_tg1], axis=1)

        # Same for hidden weight matrices (ingate, cell, outgate)
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (5*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate,
             self.b_cell, self.b_outgate, self.b2_tg2, self.b1_tg1], axis=0)

        # W_t1_to_tg1_constraint < 0
        # NOTE(review): the comparison was truncated; `T.lt` is assumed so
        # that the weight is capped from above by `self.boundary`, which is
        # what the "< 0" remark above suggests -- confirm.
        W_t1_to_tg1_constraint = T.switch(
            T.lt(self.W_t1_to_tg1, self.boundary),
            self.W_t1_to_tg1, self.boundary)

        # Stack delta time weight matrices (outgate, tg2, tg1)
        W_t_stacked = T.concatenate(
            [self.W_to_to_outgate, self.W_t2_to_tg2, W_t1_to_tg1_constraint],
            axis=1)

        if self.precompute_input:
            # Because the inputs are given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            time_input = T.dot(time_input, W_t_stacked)
            input = T.dot(input, W_in_stacked) + b_stacked

        # We define a slicing function that extracts `stride` consecutive
        # num_units-wide chunks starting at chunk `start`
        def slice_w(x, start, stride=1):
            return x[:, start*self.num_units:(start+stride)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        # TODO (original author): insert Tm_n, weight_t_o_n into mask_n and
        # cell_previous
        def step(input_n, time_input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                time_input_n = T.dot(time_input_n, W_t_stacked)
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Time contributions to the output gate and the two time gates
            tm_wto_n = slice_w(time_input_n, 0)
            tm_w2_n = slice_w(time_input_n, 1)
            tm_w1_n = slice_w(time_input_n, 2)
            tm_w2_n = self.nonlinearity_inside_tg2(tm_w2_n)
            tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n)
            # Input contributions to the two time gates
            tm2_xwb_n = slice_w(input_n, 3)
            tm1_xwb_n = slice_w(input_n, 4)
            timegate2 = self.nonlinearity_outside_tg2(tm_w2_n + tm2_xwb_n)
            timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n)
            # Keep only the ingate, cell and outgate chunks of the input
            input_n = slice_w(input_n, 0, 3)

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            cell_input = slice_w(gates, 1)
            outgate = slice_w(gates, 2)
            outgate += tm_wto_n

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            cell_input = self.nonlinearity_cell(cell_input)
            # Compute new cell value: the carried cell uses time gate 2,
            # while the cell used for this step's output uses time gate 1
            cell = (1 - ingate)*cell_previous + ingate*timegate2*cell_input
            tilde_cell = ((1 - ingate*timegate1)*cell_previous +
                          ingate*timegate1*cell_input)

            if self.peepholes:
                outgate += tilde_cell*self.W_cell_to_outgate

            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(tilde_cell)
            return [cell, hid]

        def step_masked(input_n, time_input_n, mask_n,
                        cell_previous, hid_previous, *args):

            cell, hid = step(input_n, time_input_n,
                             cell_previous, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, time_input, mask]
            step_fun = step_masked
        else:
            sequences = [input, time_input]
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]

        # The "peephole" weight matrices are only used when self.peepholes=True
        # (this layer has no forget gate, hence only two of them)
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_outgate]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked, W_t_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 6
    def get_output_for(self, inputs, deterministic=False, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        NOTE(review): several expressions of this method were truncated in
        the text it was recovered from.  The dot products, the peephole
        list and the scan calls were restored following the stock Lasagne
        ``LSTMLayer`` (including ``self.gradient_steps``), the nested switch
        of the time gate follows the Phased LSTM openness formula, and the
        batch-normalisation guard assumes an attribute ``self.bn`` --
        confirm against the rest of the class.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable and
            `inputs[self.time_incoming_index]` the time input.  When this
            layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), the mask is read from
            `inputs[self.mask_incoming_index]`.  The `mask` should be supplied
            as a Theano variable denoting whether each time step in each
            sequence in the batch is part of the sequence or not.  `mask`
            should be a matrix of shape ``(n_batch, n_time_steps)`` where
            ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
            ``mask[i, j] = 0`` when ``j > (length of sequence i)``.  When the
            hidden state or the cell state of this layer is to be pre-filled
            (i.e. was set to a :class:`Layer` instance) it is read from
            `inputs[self.hid_init_incoming_index]` or
            `inputs[self.cell_init_incoming_index]` respectively.
        deterministic : bool
            When true the off-phase slope of the time gate is 0; otherwise
            it is ``self.off_alpha``.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Optional inputs are only present when their index is positive
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # PHASED LSTM: Define new input
        time_mat = inputs[self.time_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # NOTE(review): guard and call reconstructed; `self.bn` is assumed
        # to be falsy or an object exposing `get_output_for` -- confirm.
        if self.bn:
            input = self.bn.get_output_for(input)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        # PHASED LSTM: Rearrange the time input for the scan fn
        time_input = time_mat.dimshuffle(1, 0)
        num_batch = input.shape[1]

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=0)

        # PHASED LSTM: If test time, off-phase means really shut.
        if deterministic:
            print('Using true off for testing.')
            off_slope = 0.0
        else:
            print('Using {} for off_slope.'.format(self.off_alpha))
            off_slope = self.off_alpha

        # PHASED LSTM: Pregenerate broadcast vars.
        #   Same neuron in different batches has same shift and period.  Also,
        #   precalculate the middle (on_mid) and end (on_end) of the open-phase
        #   ramp.
        shift_broadcast = self.shift_timegate.dimshuffle(['x', 0])
        period_broadcast = T.abs_(self.period_timegate.dimshuffle(['x', 0]))
        on_ratio_broadcast = T.abs_(self.on_end_timegate.dimshuffle(['x', 0]))
        on_mid_broadcast = on_ratio_broadcast * 0.5 * period_broadcast
        on_end_broadcast = on_ratio_broadcast * period_broadcast

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # At each call to scan, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input; time_input_n is accepted
        # for a uniform signature but not used by the plain LSTM update
        def step(input_n, time_input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Mix in new stuff
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            return [cell, hid]

        # PHASED LSTM: The actual calculation of the time gate
        def calc_time_gate(time_input_n):
            # Broadcast the time across all units
            t_broadcast = time_input_n.dimshuffle([0, 'x'])
            # Get the time within the period
            in_cycle_time = T.mod(t_broadcast + shift_broadcast,
                                  period_broadcast)
            # Find the phase
            is_up_phase = T.le(in_cycle_time, on_mid_broadcast)
            is_down_phase = (T.gt(in_cycle_time, on_mid_broadcast) *
                             T.le(in_cycle_time, on_end_broadcast))
            # Set the mask: ramp up to 1 at on_mid, ramp back down to 0 at
            # on_end, and leak with slope off_slope for the rest of the period
            sleep_wake_mask = T.switch(
                is_up_phase, in_cycle_time/on_mid_broadcast,
                T.switch(
                    is_down_phase,
                    (on_end_broadcast-in_cycle_time)/on_mid_broadcast,
                    off_slope*(in_cycle_time/period_broadcast)))

            return sleep_wake_mask

        # PHASED LSTM: Mask the updates based on the time phase
        def step_masked(input_n, time_input_n, mask_n,
                        cell_previous, hid_previous, *args):
            cell, hid = step(input_n, time_input_n,
                             cell_previous, hid_previous, *args)

            # Get time gate openness
            sleep_wake_mask = calc_time_gate(time_input_n)

            # Sleep if off, otherwise stay a bit on
            cell = sleep_wake_mask*cell + (1.-sleep_wake_mask)*cell_previous
            hid = sleep_wake_mask*hid + (1.-sleep_wake_mask)*hid_previous

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
        else:
            # No mask supplied: use an all-ones mask so that the masked step
            # (which also applies the time gate) can always be used
            mask = T.ones_like(time_input).dimshuffle(0, 1, 'x')

        sequences = [input, time_input, mask]
        step_fun = step_masked

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weights and the time gate parameters are
        # always used in step
        non_seqs = [W_hid_stacked, self.period_timegate,
                    self.shift_timegate, self.on_end_timegate]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
    def get_output_for(self, inputs, **kwargs):
        """
        Compute the LSTM output with the cell state attached to it.

        Have to re-write LSTMLayer's output construction because we need
        cell_out, which is not stored in the original.

        Parameters
        ----------
        inputs : list of theano expressions
            ``inputs[0]`` is the layer input, given as
            ``(n_batch, n_time_steps, ...)``; trailing dimensions are
            flattened into a single feature axis. The mask, the initial
            hidden state and the initial cell state are read from the
            positions stored in ``self.mask_incoming_index``,
            ``self.hid_init_incoming_index`` and
            ``self.cell_init_incoming_index`` whenever those are > 0.
        **kwargs
            Accepted for interface compatibility; not used here.

        Returns
        -------
        theano expression
            Cell state and hidden state concatenated along the last axis
            (cell first): ``(n_batch, n_time_steps, 2*num_units)``, or
            ``(n_batch, 2*num_units)`` when ``self.only_return_final`` is set.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # Scan hands the step function one time slice at a time, so inside
        # step the (precomputed) input_n is (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            return [cell, hid]

        def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
            cell, hid = step(input_n, cell_previous, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            not_mask = 1 - mask_n
            cell = cell*mask_n + cell_previous*not_mask
            hid = hid*mask_n + hid_previous*not_mask

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if isinstance(self.cell_init, Layer):
            # Already taken from `inputs` at the top of this method
            pass
        elif isinstance(self.cell_init, T.TensorVariable):
            cell_init = self.cell_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if isinstance(self.hid_init, Layer):
            # Already taken from `inputs` at the top of this method
            pass
        elif isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            cell_out = cell_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)
            cell_out = cell_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]
                cell_out = cell_out[:, ::-1]

        # Join along the feature (last) axis. A hard-coded axis=2 is out of
        # range for the 2-D tensors produced by only_return_final.
        return T.concatenate([cell_out, hid_out], axis=hid_out.ndim - 1)
Esempio n. 8
    def get_sessions(self, 
                     environment = None,
                     session_length = 10,
                     batch_size = None,
                     initial_env_states = 'zeros',
                     initial_observations = 'zeros',
                     initial_state_variables = 'zeros',
                     # NOTE(review): the parameter list appears to be cut off here. The body
                     # reads `flags` and the description below mentions
                     # `additional_output_layers`, but neither is visible in the signature,
                     # and the closing `):` is missing.
        """returns history of agent interaction with environment for given number of turns:
            environment - an environment to interact with (BaseEnvironment instance)
            session_length - how many turns of interaction shall there be for each batch
            batch_size - [required parameter] amount of independed sessions [number or symbolic].Irrelevant if you manually set all initial_*.
            initial_<something> - initial values for all variables at 0-th time step
            Unless you are doing something nasty, initial policy (qvalues) and actions will not matter at all
            'zeros' default means filling variable with zeros
            Initial values are NOT included in history sequences
            additional_output_layers - any layers of a network which outputs need to be added to the outputs
            flags: optional flags to be sent to NN when calling get_output (e.g. deterministic = True)

            state_seq,observation_seq,hidden_seq,policy_seq,action_seq, [additional_output_0, additional_output_1]
            for environment state, observation, hidden state, agent policy and chosen actions respectively
            each of them having dimensions of [batch_i,seq_i,...]
            time synchronization policy:
                state_seq,observation_seq correspond to observation BASED ON WHICH agent generated hidden_seq,policy_seq,action_seq
        env = environment
        #assert that environment is None if and only if there are no observations
        assert (env is None) == (len(self.observation_layers) == 0)
        # Default initial values: zero tensors with batch_size rows, or whatever the
        # caller passed, normalised through check_list.
        # NOTE(review): the 'zeros' branches below are followed directly by the
        # check_list(...) / "= []" fallbacks with no visible `else:` - the branch
        # keywords seem to be missing from this copy.
        if env is not None:
            if initial_env_states == 'zeros':
                initial_env_states = [T.zeros([batch_size,size]) 
                                      for size in check_list(env.state_size)]
                initial_env_states = check_list(initial_env_states)

            if initial_observations == 'zeros':
                initial_observations = [T.zeros((batch_size,)+tuple(obs_layer.shape[1:])) 
                                        for obs_layer in self.observation_layers]
                initial_observations = check_list(initial_observations)
            initial_env_states = initial_observations = []
        if initial_state_variables == 'zeros':
            initial_state_variables = []
            for memory in self.state_variables:
                state_shape = lasagne.layers.get_output_shape(memory)[1:] #drom batch_i dimension
                initial_state = T.zeros((batch_size,)+tuple(state_shape))
                # NOTE(review): initial_state is built but no statement adding it to
                # initial_state_variables is visible - confirm an append was not lost.

        #recurrent step function
        #during SCAN, time synchronization is reverse: state_1 came after action_1 based on observation_0 from state_0
        def step(time_tick,*args):

            #slice previous: they contain 
            #[*env_states_if_any, *observations, *state_variables, *prev_actions, *prev_outputs, *rubbish]
            # we only need env state, prev observation and agent state to iterate on
            if env is not None:
                n_env_states = len(check_list(env.state_size))
                n_env_states = 0
            n_observations = len(self.observation_layers)
            n_memories = len(self.state_variables)
            env_states,observations,prev_agent_states = unpack_list(args,n_env_states,n_observations,n_memories)
            prev_states_dict = OrderedDict(zip(self.state_variables.keys(),prev_agent_states))
            # Ask the agent for its reaction to the previous observations, then (when
            # an environment is present) let the environment respond to the new actions.
            new_actions,new_agent_states,new_outputs = self.get_agent_reaction(prev_states_dict,observations,**flags)
            if env is not None: 
                new_env_states,new_observations = env.get_action_results(env_states,new_actions,time_tick)
                new_env_states = check_list(new_env_states)
                new_observations = check_list(new_observations)
                new_env_states = new_observations = []

            # Order here must match the order in which the history is unpacked below.
            return new_env_states + new_observations + new_agent_states + new_actions + new_outputs

        #main recurrent loop configuration
        # NOTE(review): the outputs_info expression ends in a line continuation whose
        # second half is not visible, and the unroll_scan / unpack_list calls and the
        # two list comprehensions further down are likewise missing their tails.
        outputs_info = initial_env_states+initial_observations + initial_state_variables+\
        time_ticks = T.arange(session_length)
        sequences = [time_ticks]
        history = unroll_scan(step,
            sequences = sequences,
            outputs_info = outputs_info,
            non_sequences = [],
            n_steps = session_length

        #for the record
        self.last_history = history
        #from [time,batch,...] to [batch,time,...]
        history = [ (var.swapaxes(1,0) if var.ndim >1 else var) for var in history]
        groups = unpack_list(history, 
        env_state_sequences, observation_sequences, agent_state_sequences,\
            action_sequences, output_sequences = groups
        agent_state_dict = OrderedDict(zip(self.state_variables.keys(),agent_state_sequences))
        #allign time axes: actions come AFTER states with the same index
        #add first env turn, crop to session length
        env_state_sequences = [
            for state_seq, initial_env_state in 
                zip(env_state_sequences, initial_env_states)

        # NOTE(review): the aligned observations are bound to `observation_seqs`, but
        # the return statement uses `observation_sequences` - confirm which is intended.
        observation_seqs = [
            for observation_seq, initial_observation in 
                zip(observation_sequences, initial_observations)
        return env_state_sequences, observation_sequences, agent_state_dict,action_sequences, output_sequences
Esempio n. 9
    def get_output_for(self, inputs, accumulate_updates="warn",recurrence_flags={}, **kwargs):
        # Unrolls self.get_one_step over time and returns an OrderedDict that maps
        # every state variable and tracked output to its sequence.
        #
        # Outline of what the visible code does:
        #   1. split `inputs` into (optional mask), provided initial states,
        #      non-sequences and sequences;
        #   2. build an initial value for every state (provided one, or zeros);
        #   3. call get_one_step once to obtain zero "fillers" shaped like the
        #      tracked outputs;
        #   4. iterate step / step_masked with unroll_scan or theano.scan;
        #   5. swap the time and batch axes back and shift delayed states by one tick.
        #
        # NOTE(review): `recurrence_flags={}` is a mutable default argument. The
        # visible code only unpacks it with ** and never mutates it.
        # NOTE(review): several statements in this copy are incomplete (missing
        # `else:`/`try:` lines, unterminated calls, lost format arguments); the
        # comments below flag the places where that matters for reading the logic.
        returns history of agent interaction with environment for given number of turns.

            inputs - [state init]  + [input_nonsequences] + [input_sequences]
                Each part is a list of theano expressions for layers in the order they were
                provided when creating this layer.
            recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports)
                e.g. {deterministic=True}
            [state_sequences] + [output sequences] - a list of all states and all outputs sequences
            Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...]
        n_states = len(self.state_variables)
        n_state_inits = len(self.state_init)
        n_input_nonseq = len(self.input_nonsequences)
        n_input_seq = len(self.input_sequences)
        n_outputs = len(self.tracked_outputs)

        #slice inputs

        # When a mask input exists it is the first element of `inputs`.
        if self.mask_input is not None:
            mask,inputs = inputs[0],inputs[1:]

        initial_states_provided, nonsequences, sequences = unpack_list(inputs, [n_state_inits, n_input_nonseq, n_input_seq])

        # infer batch size
        # NOTE(review): the raise below reads like the final `else:` branch (no explicit
        # batch_size and no inputs to infer it from); the `else:` line is not visible.
        if self.batch_size is not None:
            batch_size = self.batch_size
        elif len(inputs) != 0:
            batch_size = inputs[0].shape[0]
            raise ValueError("Need to set batch_size explicitly for recurrence")

        #here we create outputs_info for scan, basically initial values for states and outputs
        ## initial states that are given as input
        initial_states_provided = OrderedDict(list(zip(self.state_init, initial_states_provided)))

        def get_initial_state(layer, batch_size=batch_size):
            """Pick dedicated initial state or create zeros of appropriate shape and dtype
            :param layer: layer for new hidden state (key of self.state_variables)
            :param batch_size: symbolic batch_size
            # if we have a dedicated init, use it
            if layer in initial_states_provided:
                initial_state = initial_states_provided[layer]
            # otherwise initialize with zeros
                assert None not in layer.output_shape[1:],\
                    "Some of your state layers ({}) has undefined shape along non-batch dimension. (shape: {}) " \
                    "Therefore, it's initial value can't be inferred. Please set explicit initial value via state_init" \
                    "".format( or layer, layer.output_shape)

                dtype = get_layer_dtype(layer)
                initial_state = T.zeros((batch_size,) + tuple(layer.output_shape[1:]), dtype=dtype)
                #disable broadcasting along all axes (lasagne outputs are non-broadcastable)
                initial_state = T.unbroadcast(initial_state, *range(initial_state.ndim))

            return initial_state

        initial_states = list(map(get_initial_state, self.state_variables))

        # dummy initial values for tracked_outputs.
        # We need to provide them for step_masked to be able to backtrack to them. Also unroll scan requires them.
        # Initial shapes for outputs are inferred by calling get_one_step and taking shapes from it.
        # Theano optimizes shape computation without computing get_out_step outputs themselves
        # the resulting graph would be like (var1.shape[0],var1.shape[2]*3,10) so this operation is zero-cost.
        state_feed_dict = dict(zip(self.state_variables.keys(),initial_states))
        # Sequences are still [batch, time, ...] here, so seq[:,0] is the first tick.
        input_feed_dict = dict(zip(list(chain(self.input_nonsequences.keys(), self.input_sequences.keys())),
                                   list(chain(nonsequences,[seq[:,0] for seq in sequences]))))
        initial_output_fillers = self.get_one_step(state_feed_dict,input_feed_dict,**recurrence_flags)[1]
        # disable broadcasting of zeros_like(v) along all axes (since lasagne outputs are non-broadcastable)
        initial_output_fillers = [T.unbroadcast(T.zeros_like(v),*range(v.ndim))
                                  for v in initial_output_fillers]
        #/end of that nonsense

        #stack all initializers together
        outputs_info = initial_states + initial_output_fillers

        # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan
        sequences = [seq.swapaxes(1, 0) for seq in sequences]

        # recurrent step function
        def step(*args):

            # args arrive in scan order: sequence slices, previous states,
            # previous outputs, then non-sequences.
            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])
            # make dicts of prev_states and inputs
            prev_states_dict = OrderedDict(zip(list(self.state_variables.keys()), prev_states))

            input_layers = list(chain(self.input_nonsequences.keys(), self.input_sequences.keys()))
            assert len(input_layers) == len(nonsequences + sequence_slices)

            inputs_dict = OrderedDict(zip(input_layers, nonsequences + sequence_slices))

            # call one step recurrence
            new_states, new_outputs = self.get_one_step(prev_states_dict, inputs_dict, **recurrence_flags)

            #make sure new state variables are of exactly the same type as their initial value
            # NOTE(review): in the two loops below the cast is followed directly by a
            # raise, and the error messages have no visible .format(...) arguments;
            # presumably the cast was wrapped in a try/except that is missing - confirm.
            state_names = [ or str(layer) for layer in list(self.state_variables.keys())]
            for i in range(len(state_names)):
                    if self.force_cast_types:
                        new_states[i] = new_states[i].astype(prev_states[i].dtype)
                    new_states[i] = cast_to_type(new_states[i],get_type(prev_states[i]))
                    raise ValueError("Could not convert new state {}, of type {}, to it's previous/initial state type "
                                     "{}. Cast type manually or set force_cast_types=True on creation."

            #make sure output variables are of exactly the same type as their initial value
            output_names = [ or str(layer) for layer in self.tracked_outputs]
            for i in range(len(output_names)):
                    if self.force_cast_types:
                        new_outputs[i] = new_outputs[i].astype(prev_outputs[i].dtype)
                    new_outputs[i] = cast_to_type(new_outputs[i],get_type(prev_outputs[i]))
                    raise ValueError("Could not convert output of {}, of type {}, to it's previous/initial state type "
                                     "{}. Cast type manually or set force_cast_types=True on creation."

            return new_states + new_outputs

        ###handling mask_input###

        #a step function that utilizes a mask
        def step_masked(mask_t,*args):
            #unpack arrays
            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])

            #get regular step
            new_states_and_outputs = step(*args)
            old_states_and_outputs = prev_states+prev_outputs

            #if mask_t, return new ones, else return old ones
            def apply_mask(mask_t,new_state,old_state):
                assert new_state.ndim == old_state.ndim
                ndim = new_state.ndim
                #append dims to mask
                # (broadcastable 'x' axes are added so the mask lines up with the state rank)
                pattern = list(range(mask_t.ndim)) + ['x'] * (ndim - mask_t.ndim)

                return T.switch(mask_t.dimshuffle(pattern),
                                new_state, old_state)

            next_states_and_outputs = [apply_mask(mask_t,new_state,old_state)
                                       for new_state,old_state in zip(new_states_and_outputs,

            return next_states_and_outputs

        # With a mask, the (time-major) mask becomes the first scanned sequence so
        # that step_masked receives it as mask_t.
        # NOTE(review): `step_function = step` reads like the `else:` branch.
        if self.mask_input is not None:
            sequences = [mask.swapaxes(1, 0)]+sequences
            step_function = step_masked
            step_function = step

        #scan itself
        # NOTE(review): both the unroll_scan and theano.scan calls below are missing
        # their remaining arguments, as are the `else:` separating the two paths and
        # the body of the `accumulate_updates == False` check.
        if self.unroll_scan:
            # call scan itself
            history = unroll_scan(step_function,
            #if explicitly asked to reset updates, do so
            if accumulate_updates == False:

            history,updates = theano.scan(step_function,

            if accumulate_updates in (True,'warn'):
                self.updates += updates
            else:#replace updates
                self.updates = updates

            #check if user received last updates
            if not self._updates_received and accumulate_updates=='warn':
                warn("You called get_output from recurrence several times without gathering the updates.\n"
                     "(A) If you wanted to get two outputs from recurrence, use NOT\n"
                     ">>>out1 = get_output(rec[layer1])\n"
                     ">>>out2 = get_output(rec[layer2])\n"
                     "but instead:\n"
                     ">>>out1,out2 = get_output((rec[layer1],rec[layer2])) #or rec[layer1,layer2].\n"
                     "(B) If you want to run recurrence several times and accumulate updates from all runs,"
                     "use get_output(...,accumulate_updates=True) to silence the warning.\n"
                     "(C) If you want to get rid of old updates, use get_output(...,accumulate_updates=False)\n"

            if len(self.updates) !=0:
                warn("Recurrent loop without unroll_scan got nonempty random state updates list. That happened"
                     " because there is some source of randomness (e.g. dropout) inside recurrent step graph."
                     " To compile such graph, one must either call .get_automatic_updates() right after .get_output"
                     " and pass these updates to a function when compiling theano.function.",verbosity_level=2)

        # reordering from [time,batch,...] to [batch,time,...]
        history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in check_list(history)]

        assert len(history) == n_states+n_outputs

        state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs])

        # handle delayed_states
        # selectively shift state sequences by 1 tick into the past, padding with their initialisations
        for i in range(len(state_seqs)):
            if list(self.state_variables.keys())[i] in self.delayed_states:
                state_seq = state_seqs[i]
                state_init = initial_states[i]
                # prepend the initial state along the time axis and drop the last tick
                state_seq = T.concatenate([insert_dim(state_init, 1), state_seq[:, :-1]], axis=1)
                state_seqs[i] = state_seq

        #keys corresponding to output sequences. Note that we do not use self.keys() to correctly
        # handle cases where some variable is present in both state_variables and tracked_outputs
        output_keys = list(self.state_variables.keys()) + list(self.tracked_outputs)
        output_values = state_seqs + output_seqs
        assert len(output_keys) == len(output_values)
        return OrderedDict(zip(output_keys,output_values))
Esempio n. 10
    def get_output_for(self, inputs, **kwargs):
        # GRU-style decoder step with attention over an encoder output. For every
        # time step the recurrence yields [hid, probs]; the method returns the second
        # scan output (`out[1]`).
        #
        # NOTE(review): the description below presents inputs[1] as a mask, but the
        # code binds inputs[1] to `encoder_output` and never reads a mask.
        # NOTE(review): several statements in this copy are incomplete (the `step`
        # signature, the stripped dot-product calls, the scan calls and some `else:`
        # lines); see the notes at those places.
        Compute this layer's output function given a symbolic input variable

        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.

        layer_output : theano.TensorType
            Symbolic output variable.
        # Retrieve the layer input
        input = inputs[0]
        encoder_output = inputs[1]

        # Treat all dimensions after the second as flattened feature dimensions
        # (encoder_output is flattened only when `input` itself has more than 3 dims)
        if input.ndim > 3:
            input = T.flatten(input, 3)
            encoder_output = T.flatten(encoder_output, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        encoder_output = encoder_output.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1

        # Stack gate biases into a (3*num_units) vector
        b_stacked = T.concatenate([self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)

        if self.precompute_input:
            # precompute_input inputs*W. W_in is (n_features, 3*num_units).
            # input is then (n_batch, n_time_steps, 3*num_units).
            input =, W_in_stacked) + b_stacked

        # At each call to scan, input_n will be (n_time_steps, 3*num_units).
        # We define a slicing function that extract the input to each GRU gate
        # (gate n occupies columns [n*num_units, (n+1)*num_units) of the stacked matrix)
        def slice_w(x, n):
            return x[:, n * self.num_units : (n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input__n is the n'th vector of the input
        # NOTE(review): the parameter list of `step` is missing. The body reads
        # input_n, hid_previous, encoder_output, W_att_enc, W_att_dec, W_att_out,
        # W_out and b_out - confirm their order against the scan call.
        def step(
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input =, W_hid_stacked)

            if self.grad_clipping is not False:
                input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping)
                hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                input_n =, W_in_stacked) + b_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid
            if self.grad_clipping is not False:
                hidden_update = theano.gradient.grad_clip(hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hid(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update

            # # Add the attention
            # (the attention term is computed from the previous hidden state, not the new one)
            hid += self.attention(encoder_output, hid_previous, W_att_enc, W_att_dec, W_att_out)

            # Compute the probas
            probs = T.nnet.softmax(, W_out) + b_out)
            return [hid, probs]

        sequences = [input]
        step_fun = step

        # NOTE(review): the "Dot against a 1s vector" assignment reads like the
        # `else:` branch for a non-TensorVariable hid_init; the `else:` is not visible.
        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init =, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [encoder_output, W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [
        # theano.scan only allows for positional arguments, so when
        # self.precompute_input is True, we need to supply fake placeholder
        # arguments for the input weights and biases.
            non_seqs += [(), (), self.W_att_enc, self.W_att_dec, self.W_att_out, self.W_out, self.b_out]

        # NOTE(review): elsewhere in this file unroll_scan's result is unpacked straight
        # into the output sequences, while theano.scan's is unpacked as
        # (outputs, updates). The same `out, _ =` pattern is applied to both calls here -
        # confirm that `out[1]` below refers to the same thing on both paths.
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            out, _ = unroll_scan(
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            out, _ = theano.scan(
                outputs_info=[hid_init, None],

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        # hid_out = hid_out[0].dimshuffle(1, 0, 2)
        # No dimshuffle is applied to the returned value; it keeps scan's axis order.
        s_out = out[1]

        # # if scan is backward reverse the output
        # if self.backwards:
        #     out = out[:, ::-1, :]

        return s_out
Esempio n. 11
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        NOTE(review): the listing this method was recovered from had lost
        text (docstring delimiters, every ``T.dot(<first operand>`` prefix,
        all ``else:`` lines and most ``scan`` keyword arguments).  The
        missing pieces were re-derived from the shape comments and from
        locals that were otherwise unused; each re-derived spot is marked
        below and should be confirmed against the upstream source.

        Parameters
        ----------
        inputs : list of theano.TensorType
            ``inputs[0]`` is the symbolic input variable.  ``inputs[1]``,
            when present, is the mask: a Theano variable denoting whether
            each time step in each sequence in the batch is part of the
            sequence or not.  If it is absent, it is assumed that all
            sequences are of the same length and a mask of ones is used.
            If not all sequences are of the same length, then it must be
            supplied as a matrix of shape ``(n_batch, n_time_steps)`` where
            ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
            ``mask[i, j] = 0`` when ``j > (length of sequence i)``.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable: ``hid_out`` when
            ``self.return_decodehid`` is true, otherwise
            ``weighted_hidden_out``.  Both are dimshuffled so that batch is
            the leading axis and the decode step the second.

        Notes
        -----
        As a side effect the trimmed ``hid_out``, ``cell_out``,
        ``weighted_hidden_out`` and ``alpha`` sequences are stored on
        ``self``.
        """
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            # NOTE(review): the third reshape dimension was missing from the
            # listing; the product of the trailing dims is the reconstruction.
            input = input.reshape((input.shape[0], input.shape[1],
                                   T.prod(input.shape[2:])))
        num_batch = input.shape[0]
        encode_seqlen = input.shape[1]

        if mask is None:
            mask = T.ones((num_batch, encode_seqlen), dtype='float32')

        # gates is (BS, 4*num_units) inside step; this helper extracts the
        # pre-activation slice belonging to gate number n.
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function.
        # There are no scanned sequences, so scan passes the previous outputs
        # first and then the non-sequences, in the order they are listed in
        # outputs_info / non_seqs below.
        def step(cell_previous, hid_previous, alpha_prev, weighted_hidden_prev,
                 input, mask, hUa, W_align, v_align,
                 W_hid_stacked, W_weightedhid_stacked, W_cell_to_ingate,
                 W_cell_to_forgetgate, W_cell_to_outgate,
                 b_stacked, *args):
            # Shapes, as annotated by the original author:
            # W_align:  (num_units, aln_num_units)
            # U_align:  (num_feats, aln_num_units)
            # v_align:  (aln_num_units, 1)
            # hUa:      (BS, Seqlen, aln_num_units)
            # hid:      (BS, num_units_dec)
            # input:    (BS, Seqlen, num_inputs)

            # Compute the (unnormalized) attention vector.
            # NOTE(review): first dot operand reconstructed as hid_previous
            # from the (BS, aln_num_units) shape comment.
            sWa = T.dot(hid_previous, W_align)       # (BS, aln_num_units)
            sWa = sWa.dimshuffle(0, 'x', 1)   # (BS, 1, aln_num_units)
            align_act = sWa + hUa
            tanh_sWahUa = self.nonlinearity_align(align_act)
            #                                 # (BS, seqlen, num_units_aln)

            # NOTE(review): first dot operand reconstructed as tanh_sWahUa,
            # the only (BS, Seqlen, aln_num_units) value in scope.
            a = T.dot(tanh_sWahUa, v_align)  # (BS, Seqlen, 1)
            a = T.reshape(a, (a.shape[0], a.shape[1]))  # (BS, Seqlen)

            # Push the scores of masked-out positions to a large negative
            # value so that they receive (almost) no attention weight.
            a = a*mask - (1-mask)*10000

            alpha = self.attention_softmax_function(a)

            # input: (BS, Seqlen, num_units)
            weighted_hidden = input * alpha.dimshuffle(0, 1, 'x')
            weighted_hidden = T.sum(weighted_hidden, axis=1)  # sum seqlen out

            # Calculate gates pre-activations and slice

            # (BS, dec_hid) x (dec_hid, dec_hid)
            # NOTE(review): first dot operand reconstructed as hid_previous.
            gates = T.dot(hid_previous, W_hid_stacked) + b_stacked
            # (BS, enc_hid) x (enc_hid, dec_hid)
            # NOTE(review): first dot operand reconstructed as the freshly
            # computed weighted_hidden; confirm it was not
            # weighted_hidden_prev upstream.
            gates += T.dot(weighted_hidden, W_weightedhid_stacked)

            # Clip gradients
            if self.grad_clipping is not False:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*W_cell_to_ingate
                forgetgate += cell_previous*W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                # As in the original, the output-gate peephole is added after
                # the output-gate nonlinearity has been applied.
                outgate += cell*W_cell_to_outgate

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity_out(cell)

            return [cell, hid, alpha, weighted_hidden]

        sequences = []
        step_fun = step

        ones = T.ones((num_batch, 1))
        if isinstance(self.cell_init, T.TensorVariable):
            cell_init = self.cell_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        alpha_init = T.zeros((num_batch, encode_seqlen))
        weighted_hidden_init = T.zeros((num_batch, self.num_inputs))

        # The input projection for the alignment model does not depend on the
        # decoder state, so it is computed once outside the recurrence.
        # NOTE(review): first dot operand reconstructed as input.
        hUa = T.dot(input, self.U_align)   # (num_batch, seq_len, num_units_aln)

        non_seqs = [input, mask, hUa, self.W_align, self.v_align,
                    self.W_hid_stacked, self.W_weightedhid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            # NOTE(review): only the first entry survived in the listing; the
            # other two follow from the step() signature.
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]
        # theano.scan only allows for positional arguments, so when
        # self.peepholes is False, we need to supply fake placeholder arguments
        # for the three peephole matrices.
        else:
            non_seqs += [(), (), ()]

        non_seqs += [self.b_stacked]

        outputs_info = [cell_init, hid_init, alpha_init, weighted_hidden_init]
        n_steps = self.n_decodesteps + self.decode_pre_steps

        # NOTE(review): only outputs_info and n_steps survived in both scan
        # calls.  fn / sequences / non_sequences are restored from the locals
        # prepared above; any further keyword arguments the original passed
        # (e.g. go_backwards, truncate_gradient, strict) could not be
        # recovered.  go_backwards would be a no-op here because there are no
        # scanned sequences.
        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out, alpha_out, weighted_hidden_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=outputs_info,
                non_sequences=non_seqs,
                n_steps=n_steps)
        else:
            # Scan op repeatedly applies the step function; [0] keeps the
            # outputs and drops the updates dictionary.
            cell_out, hid_out, alpha_out, weighted_hidden_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=outputs_info,
                non_sequences=non_seqs,
                n_steps=n_steps)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features))

        # alpha_out - (n_decodesteps, bs, seqlen)
        # hid_out   - (n_decode_steps, bs, num_units)
        # mask      - (BS, encode_seqlen)
        cell_out = cell_out.dimshuffle(1, 0, 2)
        hid_out = hid_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, num_units)
        mask = mask.dimshuffle(0, 'x', 1)
        alpha_out = alpha_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, encode_seqlen)

        weighted_hidden_out = weighted_hidden_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]
            cell_out = cell_out[:, ::-1]
            weighted_hidden_out = weighted_hidden_out[:, ::-1]
            alpha_out = alpha_out[:, ::-1]

        # Drop the warm-up steps from every output sequence.  Each sequence
        # is trimmed from itself: the original assigned cell_out and
        # alpha_out from the already-trimmed hid_out, which overwrote both
        # with doubly-trimmed hidden states.
        if self.decode_pre_steps > 0:
            hid_out = hid_out[:, self.decode_pre_steps:]
            cell_out = cell_out[:, self.decode_pre_steps:]
            weighted_hidden_out = weighted_hidden_out[:, self.decode_pre_steps:]
            alpha_out = alpha_out[:, self.decode_pre_steps:]

        self.hid_out = hid_out
        self.cell_out = cell_out
        self.weighted_hidden_out = weighted_hidden_out
        self.alpha = alpha_out

        if self.return_decodehid:
            return hid_out
        else:
            return weighted_hidden_out
Esempio n. 12
    def get_output_for(self, inputs, deterministic=False, **kwargs):
        """Build the symbolic output of this LSTM-style layer for ``inputs``.

        ``inputs[0]`` is the layer input; the mask, initial hidden state and
        initial cell state are read from ``inputs`` at
        ``self.mask_incoming_index``, ``self.hid_init_incoming_index`` and
        ``self.cell_init_incoming_index`` whenever those indices are > 0.
        ``deterministic`` switches the dropout on the cell input off.

        Returns ``hid_out`` when ``self.only_return_hidden`` is set;
        otherwise (presumably, the ``else:`` is missing) the concatenation of
        ``hid_out`` and ``cell_out`` along the last axis.

        NOTE(review): this listing has lost text during extraction -- every
        ``T.dot(<first operand>`` prefix, all ``else:`` lines and the tails
        of several multi-line calls are gone -- so the body is not valid
        Python as shown.  The code below is left untouched; restore it from
        the upstream source before use.
        """
        input = inputs[0]

        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        hid_init = None
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        cell_init = None
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Collapse any trailing dimensions into a single feature dimension
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Swap the first two axes so that scan iterates over the (former)
        # second axis; the unpacking below names the result
        # (seq_len, num_batch, features).
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        #### input ####
        # NOTE(review): the tails of the three concatenate calls below are
        # missing from the listing.
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,

        #### hidden ####
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,

        #### bias ####
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],

        #### weight noise ####
        # NOTE(review): noise is added only when ``deterministic is True``,
        # which looks inverted relative to the dropout handling below --
        # confirm against upstream.  The remaining arguments of both
        # self.normal calls are missing.
        if self.weight_noise > 0 and deterministic is True:
            W_in_stacked += self.normal(size=W_in_stacked.shape,
            W_hid_stacked += self.normal(size=W_hid_stacked.shape,

        # Extract the slice of the stacked gate pre-activations that belongs
        # to gate number n.
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        #### set dropout mask ####
        # NOTE(review): an ``else:`` is presumably missing between the two
        # assignments; as written using_dropout would always end up True.
        if deterministic:
            self.using_dropout = False
            self.using_dropout = True
        # One mask of shape (num_batch, num_units) is drawn here, outside
        # step, and the same cell_mask is reused by every call to step.
        # NOTE(review): the tail of the self.binomial call is missing.
        cell_mask = self.binomial((num_batch, self.num_units),
                                  p=T.constant(1) - self.p,

        # NOTE(review): first operand of the dot product is missing.
        input =, W_in_stacked) + b_stacked

        # Single recurrence step: returns the new [cell, hid].
        def step(input_n, cell_previous, hid_previous, *args):
            # NOTE(review): first operand of the dot product is missing.
            gates = input_n +, W_hid_stacked)

            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # NOTE(review): the tails of the grad_clip calls are missing.
            if self.grad_clipping:
                ingate = theano.gradient.grad_clip(ingate, -self.grad_clipping,
                forgetgate = theano.gradient.grad_clip(forgetgate,
                cell_input = theano.gradient.grad_clip(cell_input,

            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Dropout on the cell input: divide by the retain probability and
            # multiply by cell_mask.
            # NOTE(review): an ``else:`` is presumably missing before
            # ``one = ...``; as written the scaling and masking always run.
            if self.using_dropout == False or self.p == 0:
                cell_input = cell_input
                one = T.constant(1)
                retain_prob = one - self.p
                cell_input /= retain_prob
                cell_input = cell_input * cell_mask

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate

            # NOTE(review): the tails of the grad_clip calls are missing.
            if self.grad_clipping:
                outgate = theano.gradient.grad_clip(outgate,
                cell = theano.gradient.grad_clip(cell, -self.grad_clipping,
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)
            # Projection through self.W_hid_projection.
            # NOTE(review): first operand of the dot product is missing.
            hid =, self.W_hid_projection)
            return [cell, hid]

        # Masked variant of step: where mask_n is 0 the previous cell and
        # hidden state are carried over unchanged.
        def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
            cell, hid = step(input_n, cell_previous, hid_previous, *args)

            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            return [cell, hid]

        # NOTE(review): an ``else:`` is presumably missing before the second
        # ``sequences = ...`` assignment.
        if mask is not None:
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            sequences = input
            step_fun = step

        # When an initial state is not supplied by a Layer it is combined
        # with ``ones`` (num_batch, 1) in a dot product.
        # NOTE(review): first operand of both dot products is missing.
        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            cell_init =, self.cell_init)
        if not isinstance(self.hid_init, Layer):
            hid_init =, self.hid_init)

        # NOTE(review): the contents of the two list literals below are
        # truncated in the listing.
        non_seqs = [cell_mask, W_hid_stacked]
        non_seqs += [
        if self.peepholes:
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,

        # NOTE(review): most arguments of both scan calls, and the ``else:``
        # separating them, are missing.
        if self.unroll_scan:
            input_shape = self.input_shapes[0]
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            outputs_info=[cell_init, hid_init],
            cell_out, hid_out = theano.scan(
                outputs_info=[cell_init, hid_init],

        # NOTE(review): an ``else:`` is presumably missing after the
        # ``hid_out[-1]`` line (same for cell_out further down).
        if self.only_return_final:
            hid_out = hid_out[-1]
            hid_out = hid_out.dimshuffle(1, 0, 2)

            if self.backwards:
                hid_out = hid_out[:, ::-1]

        if self.only_return_hidden:
            return hid_out
            if self.only_return_final:
                cell_out = cell_out[-1]
                cell_out = cell_out.dimshuffle(1, 0, 2)

                if self.backwards:
                    cell_out = cell_out[:, ::-1]

            return T.concatenate([hid_out, cell_out], axis=-1)
Esempio n. 13
    def get_output_for(self, inputs, **kwargs):
        """Build the symbolic output of this LSTM layer with an extra
        recurrent "avg" state, and return the ``avg`` sequence.

        ``inputs[0]`` is the layer input; the mask, initial hidden state and
        initial cell state are read from ``inputs`` at the corresponding
        ``self.*_incoming_index`` whenever that index is > 0.  How ``avg`` is
        computed at each step is selected by ``self.model_type`` (values 1-7
        are handled in ``step``).

        NOTE(review): this listing has lost text during extraction -- every
        ``T.dot(<first operand>`` prefix, all ``else:`` lines and the tails
        of several multi-line calls are gone -- so the body is not valid
        Python as shown.  The code below is left untouched; restore it from
        the upstream source before use.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        # NOTE(review): avg_init is never read from ``inputs`` in the visible
        # code, even though self.avg_init is tested against Layer below.
        avg_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        # NOTE(review): the tails of the three concatenate calls below are
        # missing from the listing.
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            # NOTE(review): first operand of the dot product is missing.
            input =, W_in_stacked) + b_stacked

        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, avg_previous, *args):
            # Keep the step input as received under the name x; several
            # model_type branches below gate it directly.
            # NOTE(review): when self.precompute_input is true, x is the
            # precomputed projection rather than the raw input -- confirm
            # that this is intended.
            x = input_n
            if not self.precompute_input:
                # NOTE(review): first operand of the dot product is missing.
                input_n =, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            # NOTE(review): first operand of the dot product is missing.
            gates = input_n +, W_hid_stacked)

            # Clip gradients
            # NOTE(review): the tail of the grad_clip call is missing.
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)

            # Pre-activation of the "avg" gate, then one branch per
            # model_type.
            # NOTE(review): every dot-product expression from here to the
            # end of step has lost its ``T.dot(<operand>`` prefix.
            # NOTE(review): there is no final ``else``; for a model_type
            # outside 1-7 ``avg`` would be unbound at the return.
            avg_input =, self.W_avg1) +,
                                                      self.W_avg2) + self.b_avg
            if self.model_type == 1:
                # sigmoid gate applied to the step input
                avg = x * nonlinearities.sigmoid(avg_input)
            elif self.model_type == 6:
                # tanh of the pre-activation, no gating
                avg = nonlinearities.tanh(avg_input)
            elif self.model_type == 7:
                # pre-activation recomputed with a product ("*") between the
                # two terms instead of a sum
                avg_input =, self.W_avg1) *
                    hid, self.W_avg2) + self.b_avg
                avg = x * nonlinearities.sigmoid(avg_input)
            elif self.model_type == 2:
                # sigmoid gate applied to the hidden state
                avg = hid * nonlinearities.sigmoid(avg_input)
            elif self.model_type == 3:
                # two gates: g1 on the previous avg, g2 on the step input
                avg_input2 =, self.W_avg12) +
                    hid, self.W_avg22) + self.b_avg2
                g1 = nonlinearities.sigmoid(avg_input)
                g2 = nonlinearities.sigmoid(avg_input2)
                avg = avg_previous * g1 + x * g2
            elif self.model_type == 4:
                # as type 3, but both pre-activations also take a term in
                # avg_previous (W_avg3 / W_avg32)
                avg_input =
                    x, self.W_avg1) +, self.W_avg2) +
                        avg_previous, self.W_avg3) + self.b_avg
                avg_input2 =
                    x, self.W_avg12) +, self.W_avg22) +
                        avg_previous, self.W_avg32) + self.b_avg2
                g1 = nonlinearities.sigmoid(avg_input)
                g2 = nonlinearities.sigmoid(avg_input2)
                avg = avg_previous * g1 + x * g2
            elif self.model_type == 5:
                # two gates: g1 on the step input, g2 on the hidden state
                avg_input2 =, self.W_avg12) +
                    hid, self.W_avg22) + self.b_avg2
                g1 = nonlinearities.sigmoid(avg_input)
                g2 = nonlinearities.sigmoid(avg_input2)
                avg = x * g1
                havg = hid * g2
                avg = avg + havg
            return [cell, hid, avg]

        def step_masked(input_n, mask_n, cell_previous, hid_previous,
                        avg_previous, *args):
            cell, hid, avg = step(input_n, cell_previous, hid_previous,
                                  avg_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            avg = T.switch(mask_n, avg, avg_previous)

            return [cell, hid, avg]

        # NOTE(review): an ``else:`` is presumably missing before the second
        # ``sequences = ...`` assignment.
        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            sequences = input
            step_fun = step

        # NOTE(review): first operand of the three dot products below is
        # missing.
        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, lasagne.layers.Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init =, self.cell_init)

        if not isinstance(self.hid_init, lasagne.layers.Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init =, self.hid_init)

        if not isinstance(self.avg_init, lasagne.layers.Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            avg_init =, self.avg_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked, self.W_avg1, self.W_avg2, self.b_avg]
        # The "peephole" weight matrices are only used when self.peepholes=True
        # NOTE(review): the end of this list literal is missing.
        if self.peepholes:
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        # The second set of avg-gate parameters is only needed by the
        # model types that compute avg_input2.
        if self.model_type == 3 or self.model_type == 5:
            non_seqs += [self.W_avg12, self.W_avg22, self.b_avg2]

        # NOTE(review): the end of this list literal is missing.
        if self.model_type == 4:
            non_seqs += [
                self.W_avg12, self.W_avg22, self.b_avg2, self.W_avg3,

        # NOTE(review): most arguments of both scan calls, and the ``else:``
        # separating them, are missing.
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out, avg_out = unroll_scan(
                outputs_info=[cell_init, hid_init, avg_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out, avg_out = theano.scan(
                outputs_info=[cell_init, hid_init, avg_init],

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        # NOTE(review): an ``else:`` is presumably missing after the
        # ``avg_out[-1]`` line.
        if self.only_return_final:
            avg_out = avg_out[-1]
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            avg_out = avg_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                avg_out = avg_out[:, ::-1]

        return avg_out
    def get_output_for(self, inputs, **kwargs):
        """Build the symbolic output of this GRU-style layer, whose gates
        also receive a per-step ``context`` input.

        ``inputs[0]`` is the layer input, ``inputs[1]`` the mask (when more
        than one input is given) and ``inputs[2]`` the context.  Returns
        ``hid_out`` dimshuffled back so that batch is the leading axis,
        reversed along the time axis when ``self.backwards`` is set.

        NOTE(review): this listing has lost text during extraction -- every
        ``T.dot(<first operand>`` prefix, all ``else:`` lines and the tails
        of several multi-line calls are gone -- so the body is not valid
        Python as shown.  The code below is left untouched; restore it from
        the upstream source before use.
        """

        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # NOTE(review): inputs[2] is read unconditionally, so at least three
        # inputs are required and the ``len(inputs) > 1`` test above can
        # never select None without this line raising IndexError.
        context = inputs[2]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # The context is dimshuffled the same way as the input so both can
        # be scanned together.
        context = context.dimshuffle(1, 0, 2)

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        # NOTE(review): the tails of the four concatenate calls below are
        # missing from the listing.
        W_in_stacked = T.concatenate([
            self.W_in_to_resetgate, self.W_in_to_updategate,

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_resetgate, self.W_hid_to_updategate,

        # Same for the context ("tid") weight matrices
        W_tid_stacked = T.concatenate([
            self.W_tid_to_resetgate, self.W_tid_to_updategate,

        # Stack gate biases into a (3*num_units) vector
        b_stacked = T.concatenate(
            [self.b_resetgate, self.b_updategate, self.b_hidden_update],

        if self.precompute_input:
            # precompute_input inputs*W. W_in is (n_features, 3*num_units).
            # input is then (n_batch, n_time_steps, 3*num_units).
            # NOTE(review): first operand of both dot products is missing.
            # NOTE(review): b_stacked is added to both input and context, so
            # the bias enters each gate twice -- confirm this is intended.
            input =, W_in_stacked) + b_stacked
            context =, W_tid_stacked) + b_stacked

        # At each call to scan, input_n will be (n_time_steps, 3*num_units).
        # We define a slicing function that extract the input to each GRU gate
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, context_n, hid_previous, W_hid_stacked, W_in_stacked,
                 W_tid_stacked, b_stacked):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            # NOTE(review): first operand of the dot product is missing.
            hid_input =, W_hid_stacked)

            # NOTE(review): the tails of the three grad_clip calls are
            # missing.
            if self.grad_clipping is not False:
                input_n = theano.gradient.grad_clip(input_n,
                context_n = theano.gradient.grad_clip(context_n,
                hid_input = theano.gradient.grad_clip(hid_input,

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                # NOTE(review): first operand of both dot products is missing.
                input_n =, W_in_stacked) + b_stacked
                context_n =, W_tid_stacked) + b_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0) + slice_w(
                context_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1) + slice_w(
                context_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}), plus the
            # context contribution (which is not multiplied by the reset gate)
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update_tid = slice_w(context_n, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid + hidden_update_tid
            if self.grad_clipping is not False:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hid(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update
            return hid

        def step_masked(input_n, mask_n, context_n, hid_previous,
                        W_hid_stacked, W_in_stacked, W_tid_stacked, b_stacked):

            hid = step(input_n, context_n, hid_previous, W_hid_stacked,
                       W_in_stacked, W_tid_stacked, b_stacked)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            not_mask = 1 - mask_n
            hid = hid * mask_n + hid_previous * not_mask

            return hid

        # NOTE(review): an ``else:`` is presumably missing before the second
        # ``sequences = ...`` assignment.  The order of ``sequences`` must
        # match the leading parameters of the chosen step function.
        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask, context]
            step_fun = step_masked
            sequences = [input, context]
            step_fun = step

        # NOTE(review): an ``else:`` and the start of the dot expression are
        # missing below.
        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init =, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        # NOTE(review): the ``else:`` that guards the placeholder line is
        # missing.
        if not self.precompute_input:
            non_seqs += [W_in_stacked, W_tid_stacked, b_stacked]
        # theano.scan only allows for positional arguments, so when
        # self.precompute_input is True, we need to supply fake placeholder
        # arguments for the input weights and biases.
            non_seqs += [(), (), ()]

        # NOTE(review): most arguments of both scan calls, and the ``else:``
        # separating them, are missing.
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=step_fun,
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(fn=step_fun,

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        hid_out = hid_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1, :]

        return hid_out
Esempio n. 15
    def get_sessions(self, 
                     session_length = 10,
                     batch_size = None,
                     initial_env_state = 'zeros',initial_observation = 'zeros',initial_hidden = 'zeros',
                     additional_output_layers = [],
        """returns history of agent interaction with environment for given number of turns:
            environment - an environment to interact with (BaseEnvironment instance)
            session_length - how many turns of interaction shall there be for each batch
            batch_size - [required parameter] amount of independed sessions [number or symbolic].Irrelevant if you manually set all initial_*.
            initial_<something> - initial values for all variables at 0-th time step
            Unless you are doing something nasty, initial policy (qvalues) and actions will not matter at all
            'zeros' default means filling variable with zeros
            Initial values are NOT included in history sequences
            additional_output_layers - any layers of a network which outputs need to be added to the outputs
            flags: optional flags to be sent to NN when calling get_output (e.g. deterministic = True)

            state_seq,observation_seq,hidden_seq,policy_seq,action_seq, [additional_output_0, additional_output_1]
            for environment state, observation, hidden state, agent policy and chosen actions respectively
            each of them having dimensions of [batch_i,seq_i,...]
            time synchronization policy:
                state_seq,observation_seq correspond to observation BASED ON WHICH agent generated hidden_seq,policy_seq,action_seq
        # NOTE(review): this listing has lost text during extraction.  The
        # end of the signature (the docstring documents ``environment`` and
        # ``flags``, neither of which appears in it), the closing docstring
        # quotes, and the tails of several calls below are missing, so the
        # method is not valid Python as shown.  Code is left untouched.
        # NOTE(review): ``additional_output_layers = []`` is a mutable
        # default; the visible code only iterates over it, but confirm that
        # nothing in the missing lines mutates it.
        env = environment
        # The string 'zeros' is a sentinel: each initial value left at
        # 'zeros' is replaced by a zero tensor with batch_size rows.
        # NOTE(review): if a symbolic tensor is passed instead, the ``==``
        # comparison may not behave like a plain bool -- confirm.
        if initial_env_state == 'zeros':
            initial_env_state = T.zeros([batch_size,env.state_size])
        if initial_observation == 'zeros':
            initial_observation = T.zeros([batch_size,env.observation_size])
        if initial_hidden == 'zeros':
            # [1:] drops the first entry of the memory layer's output shape
            # (presumably the batch dimension) before prepending batch_size
            memory_state_shape = lasagne.layers.get_output_shape(self.memory)[1:]
            initial_hidden = T.zeros((batch_size,)+tuple(memory_state_shape))
        time_ticks = T.arange(session_length)


        #recurrent step function
        #during SCAN, time synchronization is reverse: state_1 came after action_1 based on observation_0 from state_0
        # NOTE(review): the end of the step signature and the tail of the
        # get_agent_reaction call are missing.
        def step(time_tick,env_state,observation,last_hidden,last_policy,last_action,

            hidden,policy,action,additional_outputs = self.get_agent_reaction(last_hidden,observation,
            new_env_state,new_observation = env.get_action_results(env_state,action,time_tick)

            return [new_env_state,new_observation,hidden,policy,action]+additional_outputs

        #main recurrent loop configuration
        # Outputs with a None entry in outputs_info (policy, action and every
        # additional output) get no initial value.
        additional_init = [None for i in additional_output_layers]
        outputs_info = [initial_env_state,initial_observation,initial_hidden,None,None] + additional_init
        # NOTE(review): the closing parenthesis of this call is missing.
        history = unroll_scan(step,
            sequences = [time_ticks],
            outputs_info = outputs_info,
            non_sequences = [],
            n_steps = session_length

        # The raw history is kept on the instance before the axes are swapped
        self.history = history
        #from [time,batch,...] to [batch,time,...]
        history = [ (var.swapaxes(1,0) if var.ndim >1 else var) for var in history]
        #what's inside:
        state_seq,observation_seq,hidden_seq,policy_seq,action_seq = history[:5]
        additional_output_sequences = tuple(history[5:])
        #align time axes: actions come AFTER states with the same index
        #add first env turn, crop to session length
        # NOTE(review): the second element and the axis argument of both
        # concatenate calls are missing.
        state_seq = T.concatenate([insert_dim(initial_env_state,1),
        observation_seq = T.concatenate([insert_dim(initial_observation,1),
        return (state_seq,observation_seq,hidden_seq,policy_seq,action_seq) + additional_output_sequences
Esempio n. 16
    def get_output_for(self, inputs, mask=None, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When the cell state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the cell state
            to prefill with. When both the cell state and the hidden state are
            being pre-filled `inputs[-2]` is the hidden state, while
            `inputs[-1]` is the cell state.
        mask : theano.TensorType or None
            Mask used when the layer has no mask input of its own; it is
            overridden by ``inputs[self.mask_incoming_index]`` whenever
            ``self.mask_incoming_index > 0``.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # NOTE(review): the ``T.dot`` calls, ``else`` branches and the scan
        # argument lists in this method were restored from a truncated
        # listing, following the upstream Lasagne LSTMLayer -- confirm against
        # the original project before relying on them.

        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask and the initial states when they are supplied
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # Scan iterates over the leading (time) axis, so at each step input_n
        # is one time slice of shape (n_batch, 4*num_units).
        # We define a slicing function that extracts the input to each gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th time slice of the input
        def step(input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            # The output-gate peephole looks at the *new* cell value
            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            return [cell, hid]

        def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
            cell, hid = step(input_n, cell_previous, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            # NOTE(review): only the first entry survived in the listing; the
            # other two are the peephole weights that ``step`` reads.
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function; [0] drops the updates dictionary
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # NOTE(review): the call tails (``**kwargs)``, grad_clip bounds), the
        # ``else`` branches, the ``hid_init`` expression and the scan argument
        # lists were restored from a truncated listing, following the upstream
        # Lasagne CustomRecurrentLayer -- confirm against the original project.

        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        if self.precompute_input:
            # Because the input is given for all time steps, we can precompute
            # the inputs to hidden before scanning. First we need to reshape
            # from (seq_len, batch_size, trailing dimensions...) to
            # (seq_len*batch_size, trailing dimensions...)
            # This strange use of a generator in a tuple was because
            # input.shape[2:] was raising a Theano error
            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
            input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
            input = helper.get_output(self.input_to_hidden, input, **kwargs)

            # Reshape back to (seq_len, batch_size, trailing dimensions...)
            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # We will always pass the hidden-to-hidden and output-to-hidden layer
        # params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        non_seqs += helper.get_all_params(self.output_to_hidden)
        # When we are not precomputing the input, we also need to pass the
        # input-to-hidden parameters to step
        if not self.precompute_input:
            non_seqs += helper.get_all_params(self.input_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous,
                                        **kwargs)

            # Add the contribution of the output-to-hidden network, which is
            # also fed the previous hidden state
            hid_pre += helper.get_output(self.output_to_hidden, hid_previous,
                                         **kwargs)

            # If the dot product is precomputed then add it, otherwise
            # calculate the input_to_hidden values and add them
            if self.precompute_input:
                hid_pre += input_n
            else:
                hid_pre += helper.get_output(self.input_to_hidden, input_n,
                                             **kwargs)

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(hid_pre,
                                                    -self.grad_clipping,
                                                    self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = hid * mask_n + hid_previous * (1 - mask_n)
            return [hid_out]

        if mask is not None:
            # Put the time axis first and add a broadcastable feature axis
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        # When hid_init is provided as a TensorVariable, use it as-is
        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=step_fun,
                                  sequences=sequences,
                                  outputs_info=[hid_init],
                                  go_backwards=self.backwards,
                                  non_sequences=non_seqs,
                                  n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function; [0] drops the updates dictionary
            hid_out = theano.scan(fn=step_fun,
                                  sequences=sequences,
                                  go_backwards=self.backwards,
                                  outputs_info=[hid_init],
                                  non_sequences=non_seqs,
                                  truncate_gradient=self.gradient_steps,
                                  strict=True)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        This LSTM variant has no output gate: the hidden state is simply
        ``self.nonlinearity(cell)``.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # NOTE(review): the concatenation lists, ``T.dot`` calls, the
        # ``step``/``step_masked`` signatures, ``else`` branches and the scan
        # argument lists were restored from a truncated listing, following the
        # Lasagne LSTMLayer this code derives from -- confirm against the
        # original project.

        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack the three input weight matrices (ingate, forgetgate, cell)
        # into a single matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [
                self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell
            ], axis=1)

        # Same for hidden weight matrices
        # NOTE(review): the third entry was cut off in the listing; it is
        # assumed to be W_hid_to_cell by symmetry with W_in_stacked.
        W_hid_stacked = T.concatenate(
            [
                self.W_hid_to_ingate, self.W_hid_to_forgetgate,
                self.W_hid_to_cell
            ], axis=1)

        # Stack biases into a single vector
        b_stacked = T.concatenate(
            [
                self.b_ingate, self.b_forgetgate, self.b_cell
            ], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the inputs dot weight matrices before scanning.
            input = T.dot(input, W_in_stacked) + b_stacked

        # Scan iterates over the leading (time) axis, so at each step input_n
        # is one time slice with the batch on its first axis.
        # We define a slicing function that extracts the input to each gate
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th time slice of the input. The trailing parameters
        # mirror, in order, the entries of ``non_seqs`` built below.
        def step(input_n, cell_previous, hid_previous, W_hid_stacked,
                 W_cell_to_ingate, W_cell_to_forgetgate,
                 W_in_stacked, b_stacked):

            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping is not False:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values (no output gate here)
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * W_cell_to_ingate
                forgetgate += cell_previous * W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            # Compute new hidden unit activation; without an output gate the
            # hidden state is just the squashed cell value
            hid = self.nonlinearity(cell)
            return [cell, hid]

        def step_masked(input_n, mask_n, cell_previous, hid_previous,
                        W_hid_stacked, W_cell_to_ingate, W_cell_to_forgetgate,
                        W_in_stacked, b_stacked):

            cell, hid = step(input_n, cell_previous, hid_previous,
                             W_hid_stacked, W_cell_to_ingate,
                             W_cell_to_forgetgate, W_in_stacked, b_stacked)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            not_mask = 1 - mask_n
            cell = cell * mask_n + cell_previous * not_mask
            hid = hid * mask_n + hid_previous * not_mask

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if isinstance(self.cell_init, T.TensorVariable):
            cell_init = self.cell_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init = T.dot(ones, self.cell_init)

        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate
            ]
        # theano.scan only allows for positional arguments, so when
        # self.peepholes is False, we need to supply fake placeholder arguments
        # for the two peephole matrices.
        else:
            non_seqs += [(), ()]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]
        # As above, when we aren't providing these parameters, we need to
        # supply placeholder arguments
        else:
            non_seqs += [(), ()]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            sequences=sequences,
                                            outputs_info=[cell_init, hid_init],
                                            go_backwards=self.backwards,
                                            non_sequences=non_seqs,
                                            n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function; [0] drops the updates dictionary
            cell_out, hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[cell_init, hid_init],
                go_backwards=self.backwards,
                truncate_gradient=self.gradient_steps,
                non_sequences=non_seqs,
                strict=True)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        hid_out = hid_out.dimshuffle(1, 0, 2)
        cell_out = cell_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]
            cell_out = cell_out[:, ::-1]

        # Only the hidden sequence is returned; cell_out is not exposed
        return hid_out
Esempio n. 19
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` is the symbolic input variable that the recurrence
            iterates over and `inputs[1]` is the encoder output, which is
            passed unchanged to every step and consumed by
            ``self.attention``.

        Returns
        -------
        layer_output : theano.TensorType
            The second output of the step function (the softmax
            probabilities), taken as ``out[1]`` from the scan result.  It is
            not dimshuffled back to batch-major order.
        """
        # NOTE(review): the concatenation lists, ``T.dot`` calls, grad_clip
        # bounds, ``else`` branches and the scan argument lists were restored
        # from a truncated listing, following the Lasagne GRULayer this code
        # derives from -- confirm against the original project.

        # Retrieve the layer input
        input = inputs[0]
        encoder_output = inputs[1]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)
            encoder_output = T.flatten(encoder_output, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        encoder_output = encoder_output.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate([
            self.W_in_to_resetgate, self.W_in_to_updategate,
            self.W_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_resetgate, self.W_hid_to_updategate,
            self.W_hid_to_hidden_update], axis=1)

        # Stack gate biases into a (3*num_units) vector
        b_stacked = T.concatenate(
            [self.b_resetgate, self.b_updategate, self.b_hidden_update],
            axis=0)

        if self.precompute_input:
            # precompute_input inputs*W. W_in is (n_features, 3*num_units).
            # input is then (n_time_steps, n_batch, 3*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # Scan iterates over the leading (time) axis, so at each step input_n
        # is one time slice of shape (n_batch, 3*num_units).
        # We define a slicing function that extracts the input to each gate
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th time slice of the input. The trailing parameters
        # mirror, in order, the entries of ``non_seqs`` built below.
        def step(input_n, hid_previous, encoder_output, W_hid_stacked,
                 W_in_stacked, b_stacked, W_att_enc, W_att_dec, W_att_out,
                 W_out, b_out):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, W_hid_stacked)

            if self.grad_clipping is not False:
                input_n = theano.gradient.grad_clip(input_n,
                                                    -self.grad_clipping,
                                                    self.grad_clipping)
                hid_input = theano.gradient.grad_clip(hid_input,
                                                      -self.grad_clipping,
                                                      self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid
            if self.grad_clipping is not False:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hid(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update

            # Add the attention term computed from the encoder output and the
            # previous hidden state
            hid += self.attention(encoder_output, hid_previous, W_att_enc,
                                  W_att_dec, W_att_out)

            # Compute the probas
            probs = T.nnet.softmax(T.dot(hid, W_out) + b_out)
            return [hid, probs]

        sequences = [input]
        step_fun = step

        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init)

        # The encoder output and the hidden-to-hidden weight matrix are always
        # used in step
        non_seqs = [encoder_output, W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked, self.W_att_enc,
                         self.W_att_dec, self.W_att_out, self.W_out,
                         self.b_out]
        # theano.scan only allows for positional arguments, so when
        # self.precompute_input is True, we need to supply fake placeholder
        # arguments for the input weights and biases.
        else:
            non_seqs += [(), (), self.W_att_enc, self.W_att_dec,
                         self.W_att_out, self.W_out, self.b_out]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            # NOTE(review): the arguments of this call were missing from the
            # listing and mirror the theano.scan call below. Unlike scan,
            # unroll_scan returns only the list of outputs, so this unpacking
            # binds ``out`` to the hidden sequence rather than to the full
            # output list -- verify this branch is actually usable.
            out, _ = unroll_scan(fn=step_fun,
                                 sequences=sequences,
                                 outputs_info=[hid_init, None],
                                 go_backwards=self.backwards,
                                 non_sequences=non_seqs,
                                 n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function. ``out`` is the list
            # [hidden sequence, probability sequence]; ``_`` is the updates
            # dictionary.
            out, _ = theano.scan(fn=step_fun,
                                 sequences=sequences,
                                 outputs_info=[hid_init, None],
                                 go_backwards=self.backwards,
                                 truncate_gradient=self.gradient_steps,
                                 non_sequences=non_seqs,
                                 strict=True)

        # Return the probabilities (second step output) as produced by scan,
        # i.e. still time-major and not reversed for backward scans
        s_out = out[1]

        return s_out
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            ``inputs[0]`` and ``inputs[1]`` are the sequences fed to the
            prior ("p") and posterior ("q") networks respectively,
            ``inputs[2]`` is the initial latent state ``z`` and ``inputs[3]``
            is the initial mean.  When this layer has a mask input (i.e.
            ``self.mask_incoming_index > 0``), the mask is read from
            ``inputs[self.mask_incoming_index]``.  The `mask` should be a
            matrix of shape ``(n_batch, n_time_steps)`` where
            ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
            ``mask[i, j] = 0`` when ``j > (length of sequence i)``.

        Returns
        -------
        tuple of theano.TensorType
            ``(z, mu_p, logvar_p, mu_q, logvar_q)``: the sampled latent
            sequence and the prior / posterior means and log-variances.
        """
        # NOTE(review): the tail of the ``normal(...)`` call, the
        # ``outputs_info`` lists, ``else`` branches and the scan argument
        # lists were restored from a truncated listing, following the Lasagne
        # recurrent layers this code derives from -- confirm against the
        # original project.

        # Retrieve the layer inputs
        input_p = inputs[0]
        input_q = inputs[1]
        z_init = inputs[2]
        mu_p_init = inputs[3]

        # Retrieve the mask when it is supplied
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input_p = input_p.dimshuffle(1, 0, 2)
        input_q = input_q.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input_p.shape

        # Helper kept from the original code; not used below
        def log_sum_exp(a, b):
            return T.log(T.exp(a) + T.exp(b))

        # Create single recurrent computation step function
        # *_n arguments are the n'th time slice of the matching sequence
        def step(noise_n, input_p_n, input_q_n, z_previous, mu_p_previous,
                 logvar_p_previous, mu_q_previous, logvar_q_previous, *args):

            # Prior network: conditioned on its input and the previous z
            input_p = T.concatenate([input_p_n, z_previous], axis=1)
            mu_p = get_output(self.mu_p_mlp, input_p)

            logvar_p = get_output(self.logvar_p_mlp, input_p)
            # Numerical stability: keep the variance at least self.cons
            logvar_p = T.log(T.exp(logvar_p) + self.cons)

            # Posterior network: conditioned on its input and the previous z
            q_input_n = T.concatenate([input_q_n, z_previous], axis=1)

            mu_q = get_output(self.q_mu_mlp, q_input_n)
            if self.use_mu_residual_q:
                # Runs while the graph is being built, not per time step
                print("Using residuals for mean_q")
                mu_q += mu_p

            logvar_q = get_output(self.q_logvar_mlp, q_input_n)

            # Numerical stability
            logvar_q = T.log(T.exp(logvar_q) + self.cons)

            # Sample z from the posterior via the reparameterisation
            # mu + sigma * noise
            z_n = mu_q + T.exp(0.5 * logvar_q) * noise_n

            return z_n, mu_p, logvar_p, mu_q, logvar_q

        def step_masked(noise_n, input_p_n, input_q_n, mask_n, z_previous,
                        mu_p_previous, logvar_p_previous, mu_q_previous,
                        logvar_q_previous, *args):

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.

            z_n, mu_p, logvar_p, mu_q, logvar_q = step(
                noise_n, input_p_n, input_q_n, z_previous, mu_p_previous,
                logvar_p_previous, mu_q_previous, logvar_q_previous, *args)

            z_n = T.switch(mask_n, z_n, z_previous)
            mu_p = T.switch(mask_n, mu_p, mu_p_previous)
            logvar_p = T.switch(mask_n, logvar_p, logvar_p_previous)
            mu_q = T.switch(mask_n, mu_q, mu_q_previous)
            logvar_q = T.switch(mask_n, logvar_q, logvar_q_previous)

            return z_n, mu_p, logvar_p, mu_q, logvar_q

        # NOTE(review): the trailing arguments of this call were cut off in
        # the listing; a standard normal (avg=0.0, std=1.0) is assumed.
        eps = self._srng.normal(size=(seq_len, num_batch, self.num_units),
                                avg=0.0, std=1.0)
        logvar_init = T.zeros((num_batch, self.num_units))
        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [eps, input_p, input_q, mask]
            step_fun = step_masked
        else:
            sequences = [eps, input_p, input_q]
            step_fun = step

        # Parameters of all four MLPs are used inside step
        non_seqs = helper.get_all_params(self.logvar_p_mlp)
        non_seqs += helper.get_all_params(self.mu_p_mlp)
        non_seqs += helper.get_all_params(self.q_mu_mlp)
        non_seqs += helper.get_all_params(self.q_logvar_mlp)

        # Initial values for (z, mu_p, logvar_p, mu_q, logvar_q); the same
        # initial mean and log-variance are used for both p and q
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            scan_out = unroll_scan(fn=step_fun,
                                   sequences=sequences,
                                   outputs_info=[
                                       z_init, mu_p_init, logvar_init,
                                       mu_p_init, logvar_init
                                   ],
                                   go_backwards=self.backwards,
                                   non_sequences=non_seqs,
                                   n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function; [0] drops the updates dictionary
            scan_out = theano.scan(fn=step_fun,
                                   sequences=sequences,
                                   outputs_info=[
                                       z_init, mu_p_init, logvar_init,
                                       mu_p_init, logvar_init
                                   ],
                                   go_backwards=self.backwards,
                                   truncate_gradient=self.gradient_steps,
                                   non_sequences=non_seqs,
                                   strict=True)[0]

        z, mu_p, logvar_p, mu_q, logvar_q = scan_out

        # Returning only the final sequence step is not supported by this
        # layer
        if self.only_return_final:
            assert False
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            z = z.dimshuffle(1, 0, 2)
            mu_p = mu_p.dimshuffle(1, 0, 2)
            logvar_p = logvar_p.dimshuffle(1, 0, 2)
            mu_q = mu_q.dimshuffle(1, 0, 2)
            logvar_q = logvar_q.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                z = z[:, ::-1]
                mu_p = mu_p[:, ::-1]
                logvar_p = logvar_p[:, ::-1]
                mu_q = mu_q[:, ::-1]
                logvar_q = logvar_q[:, ::-1]

        return z, mu_p, logvar_p, mu_q, logvar_q
Esempio n. 21
    def predict_symbolic(self, mx, Sx, unroll_scan=False):
        # Builds the predictive outputs for input mean `mx` and input
        # covariance `Sx`: returns (M, S) when `Sx` is None and (M, S, V)
        # otherwise. `unroll_scan` selects lasagne's unrolled loop instead of
        # theano.scan for accumulating the second-moment matrix M2.
        # NOTE(review): this listing has visibly lost text (empty right-hand
        # sides, unclosed calls, no `else:` ahead of the theano.scan branch);
        # TODO restore those statements from the upstream source.
        idims = self.D
        odims = self.E

        # NOTE(review): as written this binds the list [1]; it looks like the
        # remains of a truncated `<array>.shape[1]` expression — TODO confirm.
        Ms =[1]
        sf2M = (self.hyp[:, idims]**2)/tt.cast(Ms, floatX)
        sn2 = self.hyp[:, idims+1]**2

        # TODO this should just fallback to the method from the SSGP class
        if Sx is None:
            # first check if we received a vector [D] or a matrix [nxD]
            if mx.ndim == 1:
                mx = mx[None, :]

            # NOTE(review): right-hand side truncated in this listing.
            srdotx =,2,1)
            phi_x = tt.concatenate([tt.sin(srdotx), tt.cos(srdotx)], 2)
            M = (phi_x*self.beta_ss[:, None, :]).sum(-1)
            # one triangular solve per output dimension against self.Lmm[i]
            phi_x_L = tt.stack([
                solve_lower_triangular(self.Lmm[i], phi_x[i].T)
                for i in range(odims)])
            S = sn2[:, None]*(1 + (sf2M[:, None])*(phi_x_L**2).sum(-2)) + 1e-6

            # early exit: no input-output covariance V is produced here
            return M, S

        # precompute some variables
        # NOTE(review): the next three assignments are truncated in this
        # listing (missing operands).
        srdotx =
        srdotSx =
        srdotSxdotsr = tt.sum(srdotSx*, 2)
        e = tt.exp(-0.5*srdotSxdotsr)
        cos_srdotx = tt.cos(srdotx)
        sin_srdotx = tt.sin(srdotx)
        cos_srdotx_e = cos_srdotx*e
        sin_srdotx_e = sin_srdotx*e

        # compute the mean vector
        mphi = tt.horizontal_stack(sin_srdotx_e, cos_srdotx_e)  # E x 2*Ms
        M = tt.sum(mphi*self.beta_ss, 1)

        # input output covariance
        mx_c = mx.dimshuffle(0, 'x')
        sin_srdotx_e_r = sin_srdotx_e.dimshuffle(0, 'x', 1)
        cos_srdotx_e_r = cos_srdotx_e.dimshuffle(0, 'x', 1)
        srdotSx_tr = srdotSx.transpose(0, 2, 1)
        c = tt.concatenate([mx_c*sin_srdotx_e_r + srdotSx_tr*cos_srdotx_e_r,
                            mx_c*cos_srdotx_e_r - srdotSx_tr*sin_srdotx_e_r],
                           axis=2)  # E x D x 2*Ms
        beta_ss_r = self.beta_ss.dimshuffle(0, 'x', 1)

        # input output covariance (notice this is not premultiplied by the
        # input covariance inverse)
        V = tt.sum(c*beta_ss_r, 2).T - tt.outer(mx, M)

        srdotSxdotsr_c = srdotSxdotsr.dimshuffle(0, 1, 'x')
        srdotSxdotsr_r = srdotSxdotsr.dimshuffle(0, 'x', 1)
        M2 = tt.zeros((odims, odims))

        # initialize indices
        # only (i, j) pairs from the upper triangle (diagonal included) are
        # visited by the loop below
        triu_indices = np.triu_indices(odims)
        indices = [tt.as_index_variable(idx) for idx in triu_indices]

        def second_moments(i, j, M2, beta, iA, sn2, sf2M, sr, srdotSx,
                           srdotSxdotsr_c, srdotSxdotsr_r,
                           sin_srdotx, cos_srdotx, *args):
            # Step function for one (i, j) index pair: writes that pair's
            # second moment into M2[i, j] and returns the updated M2.
            # compute the second moments of the spectrum feature vectors
            siSxsj = srdotSx[i].dot(sr[j].T)  # Ms x Ms
            sijSxsij = -0.5*(srdotSxdotsr_c[i] + srdotSxdotsr_r[j])
            em = tt.exp(sijSxsij+siSxsj)      # MsxMs
            ep = tt.exp(sijSxsij-siSxsj)     # MsxMs
            si = sin_srdotx[i]       # Msx1
            ci = cos_srdotx[i]       # Msx1
            sj = sin_srdotx[j]       # Msx1
            cj = cos_srdotx[j]       # Msx1
            sicj = tt.outer(si, cj)  # MsxMs
            cisj = tt.outer(ci, sj)  # MsxMs
            sisj = tt.outer(si, sj)  # MsxMs
            cicj = tt.outer(ci, cj)  # MsxMs
            sm = (sicj-cisj)*em
            sp = (sicj+cisj)*ep
            cm = (sisj+cicj)*em
            cp = (cicj-sisj)*ep

            # Populate the second moment matrix of the feature vector
            Q_up = tt.concatenate([cm-cp, sm+sp], axis=1)
            Q_lo = tt.concatenate([sp-sm, cm+cp], axis=1)
            Q = tt.concatenate([Q_up, Q_lo], axis=0)

            # Compute the second moment of the output
            m2 = 0.5*matrix_dot(beta[i], Q, beta[j].T)

            # extra term applied when i == j
            # NOTE(review): the ifelse call is truncated here (the i != j
            # branch and closing parenthesis are missing). It also reads
            # `self.iA` although an `iA` argument is passed in — confirm
            # which one is intended.
            m2 = theano.ifelse.ifelse(
                tt.eq(i, j),
                m2 + sn2[i]*(1.0 + sf2M[i]*tt.sum(self.iA[i]*Q)) + 1e-6,
            M2 = tt.set_subtensor(M2[i, j], m2)
            return M2

        # non-sequence arguments handed to second_moments after (i, j, M2)
        # NOTE(review): this list is truncated (an element is missing after
        # `sf2M,` and the closing bracket is absent).
        nseq = [self.beta_ss, self.iA, sn2, sf2M,, srdotSx,
                srdotSxdotsr_c, srdotSxdotsr_r, sin_srdotx, cos_srdotx,

        if unroll_scan:
            # the import rebinds the local name `unroll_scan` (until here the
            # boolean argument) to the lasagne function; the flag has already
            # been tested by the `if` above
            from lasagne.utils import unroll_scan
            [M2_] = unroll_scan(second_moments, indices,
                                [M2], nseq, len(triu_indices[0]))
            updts = {}
            # NOTE(review): an `else:` presumably preceded the scan call
            # below, and the call itself is cut off — TODO restore.
            M2_, updts = theano.scan(fn=second_moments,
                                     name="%s>M2_scan" % (

        # keep the M2 produced by the last step
        M2 = M2_[-1]
        # only upper-triangular entries were written; add the transposed
        # strict upper triangle to fill in the lower part
        M2 = M2 + tt.triu(M2, k=1).T
        S = M2 - tt.outer(M, M)

        return M, S, V
Esempio n. 22
    def get_output_for(self, inputs, recurrence_flags={}, **kwargs):
        # Runs the one-step recurrence (self.get_one_step) over the input
        # sequences, either with unroll_scan or theano.scan, and returns an
        # OrderedDict keyed by self.keys() holding the state and output
        # sequences.
        # `recurrence_flags` is a mutable default; in the visible code it is
        # only read (unpacked with ** into get_one_step).
        # NOTE(review): the prose lines directly below read like a docstring
        # whose delimiters and section headings were lost in this listing;
        # several `else:` lines and call tails are missing as well — TODO
        # restore from the upstream source.
        returns history of agent interaction with environment for given number of turns.
            inputs - [state init]  + [input_nonsequences] + [input_sequences]
                Each part is a list of theano expressions for layers in the order they were
                provided when creating this layer.
            recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports)
                e.g. {deterministic=True}
            [state_sequences] + [output sequences] - a list of all states and all outputs sequences
            Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...]
        # set batch size
        # NOTE(review): an `else:` presumably separated the two assignments
        # below (take the batch size from the first input when there is one,
        # otherwise fall back to self.batch_size) — confirm.
        if len(inputs) != 0:
            batch_size = inputs[0].shape[0]
            batch_size = self.batch_size

        n_states = len(self.state_variables)
        n_state_inits = len(self.state_init)
        n_input_nonseq = len(self.input_nonsequences)
        n_input_seq = len(self.input_sequences)
        n_outputs = len(self.tracked_outputs)

        # split the flat `inputs` list into its three consecutive groups
        initial_states, nonsequences, sequences = unpack_list(inputs, [n_state_inits, n_input_nonseq, n_input_seq])

        # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan
        sequences = [seq.swapaxes(1, 0) for seq in sequences]

        # create outputs_info for scan
        # map each key of self.state_init to the expression supplied for it
        initial_states = OrderedDict(list(zip(self.state_init, initial_states)))

        def get_initial_state(state_out_layer):
            """Pick dedicated initial state or create zeros of appropriate shape and dtype"""
            # if we have a dedicated init, use it
            if state_out_layer in initial_states:
                initial_state = initial_states[state_out_layer]
            # otherwise initialize with zeros
            # NOTE(review): the `else:` line and the end of the T.zeros call
            # are missing in this listing.
                initial_state = T.zeros((batch_size,) + tuple(state_out_layer.output_shape[1:]),
            return initial_state

        initial_state_variables = list(map(get_initial_state, self.state_variables))

        # states are fed back between steps; tracked outputs get None, i.e.
        # they are collected but not passed to the next step
        outputs_info = initial_state_variables + [None] * len(self.tracked_outputs)

        # recurrent step function
        def step(*args):

            # NOTE(review): the split below reserves a `prev_outputs` group
            # of n_outputs entries, although outputs_info uses None for the
            # tracked outputs — verify how unpack_list and the scan argument
            # order interact here.
            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])

            # make dicts of prev_states and inputs
            prev_states_dict = OrderedDict(zip(list(self.state_variables.keys()), prev_states))

            input_layers = list(chain(self.input_nonsequences.keys(), self.input_sequences.keys()))
            assert len(input_layers) == len(nonsequences + sequence_slices)

            inputs_dict = OrderedDict(zip(input_layers, nonsequences + sequence_slices))

            # call one step recurrence
            new_states, new_outputs = self.get_one_step(prev_states_dict, inputs_dict, **recurrence_flags)
            return new_states + new_outputs

        # NOTE(review): both calls below are cut off and the `else:` between
        # them is missing; the updates handling presumably belongs to the
        # theano.scan branch only.
        if self.unroll_scan:
            # call scan itself
            history = unroll_scan(step,
            history,updates = theano.scan(step,
            self.updates = updates
            # NOTE(review): the message below spells `no_defalt_updates`;
            # theano.function's keyword is `no_default_updates`. It is a
            # runtime string, so it is left untouched here.
            if len(updates) !=0:
                warn("Warning: recurrent loop without unroll_scan got nonempty random state updates list. That happened"
                     " because there is some source of randomness (e.g. dropout) inside recurrent step graph."
                     " To compile such graph, one must either call .get_automatic_updates() right after .get_output"
                     " and pass these updates to a function, or use no_defalt_updates=True when compiling theano.function.")

        # reordering from [time,batch,...] to [batch,time,...]
        # variables with at most one dimension are passed through unchanged
        history = [(var.swapaxes(1, 0) if var.ndim > 1 else var) for var in history]

        state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs])

        # handle delayed_states
        # selectively shift state sequences by 1 tick into the past, padding with their initialisations
        for i in range(len(state_seqs)):
            if list(self.state_variables.keys())[i] in self.delayed_states:
                state_seq = state_seqs[i]
                state_init = initial_state_variables[i]
                # prepend the initial state along axis 1 and drop the last tick
                state_seq = T.concatenate([insert_dim(state_init, 1), state_seq[:, :-1]], axis=1)
                state_seqs[i] = state_seq

        return OrderedDict(zip(self.keys(),state_seqs + output_seqs))
Esempio n. 23
    def get_loss(self, unroll_scan=False, cache_intermediate=True):
        # Builds the loss expression and returns (loss_ss.sum(), inps, updts).
        # `unroll_scan` selects lasagne's unrolled loop instead of
        # theano.scan; `cache_intermediate` controls how iA, Lmm and beta_ss
        # are stored on the instance.
        # NOTE(review): this listing has lost text in several places (unclosed
        # calls, assignments without a target or value, missing `else:`
        # lines) — TODO restore from the upstream source.
        utils.print_with_stamp('Building Sparse Spectrum loss',
        idims = self.D

        # NOTE(review): the condition and both assignment targets below are
        # truncated in this listing.
        if is None:
   = self.w/(self.hyp[:, :idims])
   =, 0, 2)

        # init variables
        N = self.X.shape[0].astype(floatX)
        # NOTE(review): the next two right-hand sides are truncated.
        M =[1].astype(floatX)
        Mi = 2*[1]
        EyeM = tt.eye(Mi)
        sf2 = self.hyp[:, idims]**2
        sf2M = (sf2/M).dimshuffle(0, 'x', 'x')
        sn2 = (self.hyp[:, idims+1]**2).dimshuffle(0, 'x', 'x')
        # NOTE(review): right-hand side truncated.
        srdotX =

        phi_f = tt.concatenate([tt.sin(srdotX), tt.cos(srdotX)], axis=1)
        Phi_f = tt.batched_dot(phi_f, phi_f.transpose(0, 2, 1))
        A = sf2M*Phi_f
        # adds (sn2 + 1e-6) on the diagonal
        A += (sn2 + 1e-6)*EyeM
        phi_f_dotY = tt.batched_dot(phi_f, self.Y.T)

        def nlml(A, phidotY, EyeM):
            # Per-item step: factorises A and solves against [EyeM | phidotY]
            # in one go; the leading columns of the solution become iA and
            # the last column beta_ss.
            # presumably Lmm is the lower Cholesky factor of A — confirm
            # against the Cholesky op's default.
            Lmm = Cholesky()(A)
            rhs = tt.concatenate([EyeM, phidotY[:, None]], axis=1)
            sol = solve_upper_triangular(
                Lmm.T, solve_lower_triangular(Lmm, rhs))
            iA = sol[:, :-1]
            beta_ss = sol[:, -1]

            return iA, Lmm, beta_ss

        seq = [A, phi_f_dotY]
        nseq = [EyeM]

        if unroll_scan:
            # the import rebinds the local name `unroll_scan` (until here the
            # boolean argument) to the lasagne function
            from lasagne.utils import unroll_scan
            [iA, Lmm, beta_ss] = unroll_scan(nlml, seq, [], nseq, self.E)
            updts = {}
            # NOTE(review): an `else:` presumably preceded the scan call
            # below, and the call is cut off after `name=`.
            (iA, Lmm, beta_ss), updts = theano.scan(
                fn=nlml, sequences=seq, non_sequences=nseq,
                allow_gc=False, return_list=True,
                name='%s>logL_ss' % (

        # scale beta_ss
        beta_ss *= sf2M[:, :, 0]

        # And finally, the negative log marginal likelihood
        YdotY = tt.sum(self.Y**2, 0)
        Ydotphidotbeta = tt.sum(phi_f_dotY*beta_ss, -1)
        loss_ss = 0.5*(YdotY - Ydotphidotbeta)/sn2
        # idx picks the diagonal entries Lmm[:, k, k]
        idx = [theano.tensor.arange(Lmm.shape[i]) for i in [1, 2]]
        loss_ss += tt.sum(tt.log(Lmm[:, idx[0], idx[1]]), 1)
        loss_ss += (0.5*N - M)*tt.log(sn2)
        loss_ss += 0.5*N*np.log(2*np.pi, dtype=floatX)

        if cache_intermediate:
            # we are going to save the intermediate results in the following
            # shared variables, so we can use them during prediction without
            # having to recompute them
            kk = 2*self.n_inducing
            N, E = self.N, self.E
            # NOTE(review): the three S(...) calls below are cut off after
            # `name=`.
            if type(self.iA) is not tt.sharedvar.SharedVariable:
                self.iA = S(np.tile(np.eye(kk, dtype=floatX), (E, 1, 1)),
                            name="%s>iA" % (
            if type(self.Lmm) is not tt.sharedvar.SharedVariable:
                self.Lmm = S(np.tile(np.eye(kk, dtype=floatX), (E, 1, 1)),
                             name="%s>Lmm" % (
            if type(self.beta_ss) is not tt.sharedvar.SharedVariable:
                self.beta_ss = S(np.ones((E, kk), dtype=floatX),
                                 name="%s>beta_ss" % (
            updts = [(self.iA, iA), (self.Lmm, Lmm), (self.beta_ss, beta_ss)]
            # NOTE(review): an `else:` presumably preceded the two lines
            # below (store the symbolic results directly and return no
            # updates when caching is off) — confirm.
            self.iA, self.Lmm, self.beta_ss = iA, Lmm, beta_ss
            updts = None

        # we add some penalty to avoid having parameters that are too large
        if self.snr_penalty is not None:
            penalty_params = {'log_snr': np.log(1000, dtype=floatX),
                              'log_ls': np.log(100, dtype=floatX),
                              'log_std': tt.log(self.X.std(0)*(N/(N-1.0))),
                              'p': 30}
            loss_ss += self.snr_penalty(tt.log(self.hyp), **penalty_params)

        # add a penalty for high frequencies
        freq_penalty = tt.square(self.w).sum(-1).mean(0)
        loss_ss = loss_ss + freq_penalty

        # no explicit inputs are returned by this method
        inps = []
        self.state_changed = True  # for saving
        return loss_ss.sum(), inps, updts
Esempio n. 24
    def get_output_for(self, inputs, **kwargs):
        # Runs the time-gated LSTM recurrence over `inputs[0]`, using the
        # extra sequence `inputs[self.time_incoming_index]`, and returns the
        # hidden-state output `hid_out`.
        # NOTE(review): this listing has lost text (missing `else:` lines,
        # unclosed T.concatenate / scan calls, truncated right-hand sides) —
        # TODO restore from the upstream source.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        # an index <= 0 means the corresponding optional input is absent
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # TLSTM: Define new input
        time_mat = inputs[self.time_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

            # NOTE(review): truncated assignment; its original indentation
            # and right-hand side cannot be recovered from this listing.
            input =

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        #(n_time_steps, n_batch)
        # the first two axes are swapped and a broadcastable trailing axis is
        # appended
        time_input = time_mat.dimshuffle(1, 0, 'x')
        # time_seq_len and time_num_batch are not referenced again in the
        # visible code
        time_seq_len, time_num_batch, _ = time_input.shape
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        # NOTE(review): six matrices are listed (four gates plus the two
        # time-gate input weights) and step() slices blocks 0-5, so the
        # "4*num_units" above looks stale. The call is also cut off.
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,
            self.W_in_to_outgate, self.W_x2_to_tg2, self.W_x1_to_tg1

        # Same for hidden weight matrices
        # NOTE(review): call cut off after the third matrix.
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,

        # Stack biases into a (4*num_units) vector
        # NOTE(review): six bias vectors are listed; call cut off.
        b_stacked = T.concatenate([
            self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate,
            self.b2_tg2, self.b1_tg1

        # W_t1_to_tg1_constraint < 0
        # NOTE(review): the T.switch condition and its last argument are
        # truncated.
        W_t1_to_tg1_constraint = T.switch(
  , self.boundary), self.W_t1_to_tg1,

        # Stack delta time weight matrices into a (num_inputs, 2* num_units)
        # NOTE(review): three matrices are stacked and step() slices blocks
        # 0-2 of the result, so "2* num_units" above looks stale.
        W_t_stacked = T.concatenate(
            [self.W_to_to_outgate, self.W_t2_to_tg2, W_t1_to_tg1_constraint],

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            # NOTE(review): both right-hand sides below are truncated.
            time_input =, W_t_stacked)
            input =, W_in_stacked) + b_stacked

        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        # `start` and `stride` are counted in blocks of self.num_units columns
        def slice_w(x, start, stride=1):
            return x[:,
                     start * self.num_units:(start + stride) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        # todo
        # insert Tm_n, weight_t_o_n in to mask_n and xell_previous
        def step(input_n, time_input_n, cell_previous, hid_previous, *args):
            # One time step; returns [cell, hid] for the next step.
            if not self.precompute_input:
                # NOTE(review): both right-hand sides below are truncated.
                time_input_n =, W_t_stacked)
                input_n =, W_in_stacked) + b_stacked

            # blocks 0..2 of the time projection: out-gate term, time gate 2,
            # time gate 1
            tm_wto_n = slice_w(time_input_n, 0)
            tm_w2_n = slice_w(time_input_n, 1)
            tm_w1_n = slice_w(time_input_n, 2)
            tm_w2_n = self.nonlinearity_inside_tg2(tm_w2_n)
            tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n)
            # blocks 4 and 5 of the input projection feed the two time gates
            tm2_xwb_n = slice_w(input_n, 4)
            tm1_xwb_n = slice_w(input_n, 5)
            timegate2 = self.nonlinearity_outside_tg2(tm_w2_n + tm2_xwb_n)
            timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n)
            # keep only blocks 0..3 for the ordinary gate pre-activations
            input_n = slice_w(input_n, 0, 4)

            # Calculate gates pre-activations and slice
            # NOTE(review): second operand truncated.
            gates = input_n +, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                # NOTE(review): call cut off (upper bound missing).
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)
            # the time term is added to the output gate before its
            # nonlinearity
            outgate += tm_wto_n

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            # two cell values are formed: `cell` (scaled by timegate2) is
            # what gets returned and carried forward, while `tilde_cell`
            # (scaled by timegate1) drives the output gate peephole and the
            # hidden state of this step
            cell = forgetgate * cell_previous + ingate * timegate2 * cell_input
            tilde_cell = forgetgate * cell_previous + ingate * timegate1 * cell_input

            if self.peepholes:
                outgate += tilde_cell * self.W_cell_to_outgate

            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(tilde_cell)
            return [cell, hid]

        def step_masked(input_n, time_input_n, mask_n, cell_previous,
                        hid_previous, *args):
            # Same as step(), but keeps the previous cell/hidden values
            # wherever mask_n is 0.

            cell, hid = step(input_n, time_input_n, cell_previous,
                             hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        # NOTE(review): an `else:` presumably separated the masked and
        # unmasked assignments below; as listed, the unmasked pair would
        # always overwrite the masked one.
        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, time_input, mask]
            step_fun = step_masked
            sequences = [input, time_input]
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): right-hand side truncated.
            cell_init =, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): right-hand side truncated.
            hid_init =, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]

        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            # NOTE(review): list cut off after the second matrix.
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked, W_t_stacked]

        # NOTE(review): both calls below are cut off and the `else:` between
        # the unrolled and the theano.scan variants is missing.
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            outputs_info=[cell_init, hid_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                outputs_info=[cell_init, hid_init],

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        # NOTE(review): an `else:` presumably preceded the dimshuffle below;
        # as listed, the final-step slice would be dimshuffled with three
        # axes — confirm.
        if self.only_return_final:
            hid_out = hid_out[-1]
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 25
    def get_output_for(self, inputs, **kwargs):
        # Runs a gated additive recurrence over `inputs[0]` (values) and
        # `inputs[1]` (gates): each step computes
        # hid = input_n + gate_n * hid_previous. Returns (cell_out, temp_out),
        # or `outs[-1]` when self.only_return_final is set.
        # NOTE(review): the prose lines directly below read like a docstring
        # whose delimiters and section headings were lost in this listing;
        # several `else:` lines and call tails are missing too — TODO restore
        # from the upstream source.
        Compute this layer's output function given a symbolic input variable
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When the cell state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When both the cell state and the hidden state are
            being pre-filled `inputs[-2]` is the hidden state, while
            `inputs[-1]` is the cell state.
        layer_output : theano.TensorType
            Symbolic output variable.
        # Retrieve the layer input
        # NOTE(review): the text above describes inputs[1] as the mask, but
        # the code below reads inputs[1] as the gate and takes the mask from
        # self.mask_incoming_index — the text looks inherited from another
        # layer.
        input = inputs[0]
        gate = inputs[1]

        cell_init = None
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        # index 0 of the trailing (fourth) axis is taken, then the former
        # axis 2 is moved to the front so that scan iterates over it
        input = input[:, :, :, 0].dimshuffle(2, 0, 1)
        gate = gate[:, :, :, 0].dimshuffle(2, 0, 1)

        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        # NOTE(review): no stacking code follows the comment above in this
        # listing.

        # We define a slicing function that extract the input to each LSTM gate
        # slice_w is not referenced in the visible code
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, gate_n, hid_previous, *args):
            # One time step; returns the new state and the gated carry term.

            hid = input_n
            # temp=rectify( gate_pos_n*rectify(hid_previous) )
            # temp+=neg_rectify_neg( gate_neg_n*neg_rectify_neg(hid_previous) )

            # temp = T.nnet.hard_sigmoid(gate_n)*hid_previous
            temp = gate_n * hid_previous

            hid += temp

            return hid, temp

        def step_masked(input_n, gate_n, mask_n, hid_previous, *args):
            # Same as step(), but keeps hid_previous wherever mask_n is 0;
            # `temp` is returned unmasked.
            hid, temp = step(input_n, gate_n, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = T.switch(mask_n, hid, hid_previous)

            return hid, temp

        # NOTE(review): an `else:` presumably separated the masked and
        # unmasked assignments below.
        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, gate, mask]
            step_fun = step_masked
            sequences = [input, gate]
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): right-hand side truncated.
            cell_init =, self.cell_init)

        # only the first step output is fed back (seeded with cell_init); the
        # second (`temp`) is collected without recurrence
        outputs_info = [cell_init, None]

        # NOTE(review): both calls below are cut off and the `else:` between
        # the unrolled and the theano.scan variants is missing.
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            outs = unroll_scan(fn=step_fun,
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            outs = theano.scan(
                # truncate_gradient=self.gradient_steps,

        # NOTE(review): `outs` is indexed as a pair of sequences further down
        # (outs[0], outs[1]); if that holds, outs[-1] is the whole second
        # output rather than the final time step — TODO confirm.
        if self.only_return_final:
            return outs[-1]

        # dimshuffle back to (n_batch, n_time_steps, n_features))
        cell_out = outs[0].dimshuffle(1, 0, 2)
        temp_out = outs[1].dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        # only cell_out is reversed; temp_out keeps scan order
        if self.backwards:
            cell_out = cell_out[:, ::-1]

        return cell_out, temp_out
Esempio n. 26
    def get_output_for(self, inputs, **kwargs):
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        visual_input = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]
        if self.visual_input_index > 0:
            visual_input = inputs[self.visual_input_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate, self.W_in_to_ggate],

        # Same for hidden weight matrices
        # pdb.set_trace()
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate, self.W_hid_to_ggate],

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate, self.b_ggate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            input =, W_in_stacked) + b_stacked

        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(
            cell_previous, hid_previous,
            W_hid_stacked, W_in_stacked, b_stacked,
            W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
            W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,
            if not self.precompute_input:
                input_n =, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n +, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)
            ggate = slice_w(gates, 4)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*W_cell_to_ingate
                forgetgate += cell_previous*W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            # ggate gt
            ggate = self.nonlinearity_ggate(ggate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            st = ggate*self.nonlinearity(cell)

            # zt =
            #     self.nonlinearity(
            #, W_v_to_attenGate) +
            #   , W_g_to_attenGate).dimshuffle(0, 1, 'x'),
            #             T.ones((1, self.video_len))
            #         )
            #     ),
            #     W_h_to_attenGate
            # )[:, :, 0]

            # to avoid optimization failure of Tenseor 3D dot vector, we should transform
            # e = to e = A*B.dimshuffle('x', 'x', 0), e=e.sum(axis=2)
            zt_dot_A = self.nonlinearity(
      , W_v_to_attenGate) +
          , W_g_to_attenGate).dimshuffle(0, 1, 'x'),
                    T.ones((1, self.video_len))
            zt = zt_dot_A*W_h_to_attenGate.dimshuffle('x', 'x', 0)
            zt = zt.sum(axis=2)

            # vt =
            #     self.nonlinearity(
            #             st, W_s_to_attenGate
            #         ) +
            #             hid, W_g_to_attenGate
            #         )
            #     ),
            #     W_h_to_attenGate
            # )

            vt_dot_A = self.nonlinearity(
                    st, W_s_to_attenGate
                ) +
                    hid, W_g_to_attenGate
            vt = vt_dot_A*W_h_to_attenGate.dimshuffle('x', 0)
            vt = vt.sum(axis=1)
            vt = vt.dimshuffle(0, 'x')

            alpha_hat_t = self.nonlinearity_attenGate(T.concatenate(
                [zt, vt],
            feature = T.concatenate(
                [visual_input, st.dimshuffle(0, 'x', 1)],
            ).dimshuffle(2, 0, 1)
            c_hat_t = T.sum(alpha_hat_t*feature, axis=-1)
            It =
                (c_hat_t.T+hid), W_p
            return [cell, hid, It]

        def step_masked(
            input_n, mask_n,
            cell_previous, hid_previous, It_previous,
            W_hid_stacked, W_in_stacked, b_stacked,
            W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
            W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,
            cell, hid, It = step(
                cell_previous, hid_previous,
                W_hid_stacked, W_in_stacked, b_stacked,
                W_cell_to_ingate, W_cell_to_forgetgate, W_cell_to_outgate,
                W_h_to_attenGate, W_g_to_attenGate, W_v_to_attenGate, W_s_to_attenGate,

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            It = T.switch(mask_n, It, It_previous)
            # theano.printing.Print('It')(It)
            return [cell, hid, It]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init =, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init =, self.hid_init)

        It_init =, self.It_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [visual_input, W_hid_stacked]
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]
            non_seqs += [(), ()]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
            non_seqs += [(), (), ()]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function

        non_seqs += [self.W_h_to_attenGate, self.W_g_to_attenGate, self.W_v_to_attenGate, self.W_s_to_attenGate, self.W_p]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out, It = unroll_scan(
                outputs_info=[cell_init, hid_init, It_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function

            cell_out, hid_out, It = theano.scan(
                outputs_info=[cell_init, hid_init, It_init],

        It = It.dimshuffle(1, 0, 2)
        if self.backwards:
            It = It[:, ::-1]
        return It
Esempio n. 27
    def get_output_for(self, inputs, **kwargs):
        # Purpose: build the symbolic output of a recurrent layer whose step
        # function combines the layer input with a time input and a duration
        # input (see `step` below).
        #
        # `inputs` is a list: inputs[0] is the layer input; the mask, the
        # initial hidden state and the initial cell state are read from it
        # only when their stored index is > 0; the time and duration matrices
        # are always read. Returns `hid_out`.
        #
        # NOTE(review): several statements in this listing look truncated
        # (dangling `=`/`,` fragments, unclosed calls, `if` branches with no
        # visible `else:`). The code is left exactly as found; confirm
        # against the upstream source before relying on it.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None

        # Pull out the optional inputs that were stored in `inputs`
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        time_mat = inputs[self.time_incoming_index]

        # addv
        duration_mat = inputs[self.duration_incoming_index]

        # If ndim > 3, collapse the trailing dimensions of the input.
        # NOTE(review): the next two lines are bare prose, not comments; they
        # read as a flatten example (flattening a (2, 3, 4, 5) tensor with
        # outdim=2 keeps the leading (2,) dimension). Presumably they were
        # inside a block comment whose delimiters were lost.
        例如,如果我们用展平(x,outdim = 2)展平shape(2,3,4,5)的张量,
        那么我们将具有相同的(2-1 = 1)前导尺寸(2,),
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Batch normalization
        # NOTE(review): the statement below is incomplete in this listing.
            input =

        # Swap the first two dimensions of the data
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape
        # (n_time_steps, n_batch)
        # add
        time_input = time_mat.dimshuffle(1, 0, 'x')
        time_seq_len, time_num_batch, _ = time_input.shape

        # addv
        duration_input = duration_mat.dimshuffle(1, 0, 'x')
        duration_seq_len, duration_num_batch, _ = duration_input.shape

        # Combine into a (num_features, (num_units*6)) vector
        # and also add a weight matrix (x)
        # NOTE(review): the list of stacked matrices is only partly visible.
        W_in_stacked = T.concatenate(
                self.W_x1_to_tg1,  # add
                self.W_x2_to_dg2,  # addv

        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_cell, self.W_hid_to_outgate],
        # Combine into a (6*num_units) vector
        b_stacked = T.concatenate(
                self.b1_tg1,  # add: the time bias
                self.b2_dg2,  # addv

        # add2: used to constrain Wt1; use W_t1_to_tg1_constraint in place of
        # W_t1_to_tg1
        # W_t1_to_tg1_constraint < 0
        # W_t1_to_tg1_constraint = self.W_t1_to_tg1
        # W_t1_to_tg1_constraint = T.switch(, self.boundary), self.W_t1_to_tg1, self.boundary)

        # add: t is only multiplied with two matrices (t)
        # Stack delta time weight matrices into a (1, 2* num_units)
        W_t_stacked = T.concatenate(
                self.W_t1_to_tg1  # change

        # addv
        W_d_stacked = T.concatenate([
        ], axis=1)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            # add: precompute the inputs
            # NOTE(review): the left operand of each product below is missing
            # from this listing.
            time_input =, W_t_stacked)
            input =, W_in_stacked) + b_stacked
            # addv
            duration_input =, W_d_stacked)

        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        # change
        # slice_w returns `stride` consecutive num_units-wide column blocks of
        # x, starting at block index `start`.
        def slice_w(x, start, stride=1):
            return x[:,
                     start * self.num_units:(start + stride) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, time_input_n, duration_input_n, cell_previous,
                 hid_previous, hid1_pre, hid2_pre, n_pre, *args):
            # Not needed here when the inputs were already precomputed;
            # (original author's note) the purpose is otherwise unclear,
            # possibly a step-by-step computation.
            if not self.precompute_input:
                # add
                # time_input_n is one input from the time sequence
                # before: time_input_n (n_batch, 'x')
                # time_input_n(n_time_steps, n_batch,'x')
                time_input_n =, W_t_stacked)
                # before: input_n (n_batch, n_features)
                # input_n(n_time_steps, n_batch, num_units)
                input_n =, W_in_stacked) + b_stacked
                # addv
                duration_input_n =, W_d_stacked)

            # Use the slicing function to split the inputs into several parts,
            # each part corresponding to one block,
            # e.g. tm_wto_n is the block for t multiplied by wto
            # add
            tm_wto_n = slice_w(time_input_n, 0)

            tm_w1_n = slice_w(time_input_n, 1)
            tm_w1_n = self.nonlinearity_inside_tg1(tm_w1_n)

            # addv
            dm_w2_n = slice_w(duration_input_n, 0)

            dm_w2_n = self.nonlinearity_dg2(dm_w2_n)

            # Block 3 of input_n feeds the time gate, block 4 the duration
            # gate; blocks 0-2 are kept for the ingate/cell/outgate.
            tm1_xwb_n = slice_w(input_n, 3)
            timegate1 = self.nonlinearity_outside_tg1(tm_w1_n + tm1_xwb_n)

            dm2_xwb_n = slice_w(input_n, 4)
            duration_gate2 = self.nonlinearity_outside_dg2(dm_w2_n + dm2_xwb_n)

            input_n = slice_w(input_n, 0, 3)

            # Calculate gates pre-activations and slice
            gates = input_n +, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            cell_input = slice_w(gates, 1)
            outgate = slice_w(gates, 2)
            # add: a tm term is added to outgate before it goes through its
            # activation function
            outgate += tm_wto_n

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            # add: in the cell, the trailing product term gets one more factor
            # cell = forgetgate * cell_previous + ingate * timegate1 * cell_input
            # add2
            # addv
            cell = (1 - ingate * timegate1
                    ) * cell_previous + duration_gate2 * ingate * cell_input
            # tilde_cell = (1 - ingate) * cell_previous + ingate * timegate1 * cell_input
            tilde_cell = cell + ingate * timegate1 * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            hid = outgate * self.nonlinearity(tilde_cell)

            # addv22_6
            # NOTE(review): first operand of the product is missing here.
            hid1 = self.nonlinearity(, self.W_x_wg) + self.b_wg)
            # n is passed through unchanged; hid2 is an alias of hid1.
            n = n_pre
            hid2 = hid1

            return [cell, hid, hid1, hid2, n]

        # NOTE(review): the parameter list and the inner call to `step` are
        # only partly visible in this listing.
        def step_masked(
                time_input_n,  # add: the time input
                duration_input_n,  # addv
            cell, hid, hid1, hid2, n = step(
                time_input_n,  # add: the time input
                duration_input_n,  # addv

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            hid1 = T.switch(mask_n, hid1, hid1_pre)
            hid2 = T.switch(mask_n, hid2, hid2_pre)

            return [cell, hid, hid1, hid2, n]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            # (1, 0, 'x') -> AxB to BxAx(broadcastable dimension)
            mask = mask.dimshuffle(1, 0, 'x')
            # input(seq_len, batch_size, n_feature),
            # mask(seq_len, batch_size, (broadcastable dimension))
            # add: sequences is set up here, probably for the scan below
            # addv
            sequences = [input, time_input, duration_input, mask]
            step_fun = step_masked
            # NOTE(review): an `else:` appears to be missing before the next
            # two assignments; as listed they would override the masked setup.
            # add
            # addv
            sequences = [input, time_input, duration_input]
            step_fun = step

        # (original author's note) the rest is not well understood
        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # ones(num_batch,1) self.cell(1,num_units)
            cell_init =, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # ones(num_batch,1) self.hid(1,num_units)
            hid_init =, self.hid_init)

            # addv22_6
            # NOTE(review): `zeros` is not referenced again in the visible
            # lines, and hid1_init/hid2_init are only assigned inside this
            # `if` as listed -- confirm the intended indentation.
            zeros = T.zeros((num_batch, 1))
            hid1_init =, self.hid1_init)
            hid2_init =, self.hid2_init)

        # The hidden-to-hidden weight matrix is always used in step
        # The weights are quantities that do not change between steps
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate, self.W_cell_to_outgate]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        # (needed only if they were not precomputed earlier)
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        # addv22_6
        non_seqs += [self.W_x_wg, self.b_wg]
        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            # NOTE(review): only two outputs/initial states are listed here
            # while `step` returns five values -- verify this branch.
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            outputs_info=[cell_init, hid_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            # NOTE(review): given step's return order [cell, hid, hid1, hid2,
            # n], `hid_out` is bound to the fourth output (hid2) here --
            # confirm this is intended.
            cell_out, hid_out2, hid1_out, hid_out, n = theano.scan(
                sequences=sequences,  # [input, time_input, mask]
                outputs_info=[cell_init, hid_init, hid1_init, hid2_init, 0],

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            # NOTE(review): an `else:` appears to be missing before the
            # dimshuffle below.
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 28
    def get_output_for(self, inputs, **kwargs):
        Compute this layer's output function given a symbolic input variable

        input : theano.TensorType
            Symbolic input variable.
        mask : theano.TensorType
            Theano variable denoting whether each time step in each
            sequence in the batch is part of the sequence or not.  If ``None``,
            then it is assumed that all sequences are of the same length.  If
            not all sequences are of the same length, then it must be
            supplied as a matrix of shape ``(n_batch, n_time_steps)`` where
            ``mask[i, j] = 1`` when ``j <= (length of sequence i)`` and
            ``mask[i, j] = 0`` when ``j > (length of sequence i)``.

        layer_output : theano.TensorType
            Symblic output variable.
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = input.reshape((input.shape[0], input.shape[1],
        num_batch = input.shape[0]
        encode_seqlen = input.shape[1]

        if mask is None:
            mask = T.ones((num_batch, encode_seqlen),dtype='float32')
        # At each call to scan, input_n will be (n_time_steps, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(cell_previous, hid_previous, alpha_prev, weighted_hidden_prev,
                 input, mask, hUa, W_align, v_align,
                 W_hid_stacked, W_weightedhid_stacked, W_cell_to_ingate,
                 W_cell_to_forgetgate, W_cell_to_outgate,
                 b_stacked, *args):

            #compute (unormalized) attetion vector
            sWa =, W_align)       # (BS, aln_num_units)
            sWa = sWa.dimshuffle(0, 'x', 1)   # (BS, 1, aln_num_units)
            align_act = sWa + hUa
            tanh_sWahUa = self.nonlinearity_align(align_act)
                                            # (BS, seqlen, num_units_aln)

            a =, v_align)  # (BS, Seqlen, 1)
            a = T.reshape(a, (a.shape[0], a.shape[1]))
            #                                # (BS, Seqlen)
            # # ->(BS, seq_len)

            a = a*mask - (1-mask)*10000

            alpha = self.attention_softmax_function(a)
            #alpha = T.reshape(alpha, (input.shape[0], input.shape[1]))

            # input: (BS, Seqlen, num_units)
            weighted_hidden = input * alpha.dimshuffle(0, 1, 'x')
            weighted_hidden = T.sum(weighted_hidden, axis=1)  #sum seqlen out

            # Calculate gates pre-activations and slice

            # (BS, dec_hid) x (dec_hid, dec_hid)
            gates =, W_hid_stacked) + b_stacked
            # (BS, enc_hid) x (enc_hid, dec_hid)
            gates +=, W_weightedhid_stacked)

            # Clip gradients
            if self.grad_clipping is not False:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*W_cell_to_ingate
                forgetgate += cell_previous*W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*W_cell_to_outgate

            # W_align:  (num_units, aln_num_units)
            # U_align:  (num_feats, aln_num_units)
            # v_align:  (aln_num_units, 1)
            # hUa:      (BS, Seqlen, aln_num_units)
            # hid:      (BS, num_units_dec)
            # input:    (BS, Seqlen, num_inputs)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity_out(cell)

            return [cell, hid, alpha, weighted_hidden]

        sequences = []
        step_fun = step

        ones = T.ones((num_batch, 1))
        if isinstance(self.cell_init, T.TensorVariable):
            cell_init = self.cell_init
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            cell_init =, self.cell_init)

        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init =, self.hid_init)

        #weighted_hidden_init = T.zeros((num_batch, input.shape[2]))
        alpha_init = T.zeros((num_batch, encode_seqlen))

        weighted_hidden_init = T.zeros((num_batch, self.num_inputs))

        # The hidden-to-hidden weight matrix is always used in step

        hUa =, self.U_align)   # (num_batch, seq_len, num_units_aln)

        non_seqs = [input, mask, hUa, self.W_align, self.v_align,
                    self.W_hid_stacked, self.W_weightedhid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
        # theano.scan only allows for positional arguments, so when
        # self.peepholes is False, we need to supply fake placeholder arguments
        # for the three peephole matrices.
            non_seqs += [(), (), ()]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function

        non_seqs += [self.b_stacked]

        if self.unroll_scan:
            # Explicitly unroll the recurrence instead of using scan
            cell_out, hid_out, alpha_out, weighted_hidden_out = unroll_scan(
                outputs_info=[cell_init, hid_init, alpha_init, weighted_hidden_init],
                n_steps=self.n_decodesteps + self.decode_pre_steps)
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out, alpha_out, weighted_hidden_out = theano.scan(
                outputs_info=[cell_init, hid_init, alpha_init, weighted_hidden_init],
                n_steps=self.n_decodesteps + self.decode_pre_steps,

        # dimshuffle back to (n_batch, n_time_steps, n_features))

        #a_out - (n_decodesteps, bs, seqlen)
        #hid_out -   (n_decode_steps, bs, num_units)

        # mask:  (BS, encode_seqlen
        # a_out; (n_decodesteps, BS, encode_seqlen)
        cell_out = cell_out.dimshuffle(1, 0, 2)
        hid_out = hid_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, encode_seqlen)
        mask = mask.dimshuffle(0, 'x', 1)
        alpha_out = alpha_out.dimshuffle(1, 0, 2)  # (BS, n_decodesteps, encode_seqlen)

        weighted_hidden_out = weighted_hidden_out.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]
            cell_out = cell_out[:, ::-1]
            weighted_hidden_out = weighted_hidden_out[:, ::-1]
            alpha_out = alpha_out[:, ::-1]

        if self.decode_pre_steps > 0:
            hid_out = hid_out[:, self.decode_pre_steps:]
            cell_out = hid_out[:, self.decode_pre_steps:]
            weighted_hidden_out = weighted_hidden_out[:, self.decode_pre_steps:]
            alpha_out = hid_out[:, self.decode_pre_steps:]

        self.hid_out = hid_out
        self.cell_out = cell_out
        self.weighted_hidden_out = weighted_hidden_out
        self.alpha = alpha_out

        if self.return_decodehid:
            return hid_out
            return weighted_hidden_out
Esempio n. 29
    def get_sessions(self, 
                     session_length = 10,
                     batch_size = None,
                     recorded_sequence = None,
                     initial_hidden = 'zeros',
                     initial_policy = 'zeros',
                     initial_actions = 'zeros',
                     additional_output_layers = [],
        # NOTE(review): the end of the signature and the closing quotes of
        # the docstring are missing from this listing, as are the parameter
        # tails and call arguments of the two step functions below. The code
        # is left exactly as found.
        # NOTE(review): `additional_output_layers` defaults to a shared list
        # object; the visible code only iterates over it, but confirm that it
        # is never mutated.
        # NOTE(review): `initial_policy` is not read anywhere in the visible
        # lines (the policy slot of outputs_info is None).
        """returns history of agent-generated sequences for given number of turns:
            session_length - how many turns of interaction shall there be for each batch
            batch_size - [required parameter] amount of independed sessions [number or symbolic].Irrelevant if you manually set all initial_*.
            recorded_sequence - if None, generator is actually generating output.
                if a tensor[batch_i,time_tick,...] is passed instead, the generator observes this sequence 
                    instead of it's own output
            initial_<something> - initial values for all variables at 0-th time step
            Unless you are doing something nasty, initial policy and actions will not matter at all
            'zeros' default means filling variable with zeros
            Initial values are NOT included in history sequences
            additional_output_layers - any layers of a network which outputs need to be added to the outputs
            flags: optional flags to be sent to NN when calling get_output (e.g. deterministic = True)

            hidden_seq,policy_seq,action_seq, [additional_output_0, additional_output_1]
            for hidden state, agent policy and chosen actions respectively
            each of them having dimensions of [batch_i,seq_i,...]
        # Default initial state: zeros shaped (batch_size,) + the memory
        # layer's output shape without its first dimension.
        if initial_hidden == 'zeros':
            memory_state_shape = lasagne.layers.get_output_shape(self.memory)[1:]
            initial_hidden = T.zeros((batch_size,)+tuple(memory_state_shape))
        # Default initial actions: an int32 zero vector of length batch_size.
        if initial_actions == 'zeros':
            initial_actions = T.zeros([batch_size],dtype='int32')
        time_ticks = T.arange(session_length)

        # recurrent step functions
        def step_active(time_tick,last_hidden,last_policy,last_action,
            """a recurrent step function where generator actually generates sequence"""

            hidden,policy,action,additional_outputs = self.get_agent_reaction(last_hidden,last_action,
            return [hidden,policy,action]+additional_outputs
        def step_passive(time_tick,current_observation,last_hidden,last_policy,last_action,
            """a recurrent step function where generator observes recorded sequence of actions and generates
            possible next steps for recorded sequence prefices. Used for passive training (like language model)"""
            hidden,policy,action,additional_outputs = self.get_agent_reaction(last_hidden,current_observation,
            return [hidden,policy,action]+additional_outputs

        ## main recurrence loop
        # state 0 values; None entries mark outputs that are not fed back
        additional_init = [None for i in additional_output_layers]
        outputs_info = [initial_hidden,None,initial_actions] + additional_init
        # time ticks and [optional] transposed recorded sequence [tick,batch,...]
        sequences = [time_ticks]
        # NOTE(review): the body of this `if` is missing from the listing.
        if recorded_sequence is not None:
        # Generate actively unless a recorded sequence was supplied.
        step = step_active if recorded_sequence is None else step_passive
        history = unroll_scan(step,
            sequences = sequences,
            outputs_info = outputs_info,
            non_sequences = [],
            n_steps = session_length
        # The time-major history is kept on the instance before reordering.
        self.history = history
        # from [time,batch,...] to [batch,time,...]
        history = [ (var.swapaxes(1,0) if var.ndim >1 else var) for var in history]
        # what's inside: the first three entries are the hidden, policy and
        # action sequences; anything after them is an additional output
        hidden_seq,policy_seq,action_seq = history[:3]
        additional_output_sequences = tuple(history[3:])
        return (hidden_seq,policy_seq,action_seq) + additional_output_sequences
Esempio n. 30
    def get_output_for(self, inputs, **kwargs):
        # Purpose: build the symbolic output of a recurrent layer whose
        # pre-activation combines a second-order (multiplicative) term
        # a_g * in_to_hid * hid_to_hid with first-order input-to-hidden and
        # hidden-to-hidden terms and an optional bias (see `step` below).
        # Returns `hid_out`.
        #
        # NOTE(review): this listing has lost text (docstring delimiters and
        # section headers, `else:` lines, and the trailing arguments of
        # several calls). The code is left exactly as found; confirm against
        # the upstream source before relying on it.
        Compute this layer's output function given a symbolic input variable.

        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with.

        layer_output : theano.TensorType
            Symbolic output variable.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        if self.precompute_input:
            # Because the input is given for all time steps, we can precompute
            # the inputs to hidden before scanning. First we need to reshape
            # from (seq_len, batch_size, trailing dimensions...) to
            # (seq_len*batch_size, trailing dimensions...)
            # This strange use of a generator in a tuple was because
            # input.shape[2:] was raising a Theano error
            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
            input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
            input = helper.get_output(self.input_to_hidden, input, **kwargs)

            # Reshape back to (seq_len, batch_size, trailing dimensions...)
            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        # The parameters returned by self._get_mi_params() are passed as well.
        non_seqs += self._get_mi_params()
        # When we are not precomputing the input, we also need to pass the
        # input-to-hidden parameters to step
        if not self.precompute_input:
            non_seqs += helper.get_all_params(self.input_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_to_hid = helper.get_output(self.hidden_to_hidden, hid_previous,

            # Compute the input-to-hidden activation
            if self.precompute_input:
                # if the input is precomputed
                in_to_hid = input_n
                # NOTE(review): an `else:` appears to be missing here.
                # compute the input
                in_to_hid = helper.get_output(self.input_to_hidden, input_n,

            # Compute the second order term
            if self.a_g is not None:
                second_order_term = (self.a_g * in_to_hid * hid_to_hid)
                # NOTE(review): an `else:` appears to be missing here; as
                # listed, the assignment to 0 below would always win.
                # second_order_term = in_to_hid * hid_to_hid
                second_order_term = 0

            # Compute the first order hidden-to-hidden term
            if self.b_g_hid_to_hid is not None:
                f_o_hid_to_hid = self.b_g_hid_to_hid * hid_to_hid

                # NOTE(review): an `else:` appears to be missing here.
                f_o_hid_to_hid = 0

            # Compute first order input to hidden term
            if self.b_g_in_to_hid is not None:
                f_o_in_to_hid = self.b_g_in_to_hid * in_to_hid

                # NOTE(review): an `else:` appears to be missing here.
                # if all else is None, it will output zeros of the right size
                f_o_in_to_hid = T.zeros_like(in_to_hid)

            # Sum the three contributions, then add the bias when present.
            hid_pre = second_order_term + f_o_in_to_hid + f_o_hid_to_hid

            if self.b is not None:
                hid_pre = hid_pre + self.b

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            # Make the mask time-major like the input and add a trailing
            # broadcastable dimension.
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            # NOTE(review): an `else:` appears to be missing before the next
            # two assignments; as listed they would override the masked setup.
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            # NOTE(review): the expression below is truncated in this listing.
            hid_init =, 1)),

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=step_fun,
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(fn=step_fun,

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            # NOTE(review): an `else:` appears to be missing before the
            # dimshuffle below.
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 31
    def get_loss(self, unroll_scan=False, cache_intermediate=True):
        # Build the symbolic negative log marginal likelihood of the GP and
        # return the tuple ``(loss.sum(), inps, updts)``.
        #
        # unroll_scan: when truthy, lasagne.utils.unroll_scan is used to
        #     iterate ``nlml`` instead of theano.scan.
        # cache_intermediate: when truthy, self.iK / self.L / self.beta are
        #     (re)created through ``S(...)`` if they are not already shared
        #     variables, and ``updts`` pairs them with the freshly built
        #     graphs.
        #
        # NOTE(review): this copy looks truncated in several places (bodies
        # of the ``if self.nigp`` / ``if self.Y_var`` branches, the
        # theano.scan call, the ``name=`` arguments, the closing brace of
        # ``penalty_params``), and the ``else:`` lines that should separate
        # paired branches appear to be missing.  Restore them from the
        # upstream source before relying on this code.
        msg = 'Building full GP loss'
        idims = self.D
        N = self.X.shape[0].astype(floatX)

        def nlml(Y, hyp, i, X, EyeN, nigp=None, y_var=None):
            # Step function applied once per element of ``seq``: builds the
            # kernel matrix, factorises it and returns (iK, L, beta).
            # initialise the (before compilation) kernel function
            hyps = (hyp[:idims + 1], hyp[idims + 1])
            kernel_func = partial(cov.Sum, hyps, self.covs)

            # We initialise the kernel matrices (one for each output dimension)
            K = kernel_func(X)

            # add the contribution from the input noise
            # NOTE(review): truthiness test on what is presumably a symbolic
            # variable; ``is not None`` may have been intended -- confirm.
            if nigp:
                K += tt.diag(nigp[i])
            # add the contribution from the output uncertainty (acts as weight)
            # NOTE(review): same truthiness caveat as for ``nigp`` above.
            if y_var:
                K += tt.diag(y_var[i])

            # compute chol(K)
            L = Cholesky()(K)

            # compute K^-1 and (K^-1)dot(y)
            # Both are obtained from one pair of triangular solves by
            # stacking the identity and Y as right-hand-side columns.
            rhs = tt.concatenate([EyeN, Y[:, None]], axis=1)
            sol = solve_upper_triangular(L.T, solve_lower_triangular(L, rhs))
            # all columns but the last: solution for the identity block
            iK = sol[:, :-1]
            # last column: solution for Y
            beta = sol[:, -1]

            return iK, L, beta

        # non-sequences handed to every call of ``nlml``
        nseq = [self.X, tt.eye(self.X.shape[0])]
        # NOTE(review): the bodies of the next two ``if`` statements are
        # missing in this copy.
        if self.nigp:
        if self.Y_var:

        # sequences iterated over by the scan
        seq = [self.Y.T, self.hyp, tt.arange(self.X.shape[0])]

        if unroll_scan:
            # local import shadows the boolean ``unroll_scan`` argument from
            # here on
            from lasagne.utils import unroll_scan
            [iK, L, beta] = unroll_scan(nlml, seq, [], nseq, self.E)
            updts = {}
            # NOTE(review): an ``else:`` appears to be missing here and the
            # theano.scan call below is cut off.
            (iK, L,
             beta), updts = theano.scan(fn=nlml,
                                        name="%s>logL_scan" % (

        # And finally, the negative log marginal likelihood
        # data-fit term: 0.5 * sum(Y * beta) for each scanned output
        loss = 0.5 * tt.sum(self.Y.T * beta, 1)
        # idx pairs equal row/column indices, so L[:, idx[0], idx[1]] picks
        # the diagonal of every Cholesky factor; summing its log gives the
        # log-determinant term
        idx = [theano.tensor.arange(L.shape[i]) for i in [1, 2]]
        loss += tt.sum(tt.log(L[:, idx[0], idx[1]]), 1)
        # normalisation constant
        loss += 0.5 * N * tt.log(2 * np.pi)

        if cache_intermediate:
            # we are going to save the intermediate results in the following
            # shared variables, so we can use them during prediction without
            # having to recompute them
            # note: ``N`` is rebound here from the float-cast symbolic value
            # to ``self.N``
            N, E = self.N, self.E
            # NOTE(review): the three ``name=`` arguments below are cut off
            # in this copy.
            if type(self.iK) is not tt.sharedvar.SharedVariable:
                self.iK = S(np.tile(np.eye(N, dtype=floatX), (E, 1, 1)),
                            name="%s>iK" % (
            if type(self.L) is not tt.sharedvar.SharedVariable:
                self.L = S(np.tile(np.eye(N, dtype=floatX), (E, 1, 1)),
                           name="%s>L" % (
            if type(self.beta) is not tt.sharedvar.SharedVariable:
                self.beta = S(np.ones((E, N), dtype=floatX),
                              name="%s>beta" % (
            updts = [(self.iK, iK), (self.L, L), (self.beta, beta)]
            # NOTE(review): an ``else:`` appears to be missing before the
            # next comment; as written both assignments to ``updts`` would
            # run.
            # save intermediate graphs (in case we require grads wrt params)
            self.iK, self.L, self.beta = iK, L, beta
            updts = None

        # we add some penalty to avoid having parameters that are too large
        if self.snr_penalty is not None:
            # NOTE(review): the closing ``}`` of this dict is missing in
            # this copy.
            penalty_params = {
                'log_snr': np.log(1000, dtype=floatX),
                'log_ls': np.log(100, dtype=floatX),
                'log_std': tt.log(self.X.std(0) * (N / (N - 1.0))),
                'p': 30
            loss += self.snr_penalty(tt.log(self.hyp), **penalty_params)
        # no symbolic inputs are required by the returned loss
        inps = []
        self.state_changed = True  # for saving
        return loss.sum(), inps, updts
Esempio n. 32
    def get_output_for(self, inputs, **kwargs):
        # NOTE(review): the text from here down to "Symbolic output
        # variable." reads as this method's docstring, but its enclosing
        # triple quotes and section headings appear to have been lost in
        # this copy -- as written it is not valid Python.  Restore it from
        # the upstream source.
        Compute this layer's output function given a symbolic input variable

        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with.

        layer_output : theano.TensorType
            Symbolic output variable.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        # an index of 0 would be the layer input itself, so only strictly
        # positive indices denote an extra incoming layer
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

            # NOTE(review): the right-hand side of the next assignment is
            # missing in this copy.
            input =

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # NOTE(review): every T.concatenate call below is cut off in this
        # copy (remaining list items, closing brackets and axis argument are
        # missing).
        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate([
            self.W_in_to_resetgate, self.W_in_to_updategate,

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_resetgate, self.W_hid_to_updategate,

        # Stack gate biases into a (3*num_units) vector
        b_stacked = T.concatenate(
            [self.b_resetgate, self.b_updategate, self.b_hidden_update],

        # Stack second order gating biases into a (3*num_units) vector
        a_g_stacked = T.concatenate(
            [self.a_g_resetgate, self.a_g_updategate, self.a_g_hidden_update],

        # Stack second order gating biases into a (3*num_units) vector
        b_g_in_to_hid_stacked = T.concatenate([
            self.b_g_in_to_hid_resetgate, self.b_g_in_to_hid_updategate,

        # Stack second order gating biases into a (3*num_units) vector
        b_g_hid_to_hid_stacked = T.concatenate([
            self.b_g_hid_to_hid_resetgate, self.b_g_hid_to_hid_updategate,

        if self.precompute_input:
            # precompute_input inputs*W. W_in is (n_features, 3*num_units).
            # input is then (n_batch, n_time_steps, 3*num_units).
            # NOTE(review): the call on the next line is cut off in this
            # copy (only its last argument survives).
            input =, W_in_stacked)

        # When theano.scan calls step, input_n will be (n_batch, 3*num_units).
        # We define a slicing function that extract the input to each GRU gate
        def slice_w(x, n):
            # columns [n*num_units, (n+1)*num_units) belong to gate ``n``
            s = x[:, n * self.num_units:(n + 1) * self.num_units]
            if self.num_units == 1:
                s = T.addbroadcast(s, 1)  # Theano cannot infer this by itself
            return s

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, hid_previous, *args):
            # ``*args`` absorbs the non-sequences that scan forwards; the
            # body uses the stacked variables from the enclosing scope.
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            # NOTE(review): the call on the next line is cut off in this
            # copy.
            hid_input =, W_hid_stacked)

            if self.grad_clipping:
                # NOTE(review): both grad_clip calls are cut off in this
                # copy (bounds and closing parenthesis missing).
                input_n = theano.gradient.grad_clip(input_n,
                hid_input = theano.gradient.grad_clip(hid_input,

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                # NOTE(review): the call on the next line is cut off in
                # this copy.
                input_n =, W_in_stacked)

            # Compute the second_order_term
            # (element-wise product of the input and hidden projections,
            # scaled by a_g_stacked)
            second_order_term = a_g_stacked * input_n * hid_input

            # Compute the first order input-to-hidden term
            f_o_input = b_g_in_to_hid_stacked * input_n + b_stacked

            # Compute the first order hidden-to-hidden term
            f_o_hid_input = b_g_hid_to_hid_stacked * hid_input

            # Reset and update gates
            resetgate = (slice_w(second_order_term, 0) +
                         slice_w(f_o_hid_input, 0) + slice_w(f_o_input, 0))
            updategate = (slice_w(second_order_term, 1) +
                          slice_w(f_o_hid_input, 1) + slice_w(f_o_input, 1))
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute
            # (W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1}) +
            #  r_t \odot (W_{xc}x_t * W_{hc} h_{t-1}))
            # This is different from the paper, but follows the
            # formulation used in Lasagne
            # NOTE(review): hidden_update_in and hidden_update_hid are both
            # sliced from f_o_hid_input, so slice 2 of f_o_input is never
            # used; the formula above suggests f_o_input was intended for
            # hidden_update_in -- confirm against the upstream source.
            hidden_update_in = slice_w(f_o_hid_input, 2)
            hidden_update_hid = slice_w(f_o_hid_input, 2)
            hidden_update_s_o = slice_w(second_order_term, 2)
            hidden_update = (hidden_update_in + resetgate *
                             (hidden_update_hid + hidden_update_s_o))
            if self.grad_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hidden_update(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update
            return hid

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Same as ``step`` but keeps the previous state where the mask
            # is zero.
            hid = step(input_n, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = T.switch(mask_n, hid, hid_previous)

            return hid

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            # NOTE(review): an ``else:`` appears to be missing here; as
            # written the unmasked setup below always overrides the masked
            # one.
            sequences = [input]
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): the call on the next line is cut off in this
            # copy.
            hid_init =, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        # NOTE(review): the closing ``]`` of this list is missing in this
        # copy.
        non_seqs = [
            W_hid_stacked, a_g_stacked, b_g_in_to_hid_stacked,
            b_g_hid_to_hid_stacked, b_stacked
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            # NOTE(review): both the unroll_scan and theano.scan calls are
            # cut off in this copy and the ``else:`` between them appears
            # to be missing.
            hid_out = unroll_scan(fn=step_fun,
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(fn=step_fun,

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            # NOTE(review): an ``else:`` appears to be missing here; the
            # dimshuffle and reversal below presumably apply only when the
            # full sequence is returned -- confirm.
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 33
    def predict_symbolic(self, mx, Sx, unroll_scan=False):
        # Build symbolic expressions for the GP prediction at an input with
        # mean ``mx`` and covariance ``Sx`` and return ``(M, S, V)``: the
        # values computed under the "predictive mean", "predictive
        # covariance" and "input output covariance" sections below.
        #
        # unroll_scan: when truthy, lasagne.utils.unroll_scan is used in
        #     place of theano.scan for the second-moment loop.
        #
        # NOTE(review): several statements in this copy are cut off (``inp``,
        # ``iLdotSx``, ``V``, ``R``, ``z_``, the ``second_moments``
        # signature and both scan calls) and an ``else:`` appears to be
        # missing between the two scan variants.  Restore them from the
        # upstream source before relying on this code.
        idims = self.D
        odims = self.E

        # centralize inputs
        zeta = self.X - mx

        # initialize some variables
        # column ``idims`` of hyp, squared (kept as ``sf2``)
        sf2 = self.hyp[:, idims]**2
        eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
        lscales = self.hyp[:, :idims]
        # one diagonal matrix per output, holding 1/lscales on its diagonal
        iL = eyeE / lscales.dimshuffle(0, 1, 'x')

        # predictive mean
        # NOTE(review): the next two right-hand sides are cut off in this
        # copy.
        inp =, 2, 1)
        iLdotSx =
        # TODO vectorize this
        B = (iLdotSx[:, :, None, :] *
             iL[:, None, :, :]).sum(-1) + tt.eye(idims)
        # per-output linear solves and determinants, stacked along axis 0
        t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
        c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
        l = tt.exp(-0.5 * tt.sum(inp * t, 2))
        lb = l * self.beta  # E x N dot E x N
        M = tt.sum(lb, 1) * c

        # input output covariance
        tiL = (t[:, :, None, :] * iL[:, None, :, :]).sum(-1)
        # tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
        # NOTE(review): the expression inside the list comprehension is cut
        # off in this copy (unbalanced parenthesis).
        V = tt.stack([tiL[i][i]) for i in range(odims)]).T * c

        # predictive covariance
        logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
        # row / column broadcastable views of logk, combined pairwise in
        # ``second_moments``
        logk_r = logk.dimshuffle(0, 'x', 1)
        logk_c = logk.dimshuffle(0, 1, 'x')
        Lambda = tt.square(iL)
        LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
        # NOTE(review): the next two right-hand sides are cut off in this
        # copy.
        R =, Sx).transpose(0, 1, 3, 2) + tt.eye(idims)
        z_ =, 2, 1)

        # accumulator filled one (i, j) entry per scan step
        M2 = tt.zeros((odims, odims))

        # initialize indices
        # only the upper triangle (including the diagonal) is visited
        triu_indices = np.triu_indices(odims)
        indices = [tt.as_index_variable(idx) for idx in triu_indices]

        # NOTE(review): the parameter list of ``second_moments`` is cut off
        # in this copy.
        def second_moments(i, j, M2, beta, iK, sf2, R, logk_c, logk_r, z_, Sx,
            # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 )
            Rij = R[i, j]
            n2 = logk_c[i] + logk_r[j]
            n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))

            Q = tt.exp(n2) / tt.sqrt(det(Rij))

            # Eq 2.55
            m2 = matrix_dot(beta[i], Q, beta[j])

            # diagonal entries (i == j) get an extra correction term
            m2 = theano.ifelse.ifelse(tt.eq(i, j),
                                      m2 - tt.sum(iK[i] * Q) + sf2[i], m2)
            M2 = tt.set_subtensor(M2[i, j], m2)
            return M2

        # non-sequences handed to every call of ``second_moments``
        nseq = [self.beta, self.iK, sf2, R, logk_c, logk_r, z_, Sx, self.L]
        if unroll_scan:
            # local import shadows the boolean ``unroll_scan`` argument from
            # here on
            from lasagne.utils import unroll_scan
            # NOTE(review): this call and the theano.scan call below are
            # cut off in this copy; an ``else:`` appears to be missing
            # between them.
            [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
            updts = {}
            M2_, updts = theano.scan(fn=second_moments,
                                     name="%s>M2_scan" % (
        # the last scan step holds the fully populated upper triangle
        M2 = M2_[-1]
        # mirror the strict upper triangle into the lower one
        M2 = M2 + tt.triu(M2, k=1).T
        S = M2 - tt.outer(M, M)

        return M, S, V
Esempio n. 34
    def get_output_for(self, inputs, **kwargs):
        # Compute the symbolic output of this recurrent layer.  ``inputs``
        # is a list whose first element is the layer input; the mask and
        # the hidden / cell initial states are read from it when the
        # corresponding ``*_incoming_index`` attribute is strictly positive.
        #
        # NOTE(review): several calls in this copy are cut off (the
        # T.concatenate, T.nnet.conv2d, conv1d_mc1, grad_clip, unroll_scan
        # and theano.scan calls) and the ``else:`` lines between paired
        # branches appear to be missing.  Restore them from the upstream
        # source before relying on this code.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        # NOTE(review): this call and the next one are cut off in this copy.
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,

        # Stack biases into a (4*num_units) vector
        # (then made broadcastable over every axis except the second)
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],
            axis=0).dimshuffle('x', 0, 'x', 'x')

        # border_mode = (self.num_units // 2, self.num_units // 2)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (batch_size, 4*num_units, num_rows, num_columns)
            # (n_batch, 4*num_units, height, width).
            # NOTE(review): arguments between ``input`` and ``subsample``
            # are missing in this copy.
            input = T.nnet.conv2d(input,
                                  subsample=(1, 1),
                                  filter_flip=False) + b_stacked

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        # (height, n_batch, 4*num_units, width)
        # (n_batch, num_units, width)
        input = input.dimshuffle(2, 0, 1, 3)
        seq_len, num_batch = input.shape[:2]

        # When theano.scan calls step, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extract the input to each LSTM gate
        def slice_w(x, n):
            # entries [n*num_units, (n+1)*num_units) along axis 1 belong to
            # gate ``n``
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, *args):
            # ``*args`` absorbs the non-sequences that scan forwards; the
            # body uses the stacked variables from the enclosing scope.
            if not self.precompute_input:
                # NOTE(review): arguments between ``input_n`` and
                # ``subsample`` are missing in this copy.
                input_n = T.nnet.conv2d(input_n,
                                        subsample=(1, 1),
                                        filter_flip=False) + b_stacked

            # Calculate gates pre-activations and slice
            # presumably pads one leading zero along the axis after the two
            # batch axes -- confirm against the ``pad`` helper's signature
            hid_previous = pad(hid_previous, [(1, 0)], 0, 2)

            # NOTE(review): the conv1d_mc1 call is cut off in this copy.
            gates = input_n + conv1d_mc1(hid_previous,
                                         subsample=(1, ),

            # Clip gradients
            if self.grad_clipping:
                # NOTE(review): the grad_clip call is cut off in this copy.
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            # the output-gate peephole uses the *new* cell value
            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)
            return [cell, hid]

        def step_masked(input_n, mask_n, cell_previous, hid_previous, *args):
            # Same as ``step`` but keeps the previous cell / hidden state
            # where the mask is zero.
            cell, hid = step(input_n, cell_previous, hid_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            # NOTE(review): an ``else:`` appears to be missing here; as
            # written the unmasked setup below always overrides the masked
            # one.
            sequences = input
            step_fun = step

        # column of ones used to repeat the initial states along the batch
        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # cell_init =, self.cell_init)
            cell_init = T.tensordot(ones,
                                    T.unbroadcast(self.cell_init, 0),
                                    axes=[1, 0])

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # hid_init =, self.hid_init)
            hid_init = T.tensordot(ones,
                                   T.unbroadcast(self.hid_init, 0),
                                   axes=[1, 0])

        # print(self.cell_init.ndim, self.cell_init.broadcastable)
        # print(cell_init.ndim, cell_init.broadcastable)
        # print(self.hid_init.ndim, self.hid_init.broadcastable)
        # print(hid_init.ndim, hid_init.broadcastable)

        # print(self.cell_init.get_value(True).shape)
        # print(self.hid_init.get_value(True).shape)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            # NOTE(review): this list is cut off in this copy.
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            # NOTE(review): both the unroll_scan and theano.scan calls are
            # cut off in this copy and the ``else:`` between them appears
            # to be missing.
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            outputs_info=[cell_init, hid_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            cell_out, hid_out = theano.scan(
                sequences=sequences,  # input
                outputs_info=[cell_init, hid_init],

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            # NOTE(review): an ``else:`` appears to be missing here; the
            # dimshuffle and reversal below presumably apply only when the
            # full sequence is returned -- confirm.
            # dimshuffle back to (n_batch, n_time_steps, n_features))
            # hid_out = hid_out.dimshuffle(1, 0, 2)
            # moves the scanned axis (axis 0) to position 2, undoing the
            # earlier input.dimshuffle(2, 0, 1, 3)
            hid_out = hid_out.dimshuffle(1, 2, 0, 3)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, :, ::-1]

        return hid_out
Esempio n. 35
    def predict_symbolic(self, mx, Sx=None, unroll_scan=False):
        # Build symbolic expressions for the GP prediction at input ``mx``.
        #
        # When ``Sx`` is None only the mean ``M`` is computed and returned
        # (passed through ``self.sat_func`` when that is set).  Otherwise
        # the tuple ``(M, S, V)`` is returned: the values computed under
        # the "predictive mean", "predictive covariance" and "input output
        # covariance" sections below.
        #
        # unroll_scan: when truthy, lasagne.utils.unroll_scan is used in
        #     place of theano.scan for the second-moment loop.
        #
        # NOTE(review): several statements in this copy are cut off (``inp``,
        # ``iLdotSx``, ``V``, ``R``, ``z_`` and both scan calls) and an
        # ``else:`` appears to be missing between the two scan variants.
        # Restore them from the upstream source before relying on this code.
        idims = self.D
        odims = self.E

        # initialize some variables
        # column ``idims`` of hyp, squared (kept as ``sf2``)
        sf2 = self.hyp[:, idims]**2
        eyeE = tt.tile(tt.eye(idims), (odims, 1, 1))
        lscales = self.hyp[:, :idims]
        # one diagonal matrix per output, holding 1/lscales on its diagonal
        iL = eyeE / lscales.dimshuffle(0, 1, 'x')

        if Sx is None:
            # first check if we received a vector [D] or a matrix [nxD]
            if mx.ndim == 1:
                mx = mx[None, :]
            # centralize inputs
            zeta = self.X[:, None, :] - mx[None, :, :]

            # predictive mean ( we don't need to do the rest )
            inp = (iL[:, None, :, None, :] * zeta[:, None, :, :]).sum(2)
            l = tt.exp(-0.5 * tt.sum(inp**2, -1))
            lb = l * self.beta[:, :, None]  # E x N
            M = tt.sum(lb, 1).T * sf2

            # apply saturating function to the output if available
            if self.sat_func is not None:
                # saturate the output
                M = self.sat_func(M)

            return M

        # centralize inputs
        zeta = self.X - mx

        # predictive mean
        # NOTE(review): the next two right-hand sides are cut off in this
        # copy.
        inp =, 2, 1)
        iLdotSx =
        B = (iLdotSx[:, :, None, :] *
             iL[:, None, :, :]).sum(-1) + tt.eye(idims)
        # per-output linear solves and determinants, stacked along axis 0
        t = tt.stack([solve(B[i].T, inp[i].T).T for i in range(odims)])
        c = sf2 / tt.sqrt(tt.stack([det(B[i]) for i in range(odims)]))
        l = tt.exp(-0.5 * tt.sum(inp * t, 2))
        lb = l * self.beta
        M = tt.sum(lb, 1) * c

        # input output covariance
        tiL = tt.stack([t[i].dot(iL[i]) for i in range(odims)])
        # NOTE(review): the expression inside the list comprehension is cut
        # off in this copy (unbalanced parenthesis).
        V = tt.stack([tiL[i][i]) for i in range(odims)]).T * c

        # predictive covariance
        logk = (tt.log(sf2))[:, None] - 0.5 * tt.sum(inp * inp, 2)
        # row / column broadcastable views of logk, combined pairwise in
        # ``second_moments``
        logk_r = logk.dimshuffle(0, 'x', 1)
        logk_c = logk.dimshuffle(0, 1, 'x')
        Lambda = tt.square(iL)
        LL = (Lambda.dimshuffle(0, 'x', 1, 2) + Lambda).transpose(0, 1, 3, 2)
        # NOTE(review): the next two right-hand sides are cut off in this
        # copy.
        R =, Sx).transpose(0, 1, 3, 2) + tt.eye(idims)
        z_ =, 2, 1)

        # accumulator filled one (i, j) entry per scan step
        M2 = tt.zeros((odims, odims))

        # initialize indices
        # only the upper triangle (including the diagonal) is visited
        triu_indices = np.triu_indices(odims)
        indices = [tt.as_index_variable(idx) for idx in triu_indices]

        def second_moments(i, j, M2, beta, R, logk_c, logk_r, z_, Sx, *args):
            # Scan step: computes entry (i, j) and writes it into ``M2``.
            # ``*args`` absorbs the trailing non-sequences that the body
            # does not use.
            # This comes from Deisenroth's thesis ( Eqs 2.51- 2.54 )
            Rij = R[i, j]
            n2 = logk_c[i] + logk_r[j]
            n2 += utils.maha(z_[i], -z_[j], 0.5 * solve(Rij, Sx))
            Q = tt.exp(n2) / tt.sqrt(det(Rij))

            # Eq 2.55
            m2 = matrix_dot(beta[i], Q, beta[j])

            # diagonal entries (i == j) get a small constant added
            m2 = theano.ifelse.ifelse(tt.eq(i, j), m2 + 1e-6, m2)
            M2 = tt.set_subtensor(M2[i, j], m2)
            return M2

        # non-sequences handed to every call of ``second_moments``
        nseq = [self.beta, R, logk_c, logk_r, z_, Sx, self.iK, self.L]

        if unroll_scan:
            # local import shadows the boolean ``unroll_scan`` argument from
            # here on
            from lasagne.utils import unroll_scan
            # NOTE(review): this call and the theano.scan call below are
            # cut off in this copy; an ``else:`` appears to be missing
            # between them.
            [M2_] = unroll_scan(second_moments, indices, [M2], nseq,
            updts = {}
            M2_, updts = theano.scan(fn=second_moments,
                                     name="%s>M2_scan" % (
        # the last scan step holds the fully populated upper triangle
        M2 = M2_[-1]
        # mirror the strict upper triangle into the lower one
        M2 = M2 + tt.triu(M2, k=1).T
        S = M2 - tt.outer(M, M)

        # apply saturating function to the output if available
        if self.sat_func is not None:
            # saturate the output
            M, S, U = self.sat_func(M, S)
            # compute the joint input output covariance
            # NOTE(review): the right-hand side of the next assignment is
            # missing in this copy.
            V =

        return M, S, V
Esempio n. 36
    def get_output_for(self,
        returns history of agent interaction with environment for given number of turns.
            inputs - [state init]  + [input_nonsequences] + [input_sequences]
                Each part is a list of theano expressions for layers in the order they were
                provided when creating this layer.
            recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports)
                e.g. {deterministic=True}
            [state_sequences] + [output sequences] - a list of all states and all outputs sequences
            Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...]
        n_states = len(self.state_variables)
        n_state_inits = len(self.state_init)
        n_input_nonseq = len(self.input_nonsequences)
        n_input_seq = len(self.input_sequences)
        n_outputs = len(self.tracked_outputs)

        #slice inputs

        if self.mask_input is not None:
            mask, inputs = inputs[0], inputs[1:]

        initial_states_provided, nonsequences, sequences = unpack_list(
            inputs, [n_state_inits, n_input_nonseq, n_input_seq])

        # infer batch size
        if self.batch_size is not None:
            batch_size = self.batch_size
        elif len(inputs) != 0:
            batch_size = inputs[0].shape[0]
            raise ValueError(
                "Need to set batch_size explicitly for recurrence")

        # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan
        sequences = [seq.swapaxes(1, 0) for seq in sequences]

        #here we create outputs_info for scan
        ## initial states that are given as input
        initial_states_provided = OrderedDict(
            list(zip(self.state_init, initial_states_provided)))

        def get_initial_state(state_out_layer, batch_size=batch_size):
            """Pick dedicated initial state or create zeros of appropriate shape and dtype"""
            # if we have a dedicated init, use it
            if state_out_layer in initial_states_provided:
                initial_state = initial_states_provided[state_out_layer]
            # otherwise initialize with zeros
                dtype = get_layer_dtype(state_out_layer)
                initial_state = T.zeros(
                    (batch_size, ) + tuple(state_out_layer.output_shape[1:]),

                #cast to non-broadcastable tensortype
                t_state = T.TensorType(dtype, (False, ) * initial_state.ndim)
                initial_state = t_state.convert_variable(initial_state)
                assert initial_state is not None  #if None, conversion failed. report ASAP

            return initial_state

        initial_states = list(map(get_initial_state, self.state_variables))

        #dummy values for initial outputs. They have no role in computation, but if nonsequences are present,
        # AND scan is not unrolled, the step function will not receive prev outputs as parameters, while
        # if unroll_scan, these parameters are present. we forcibly initialize outputs to prevent
        # complications during parameter parsing in step function below.
        initial_output_fillers = list(
            map(get_initial_state, self.tracked_outputs))

        outputs_info = initial_states + initial_output_fillers

        # recurrent step function
        def step(*args):

            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])
            # make dicts of prev_states and inputs
            prev_states_dict = OrderedDict(
                zip(list(self.state_variables.keys()), prev_states))

            input_layers = list(
            assert len(input_layers) == len(nonsequences + sequence_slices)

            inputs_dict = OrderedDict(
                zip(input_layers, nonsequences + sequence_slices))

            # call one step recurrence
            new_states, new_outputs = self.get_one_step(
                prev_states_dict, inputs_dict, **recurrence_flags)

            #make sure output variable is of exactly the same type as corresponding input

            get_type = lambda tensor: T.TensorType(
                sparse_grad=getattr(tensor.type, "sparse_grad", False))

            new_states = [
                for (prev_state, state) in zip(prev_states, new_states)
            assert None not in new_states, "Some state variables has different dtype/shape from init ."

            new_outputs = [
                for (prev_out, out) in zip(prev_outputs, new_outputs)
            assert None not in new_outputs, "Some of the tracked outputs has shape/dtype changing over time. Please report this."

            return new_states + new_outputs

        ###handling mask_input###

        #a step function that utilizes a mask
        def step_masked(mask_t, *args):
            #unpack arrays
            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])

            #get regular step
            new_states_and_outputs = step(*args)
            old_states_and_outputs = prev_states + prev_outputs

            #if mask_t, return new ones, else return old ones
            def apply_mask(mask_t, new_state, old_state):
                assert new_state.ndim == old_state.ndim
                ndim = new_state.ndim
                #append dims to mask
                pattern = list(range(
                    mask_t.ndim)) + ['x'] * (ndim - mask_t.ndim)

                return T.switch(mask_t.dimshuffle(pattern), new_state,

            next_states_and_outputs = [
                apply_mask(mask_t, new_state,
                           old_state) for new_state, old_state in zip(
                               new_states_and_outputs, old_states_and_outputs)

            return next_states_and_outputs

        if self.mask_input is not None:
            sequences = [mask.swapaxes(1, 0)] + sequences
            step_function = step_masked
            step_function = step

        #scan itself
        if self.unroll_scan:
            # call scan itself
            history = unroll_scan(step_function,
            #if explicitly asked to reset updates, do so
            if accumulate_updates == False:
                self.updates = OrderedUpdates()

            history, updates = theano.scan(step_function,

            if accumulate_updates in (True, 'warn'):
                self.updates += updates
            else:  #replace updates
                self.updates = updates

            #check if user received last updates
            if not self._updates_received and accumulate_updates == 'warn':
                    "You called get_output from recurrence several times without gathering the updates.\n"
                    "(A) If you wanted to get two outputs from recurrence, use NOT\n"
                    ">>>out1 = get_output(rec[layer1])\n"
                    ">>>out2 = get_output(rec[layer2])\n"
                    "but instead:\n"
                    ">>>out1,out2 = get_output((rec[layer1],rec[layer2])) #or rec[layer1,layer2].\n"
                    "(B) If you want to run recurrence several times and accumulate updates from all runs,"
                    "use get_output(...,accumulate_updates=True) to silence the warning.\n"
                    "(C) If you want to get rid of old updates, use get_output(...,accumulate_updates=False)\n"

            if len(self.updates) != 0:
                self._updates_received = False
                    "Recurrent loop without unroll_scan got nonempty random state updates list. That happened"
                    " because there is some source of randomness (e.g. dropout) inside recurrent step graph."
                    " To compile such graph, one must either call .get_automatic_updates() right after .get_output"
                    " and pass these updates to a function when compiling theano.function.",

        # reordering from [time,batch,...] to [batch,time,...]
        history = [(var.swapaxes(1, 0) if var.ndim > 1 else var)
                   for var in check_list(history)]

        assert len(history) == n_states + n_outputs

        state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs])

        # handle delayed_states
        # selectively shift state sequences by 1 tick into the past, padding with their initialisations
        for i in range(len(state_seqs)):
            if list(self.state_variables.keys())[i] in self.delayed_states:
                state_seq = state_seqs[i]
                state_init = initial_states[i]
                state_seq = T.concatenate(
                    [insert_dim(state_init, 1), state_seq[:, :-1]], axis=1)
                state_seqs[i] = state_seq

        #keys corresponding to output sequences. Note that we do not use self.keys() to correctly
        # handle cases where some variable is present in both state_variables and tracked_outputs
        output_keys = list(self.state_variables.keys()) + list(
        output_values = state_seqs + output_seqs
        assert len(output_keys) == len(output_values)
        return OrderedDict(zip(output_keys, output_values))
Esempio n. 37
    def get_output_for(self, inputs, **kwargs):
        # NOTE(review): the docstring delimiters (and apparently its section
        # headers) were lost in this copy, so the prose lines below are bare
        # text, not a docstring. Several statements further down are also
        # visibly truncated; each one is flagged where it occurs. Restore them
        # from the original file before running this method.
        #
        # What the visible code does: runs a GRU-style recurrence over
        # `inputs[0]` and, next to the hidden state, carries a second state
        # `cov` that accumulates the per-sample outer product of the hidden
        # state at every step. Returns the pair `(hid_out, cov_out)`.
        #
        # The mask, the initial hidden state and the initial `cov` state are
        # read from `inputs` only when the matching `*_incoming_index`
        # attribute is > 0; otherwise they start out as None.
        Compute this layer's output function given a symbolic input variable

        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with.

        layer_output : theano.TensorType
            Symbolic output variable.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cov_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cov_init_incoming_index > 0:
            cov_init = inputs[self.cov_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_resetgate, self.W_in_to_updategate,
             self.W_in_to_hidden_update], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_resetgate, self.W_hid_to_updategate,
             self.W_hid_to_hidden_update], axis=1)

        # Stack gate biases into a (3*num_units) vector
        b_stacked = T.concatenate(
            [self.b_resetgate, self.b_updategate,
             self.b_hidden_update], axis=0)

        if self.precompute_input:
            # Precompute inputs*W for all time steps at once.
            # W_in is (n_features, 3*num_units); after the dimshuffle above,
            # input is then (n_time_steps, n_batch, 3*num_units).
            # NOTE(review): the right-hand side below is truncated; presumably
            # it was `T.dot(input, W_in_stacked) + b_stacked` - confirm.
            input =, W_in_stacked) + b_stacked

        # We define a slicing function that extracts the columns belonging to
        # the n-th GRU gate from a stacked (rows, 3*num_units) matrix
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th time slice of the input; hid_previous and
        # cov_previous are the two recurrent states from the previous step
        def step(input_n, hid_previous, cov_previous, *args):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            # NOTE(review): truncated line; presumably
            # `T.dot(hid_previous, W_hid_stacked)` - confirm.
            hid_input =, W_hid_stacked)

            if self.grad_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n, -self.grad_clipping, self.grad_clipping)
                hid_input = theano.gradient.grad_clip(
                    hid_input, -self.grad_clipping, self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                # NOTE(review): truncated line; presumably
                # `T.dot(input_n, W_in_stacked) + b_stacked` - confirm.
                input_n =, W_in_stacked) + b_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate*hidden_update_hid
            if self.grad_clipping:
                hidden_update = theano.gradient.grad_clip(
                    hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hid(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate)*hid_previous + updategate*hidden_update

            # Add this step's outer product of hid with itself, per sample:
            # the two dimshuffles give (batch, 1, units) and (batch, units, 1),
            # which broadcast to (batch, units, units).
            cov = cov_previous + hid.dimshuffle((0, 'x', 1)) * hid.dimshuffle((0, 1, 'x'))

            return hid, cov

        # Masked variant of `step`: takes the per-step mask slice as an extra
        # sequence argument and keeps the previous state where the mask is 0.
        def step_masked(input_n, mask_n, hid_previous, cov_previous, *args):
            hid, cov = step(input_n, hid_previous, cov_previous, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = T.switch(mask_n, hid, hid_previous)

            # NOTE(review): the fallback here is `hid_previous`, not
            # `cov_previous`, which looks like a copy-paste slip - confirm.
            # Also, mask_n is a 2-D slice (batch, 1) while cov is 3-D, so the
            # mask may need an extra broadcast axis - TODO confirm.
            cov = T.switch(mask_n, cov, hid_previous)

            return hid, cov

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
            # NOTE(review): an `else:` appears to be missing here. As written,
            # the two assignments below sit inside the `if` and override the
            # masked setup above; presumably they are the unmasked branch.
            sequences = [input]
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): truncated line; presumably
            # `T.dot(T.ones((num_batch, 1)), self.hid_init)` - confirm.
            hid_init =, 1)), self.hid_init)

        # NOTE(review): no default for `cov_init` is visible here; it stays
        # None unless it came in through `inputs`. Check whether lines
        # initialising it were lost.

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            # NOTE(review): this call is truncated (only `outputs_info`
            # survives), and the `else:` that should introduce the
            # `theano.scan` branch below is missing.
            hid_out, cov_out = unroll_scan(
                outputs_info=[hid_init, cov_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            # NOTE(review): this call is truncated as well.
            hid_out, cov_out = theano.scan(
                outputs_info=[hid_init, cov_init],

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            cov_out = cov_out[-1]
            # NOTE(review): an `else:` appears to be missing before the
            # dimshuffles below; they only make sense for full sequences.
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, 2)
            cov_out = cov_out.dimshuffle(1, 0, 2, 3)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]
                cov_out = cov_out[:, ::-1]

        return hid_out, cov_out
    def get_output_for(self, inputs, **kwargs):
        # NOTE(review): the docstring delimiters (and apparently its section
        # headers) were lost in this copy, so the prose lines below are bare
        # text, not a docstring. Many statements further down are visibly
        # truncated; each one is flagged where it occurs. Restore them from
        # the original file before running this method.
        #
        # What the visible code does: runs an LSTM-style recurrence (input,
        # forget and output gates plus a cell state, with optional peephole
        # connections) over `inputs[0]` and returns only `hid_out`; the cell
        # sequence `cell_out` is computed but not returned.
        #
        # The mask and the initial hidden/cell states are read from `inputs`
        # only when the matching `*_incoming_index` attribute is > 0.
        Compute this layer's output function given a symbolic input variable
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When the cell state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When both the cell state and the hidden state are
            being pre-filled `inputs[-2]` is the hidden state, while
            `inputs[-1]` is the cell state.
        layer_output : theano.TensorType
            Symbolic output variable.
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.cell_init_incoming_index > 0:
            cell_init = inputs[self.cell_init_incoming_index]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

            # NOTE(review): the assignment below has lost its right-hand side
            # (and possibly an enclosing condition); what it computed cannot
            # be told from this copy.
            input =

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        # NOTE(review): the list is cut off after three of the presumably
        # four gate matrices, and the closing `], axis=...)` is missing.
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,

        # Same for hidden weight matrices
        # NOTE(review): truncated in the same way as W_in_stacked.
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,

        # Stack biases into a (4*num_units) vector
        # NOTE(review): the `axis=...)` tail of this call is missing.
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute_input the inputs dot weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            # NOTE(review): truncated line; presumably
            # `T.dot(input, W_in_stacked) + b_stacked` - confirm.
            input =, W_in_stacked) + b_stacked

        # We define a slicing function that extracts the columns belonging to
        # the n-th LSTM gate from a stacked (rows, 4*num_units) matrix
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th time slice of the input; note the state order
        # is (cell, hid), matching `outputs_info` further down.
        def step(input_n, cell_previous, hid_previous, *args):
            if not self.precompute_input:
                # NOTE(review): truncated line; presumably
                # `T.dot(input_n, W_in_stacked) + b_stacked` - confirm.
                input_n =, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            # NOTE(review): truncated line; presumably
            # `input_n + T.dot(hid_previous, W_hid_stacked)` - confirm.
            gates = input_n +, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                # NOTE(review): the upper clipping bound and the closing
                # parenthesis of this call are missing.
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            # The output-gate peephole uses the *new* cell value, unlike the
            # input/forget peepholes above which use the previous one.
            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)
            return [cell, hid]

        # Masked variant of `step`: receives the mask slice and the step index
        # as extra sequence arguments.
        # NOTE(review): the signature is cut off; the body uses `*args`, so
        # presumably it ended with `*args):` - confirm.
        def step_masked(input_n, mask_n, iter_idx, cell_previous, hid_previous,
            cell, hid = step(input_n, cell_previous, hid_previous, *args)
            # `iter_idx` is only referenced by the disabled "sleepy" masking
            # below; the live code uses the plain mask.
            # if self.sleepy:
            #     sleep_mask = T.eq(T.mod(iter_idx, (T.arange(cell.shape[-1])+1)),0)
            #     final_mask_n = T.switch(sleep_mask, mask_n, 0)
            # else:
            final_mask_n = mask_n

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(final_mask_n, cell, cell_previous)
            hid = T.switch(final_mask_n, hid, hid_previous)

            return [cell, hid]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            # Step counter fed to step_masked as `iter_idx`
            iter_range = T.arange(mask.shape[0]).astype('int32')
            sequences = [input, mask, iter_range]
            step_fun = step_masked
            # NOTE(review): an `else:` appears to be missing here. As written,
            # the two assignments below sit inside the `if` and override the
            # masked setup above; presumably they are the unmasked branch.
            sequences = input
            step_fun = step

        # Column of ones used to tile the learned initial states over the batch
        ones = T.ones((num_batch, 1))
        if not isinstance(self.cell_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): truncated line; presumably
            # `T.dot(ones, self.cell_init)` - confirm.
            cell_init =, self.cell_init)

        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            # NOTE(review): truncated line; presumably
            # `T.dot(ones, self.hid_init)` - confirm.
            hid_init =, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            # NOTE(review): the list is cut off; `step` also reads
            # `self.W_cell_to_outgate`, so presumably that was the last entry.
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            # NOTE(review): this call is truncated, and the `else:` that
            # should introduce the `theano.scan` branch below is missing.
            cell_out, hid_out = unroll_scan(fn=step_fun,
                                            outputs_info=[cell_init, hid_init],
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            # NOTE(review): this call is truncated as well.
            cell_out, hid_out = theano.scan(
                outputs_info=[cell_init, hid_init],

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
            # NOTE(review): an `else:` appears to be missing before the
            # dimshuffle below; it only makes sense for full sequences.
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
Esempio n. 39
    def get_output_for(self, inputs, recurrence_flags={}, **kwargs):
        # NOTE(review): the docstring quotes and section headers were lost in
        # this copy, so the description below is bare text, not a docstring.
        # Many statements further down are visibly truncated; each one is
        # flagged where it occurs. Restore them from the original file before
        # running this method.
        #
        # What the visible code does: applies the one-step recurrence
        # (`self.get_one_step`) over time with either `unroll_scan` or
        # `theano.scan`, optionally gated by a mask, and returns an
        # OrderedDict of output keys -> sequences, states first.
        #
        # `recurrence_flags={}` is a mutable default argument; in the visible
        # code it is only unpacked with `**` and never mutated.
        returns history of agent interaction with environment for given number of turns.
            inputs - [state init]  + [input_nonsequences] + [input_sequences]
                Each part is a list of theano expressions for layers in the order they were
                provided when creating this layer.
            recurrence_flags - a set of flags to be passed to the one step agent (anything that lasagne supports)
                e.g. {deterministic=True}
            [state_sequences] + [output sequences] - a list of all states and all outputs sequences
            Shape of each such sequence is [batch, tick, shape_of_one_state_or_output...]
        n_states = len(self.state_variables)
        n_state_inits = len(self.state_init)
        n_input_nonseq = len(self.input_nonsequences)
        n_input_seq = len(self.input_sequences)
        n_outputs = len(self.tracked_outputs)

        #slice inputs

        # when a mask layer is attached, its value comes first in `inputs`
        if self.mask_input is not None:
            mask, inputs = inputs[0], inputs[1:]

        # `unpack_list` is defined elsewhere; presumably it splits a flat list
        # into consecutive chunks of the given lengths - verify.
        initial_states_provided, nonsequences, sequences = unpack_list(
            inputs, [n_state_inits, n_input_nonseq, n_input_seq])

        # infer batch size
        if self.batch_size is not None:
            batch_size = self.batch_size
        elif len(inputs) != 0:
            batch_size = inputs[0].shape[0]
            # NOTE(review): an `else:` appears to be missing here. As written
            # the raise below sits inside the `elif` branch; presumably it is
            # the fallback for "no batch_size and no inputs".
            raise ValueError(
                "Need to set batch_size explicitly for recurrence")

        # reshape sequences from [batch, time, ...] to [time,batch,...] to fit scan
        sequences = [seq.swapaxes(1, 0) for seq in sequences]

        #here we create outputs_info for scan
        ## initial states that are given as input
        # (rebinds the name: from a list of values to a mapping keyed by the
        # entries of self.state_init)
        initial_states_provided = OrderedDict(
            list(zip(self.state_init, initial_states_provided)))

        def get_initial_state(state_out_layer, batch_size=batch_size):
            """Pick dedicated initial state or create zeros of appropriate shape and dtype"""
            # if we have a dedicated init, use it
            if state_out_layer in initial_states_provided:
                initial_state = initial_states_provided[state_out_layer]
            # otherwise initialize with zeros
            # NOTE(review): the `else:` for the branch below is missing.
                #constant batch_size==1 causes T.zeros to get broadcastable, which results in an error
                #TODO(jheuristic) investigate a better way to do so.
                if (type(batch_size) is int) and (batch_size == 1):
                    batch_size = theano.shared(batch_size)

                # NOTE(review): this call is truncated after the shape
                # argument; the docstring mentions a dtype, so presumably a
                # dtype argument followed - confirm.
                initial_state = T.zeros(
                    (batch_size, ) + tuple(state_out_layer.output_shape[1:]),

            return initial_state

        initial_states = list(map(get_initial_state, self.state_variables))

        #dummy values for initial outputs. They have no role in computation, but if nonsequences are present,
        # AND scan is not unrolled, the step function will not receive prev outputs as parameters, while
        # if unroll_scan, these parameters are present. we forcibly initialize outputs to prevent
        # complications during parameter parsing in step function below.
        initial_output_fillers = list(
            map(get_initial_state, self.tracked_outputs))

        outputs_info = initial_states + initial_output_fillers

        # recurrent step function
        # positional args arrive as:
        # sequence slices, previous states, previous outputs, non-sequences
        def step(*args):

            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])

            # make dicts of prev_states and inputs
            prev_states_dict = OrderedDict(
                zip(list(self.state_variables.keys()), prev_states))

            # NOTE(review): the expression below is truncated; the assert and
            # the zip that follow imply it lists the non-sequence input layers
            # followed by the sequence input layers - confirm.
            input_layers = list(
            assert len(input_layers) == len(nonsequences + sequence_slices)

            inputs_dict = OrderedDict(
                zip(input_layers, nonsequences + sequence_slices))

            # call one step recurrence
            new_states, new_outputs = self.get_one_step(
                prev_states_dict, inputs_dict, **recurrence_flags)
            return new_states + new_outputs

        ###handling mask_input###

        #a step function that utilizes a mask
        # (mask_t is the mask slice for the current tick; the remaining args
        # are exactly what `step` expects)
        def step_masked(mask_t, *args):
            #unpack arrays
            sequence_slices, prev_states, prev_outputs, nonsequences = \
                unpack_list(args, [n_input_seq, n_states, n_outputs, n_input_nonseq])

            #get regular step
            new_states_and_outputs = step(*args)
            old_states_and_outputs = prev_states + prev_outputs

            #if mask_t, return new ones, else return old ones
            def apply_mask(mask_t, new_state, old_state):
                assert new_state.ndim == old_state.ndim
                ndim = new_state.ndim
                #append broadcastable dims to mask so it matches the state rank
                pattern = list(range(
                    mask_t.ndim)) + ['x'] * (ndim - mask_t.ndim)

                # NOTE(review): this call is truncated; per the comment above
                # the missing third argument is presumably `old_state)`.
                return T.switch(mask_t.dimshuffle(pattern), new_state,

            # NOTE(review): the closing `]` of this comprehension is missing.
            next_states_and_outputs = [
                apply_mask(mask_t, new_state,
                           old_state) for new_state, old_state in zip(
                               new_states_and_outputs, old_states_and_outputs)

            return next_states_and_outputs

        if self.mask_input is not None:
            # the mask goes first among the sequences, so its slice arrives as
            # the first positional argument of step_masked
            sequences = [mask.swapaxes(1, 0)] + sequences
            step_function = step_masked
            # NOTE(review): an `else:` appears to be missing here. As written
            # the assignment below overrides the masked step function.
            step_function = step

        #scan itself
        if self.unroll_scan:
            # call scan itself
            # NOTE(review): this call is truncated after its first argument,
            # and the `else:` that should introduce the `theano.scan` branch
            # below is missing.
            history = unroll_scan(step_function,
            self.updates = OrderedDict()
            # NOTE(review): this call is truncated as well.
            history, updates = theano.scan(step_function,
            self.updates = updates
            if len(updates) != 0:
                    # NOTE(review): the call that wrapped this message is
                    # missing; only the string fragments survive. The text
                    # says "no_defalt_updates", which looks like a typo for
                    # theano.function's `no_default_updates` - left unchanged
                    # because it is runtime text.
                    "Warning: recurrent loop without unroll_scan got nonempty random state updates list. That happened"
                    " because there is some source of randomness (e.g. dropout) inside recurrent step graph."
                    " To compile such graph, one must either call .get_automatic_updates() right after .get_output"
                    " and pass these updates to a function, or use no_defalt_updates=True when compiling theano.function."

        # reordering from [time,batch,...] to [batch,time,...]
        # (`check_list` is defined elsewhere; presumably it wraps a single
        # variable into a list - verify)
        history = [(var.swapaxes(1, 0) if var.ndim > 1 else var)
                   for var in check_list(history)]

        assert len(history) == n_states + n_outputs

        state_seqs, output_seqs = unpack_list(history, [n_states, n_outputs])

        # handle delayed_states
        # selectively shift state sequences by 1 tick into the past, padding with their initialisations
        for i in range(len(state_seqs)):
            if list(self.state_variables.keys())[i] in self.delayed_states:
                state_seq = state_seqs[i]
                state_init = initial_states[i]
                # prepend the initial state along the time axis and drop the
                # last tick (`insert_dim` is defined elsewhere; presumably it
                # adds a length-1 axis at position 1 - verify)
                state_seq = T.concatenate(
                    [insert_dim(state_init, 1), state_seq[:, :-1]], axis=1)
                state_seqs[i] = state_seq

        #keys corresponding to output sequences. Note that we do not use self.keys() to correctly
        # handle cases where some variable is present in both state_variables and tracked_outputs
        # NOTE(review): the second `list(` operand is cut off; given
        # `output_values` below it presumably lists the tracked outputs.
        output_keys = list(self.state_variables.keys()) + list(
        output_values = state_seqs + output_seqs
        assert len(output_keys) == len(output_values)
        return OrderedDict(zip(output_keys, output_values))