Example #1
    def get_output_for(self, inputs, **kwargs):
        input = inputs[0]
        hid_init = None
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        # precompute inputs before scanning
        trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
        input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
        input = helper.get_output(self.input_to_hidden, input, **kwargs)

        # Reshape back to (seq_len, batch_size, trailing dimensions...)
        trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
        input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # pass params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        non_seqs += helper.get_all_params(self.post_concat)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous,
                                        **kwargs)
            hid_pre = T.concatenate([hid_pre, input_n], axis=1)
            hid_pre = helper.get_output(self.post_concat, hid_pre, **kwargs)
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(hid_pre,
                                                    -self.grad_clipping,
                                                    self.grad_clipping)
            return hid_pre

        sequences = input
        step_fun = step

        if not isinstance(self.hid_init, Layer):
            # repeats self.hid_init num_batch times in first dimension
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        hid_out = theano.scan(fn=step_fun,
                              sequences=sequences,
                              go_backwards=False,
                              outputs_info=[hid_init],
                              non_sequences=non_seqs,
                              truncate_gradient=-1,
                              strict=True)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        return hid_out
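Example #1 relies on the theano.scan pattern that recurs throughout these listings: because scan is called with strict=True, every shared variable the step function touches (here, the parameters of hidden_to_hidden and post_concat) must be passed explicitly through non_sequences. A minimal, self-contained sketch of that pattern, using hypothetical names (W, x_seq, h0) that are not taken from the example:

    import numpy as np
    import theano
    import theano.tensor as T

    W = theano.shared(np.eye(3, dtype=theano.config.floatX), name='W')
    x_seq = T.tensor3('x_seq')   # (n_time_steps, n_batch, n_features)
    h0 = T.matrix('h0')          # (n_batch, n_features)

    def step(x_n, h_prev, W_arg):
        # Arguments arrive as: sequences, then previous outputs, then non_sequences.
        return T.tanh(T.dot(h_prev, W_arg) + x_n)

    h_out, _ = theano.scan(fn=step,
                           sequences=x_seq,
                           outputs_info=[h0],
                           non_sequences=[W],
                           strict=True)
    f = theano.function([x_seq, h0], h_out)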
Example #2
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(CustomMIRecurrentLayer, self).get_params(**tags)
     # Combine with all parameters from the child layers
     params += helper.get_all_params(self.input_to_hidden, **tags)
     params += helper.get_all_params(self.hidden_to_hidden, **tags)
     return params
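This override is needed because the parameters of the child networks (input_to_hidden, hidden_to_hidden) are registered on those layers, not on the wrapping layer, so the base get_params would miss them. A rough, hypothetical sketch of the same pattern for a layer that wraps a small sub-network (class and attribute names below are illustrative, not taken from the examples):

    import lasagne
    from lasagne import layers
    from lasagne.layers import helper

    class WrapperLayer(layers.Layer):
        """Hypothetical layer that owns a small sub-network."""
        def __init__(self, incoming, num_units, **kwargs):
            super(WrapperLayer, self).__init__(incoming, **kwargs)
            # The sub-network's parameters live on its own layers.
            self.subnet = layers.DenseLayer(
                layers.InputLayer((None,) + self.input_shape[1:]), num_units)

        def get_output_shape_for(self, input_shape):
            return (input_shape[0], self.subnet.output_shape[-1])

        def get_output_for(self, input, **kwargs):
            return helper.get_output(self.subnet, input, **kwargs)

        def get_params(self, **tags):
            # Own params plus everything owned by the sub-network, so that
            # get_all_params and the update rules see all of them.
            params = super(WrapperLayer, self).get_params(**tags)
            params += helper.get_all_params(self.subnet, **tags)
            return params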
Example #3
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(CustomRecurrentLayer, self).get_params(**tags)
     # Combine with all parameters from the child layers
     params += helper.get_all_params(self.input_to_hidden, **tags)
     params += helper.get_all_params(self.hidden_to_hidden, **tags)
     return params
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(StochsticRecurrentLayer, self).get_params(**tags)
     if self.logvar_p_mlp is not None:
         params += helper.get_all_params(self.logvar_p_mlp, **tags)
         params += helper.get_all_params(self.q_mu_mlp, **tags)
         params += helper.get_all_params(self.q_logvar_mlp, **tags)
         params += helper.get_all_params(self.mu_p_mlp, **tags)
     return params
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(RecurrentUnitaryLayer, self).get_params(**tags)
     # Combine with all parameters from the child layers
     params += helper.get_all_params(self.input_to_hidden, **tags)
     params += helper.get_all_params(self.hidden_to_hidden, **tags)
     if isinstance(self.nonlinearity, Layer):
         params += self.nonlinearity.get_params()
     return params
Example #6
    def get_params(self):
        """
        Get all parameters of this layer.

        :returns:
            - params : list of theano.shared
                List of all parameters
        """
        params = helper.get_all_params(self.input_to_hidden) + helper.get_all_params(self.hidden_to_hidden)

        if self.learn_init:
            return params + self.get_init_params()
        else:
            return params
    def get_params(self):
        '''
        Get all parameters of this layer.

        :returns:
            - params : list of theano.shared
                List of all parameters
        '''
        params = (helper.get_all_params(self.input_to_hidden) +
                  helper.get_all_params(self.hidden_to_hidden))

        if self.learn_init:
            return params + self.get_init_params()
        else:
            return params
Example #8
 def _compile_train_func(self):
     logger.info("Compiling train cost function...")
     network_input = self.net.symbolic_input()
     network_output = self.net.symbolic_output(deterministic=False)
     target_var = ndim_tensor(name='target', ndim=network_output.ndim)
     mask_var = ndim_tensor(name='mask', ndim=network_output.ndim)
     loss = self.loss_func(network_output, target_var, mask_var)
     all_params = get_all_params(self.net.layers[-1], trainable=True)
     updates = self.updates_func(
         loss, all_params, learning_rate=self._learning_rate)
     train_func = theano.function(
         inputs=[network_input, target_var, mask_var],
         outputs=loss,
         updates=updates,
         on_unused_input='warn',
         allow_input_downcast=True)
     logger.info("Done compiling cost function.")
     return train_func
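_compile_train_func follows the usual Theano/Lasagne compilation recipe: build the symbolic graph, collect the trainable parameters, derive the updates, and compile one function that performs a training step. A stripped-down, hedged sketch of that recipe, with plain Lasagne calls standing in for the example's net, loss_func, updates_func and ndim_tensor helpers:

    import numpy as np
    import theano
    import theano.tensor as T
    import lasagne

    # Toy network; the architecture is illustrative only.
    l_in = lasagne.layers.InputLayer((None, 10))
    l_out = lasagne.layers.DenseLayer(l_in, num_units=1, nonlinearity=None)

    target = T.matrix('target')
    prediction = lasagne.layers.get_output(l_out, deterministic=False)
    loss = lasagne.objectives.squared_error(prediction, target).mean()

    params = lasagne.layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

    train_fn = theano.function([l_in.input_var, target], loss,
                               updates=updates, allow_input_downcast=True)

    # One training step on random data:
    X = np.random.randn(8, 10).astype(theano.config.floatX)
    y = np.random.randn(8, 1).astype(theano.config.floatX)
    print(train_fn(X, y))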
Example #9
    def __init__(self, *args, **kwargs):
        super(TrainerMixin, self).__init__(*args, **kwargs)
        input_var = tensor.tensor4('inputs')
        target_var = tensor.ivector('targets')

        loss, _ = loss_acc(self.model,
                           input_var,
                           target_var,
                           deterministic=False)
        layers = get_all_layers(self.model)
        decay = regularize_layer_params(layers, l2) * 0.0001
        loss = loss + decay

        params = get_all_params(self.model, trainable=True)
        updates = momentum(loss,
                           params,
                           momentum=0.9,
                           learning_rate=self.learning_rate)
        self.set_training(input_var, target_var, loss, updates)
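The decay term here is standard L2 weight decay built from Lasagne's regularization helpers: regularize_layer_params sums the chosen penalty over the parameters of the given layers. A hedged sketch of that piece in isolation, with a stand-in model rather than self.model:

    import lasagne
    from lasagne.regularization import regularize_layer_params, l2

    l_in = lasagne.layers.InputLayer((None, 32))
    l_hid = lasagne.layers.DenseLayer(l_in, num_units=64)
    l_out = lasagne.layers.DenseLayer(
        l_hid, num_units=10, nonlinearity=lasagne.nonlinearities.softmax)

    # Sum of squared weights over every layer, scaled by the decay coefficient.
    all_layers = lasagne.layers.get_all_layers(l_out)
    decay = regularize_layer_params(all_layers, l2) * 1e-4

By default the penalty only includes parameters tagged regularizable, so biases are left out of the decay term.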
Example #10
    def get_output_for(self, inputs, mask=None, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        if self.precompute_input:
            # Because the input is given for all time steps, we can precompute
            # the inputs to hidden before scanning. First we need to reshape
            # from (seq_len, batch_size, trailing dimensions...) to
            # (seq_len*batch_size, trailing dimensions...)
            # This strange use of a generator in a tuple was because
            # input.shape[2:] was raising a Theano error
            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
            input = T.reshape(input, (seq_len*num_batch,) + trailing_dims)
            input = helper.get_output(
                self.input_to_hidden, input, **kwargs)

            # Reshape back to (seq_len, batch_size, trailing dimensions...)
            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        # When we are not precomputing the input, we also need to pass the
        # input-to-hidden parameters to step
        if not self.precompute_input:
            non_seqs += helper.get_all_params(self.input_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)

            # If the dot product is precomputed then add it, otherwise
            # calculate the input_to_hidden values and add them
            if self.precompute_input:
                hid_pre += input_n
            else:
                hid_pre += helper.get_output(
                    self.input_to_hidden, input_n, **kwargs)

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
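For context, a layer with this get_output_for is typically wired up in the style of lasagne.layers.CustomRecurrentLayer: the input-to-hidden and hidden-to-hidden transforms are given as small sub-networks built on their own InputLayers, and an optional mask layer handles variable-length sequences. A hedged usage sketch (shapes and hyperparameters are illustrative):

    import lasagne

    n_batch, n_steps, n_in, n_hid = 16, 20, 8, 32

    l_inp = lasagne.layers.InputLayer((n_batch, n_steps, n_in))
    l_mask = lasagne.layers.InputLayer((n_batch, n_steps))

    l_in_to_hid = lasagne.layers.DenseLayer(
        lasagne.layers.InputLayer((None, n_in)), n_hid, nonlinearity=None)
    l_hid_to_hid = lasagne.layers.DenseLayer(
        lasagne.layers.InputLayer((None, n_hid)), n_hid, nonlinearity=None)

    l_rec = lasagne.layers.CustomRecurrentLayer(
        l_inp, l_in_to_hid, l_hid_to_hid,
        mask_input=l_mask, grad_clipping=5.0)

    # Output has shape (n_batch, n_steps, n_hid); time steps with mask == 0
    # simply carry the previous hidden state forward, as in step_masked above.
    out = lasagne.layers.get_output(l_rec)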
Example #11
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(ConvTimeStep1DLayer, self).get_params(**tags)
     # Combine with all parameters from the child layers
     params += helper.get_all_params(self.conv1d, **tags)
     return params
Example #12
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        if self.precompute_input:
            # Because the input is given for all time steps, we can precompute
            # the inputs to hidden before scanning. First we need to reshape
            # from (seq_len, batch_size, trailing dimensions...) to
            # (seq_len*batch_size, trailing dimensions...)
            # This strange use of a generator in a tuple was because
            # input.shape[2:] was raising a Theano error
            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
            input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
            input = helper.get_output(self.input_to_hidden, input, **kwargs)

            # Reshape back to (seq_len, batch_size, trailing dimensions...)
            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        non_seqs += self._get_mi_params()
        # When we are not precomputing the input, we also need to pass the
        # input-to-hidden parameters to step
        if not self.precompute_input:
            non_seqs += helper.get_all_params(self.input_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_to_hid = helper.get_output(self.hidden_to_hidden, hid_previous,
                                           **kwargs)

            # Compute the input-to-hidden activation
            if self.precompute_input:
                # if the input is precomputed
                in_to_hid = input_n
            else:
                # compute the input
                in_to_hid = helper.get_output(self.input_to_hidden, input_n,
                                              **kwargs)

            # Compute the second order term
            if self.a_g is not None:
                second_order_term = (self.a_g * in_to_hid * hid_to_hid)
                # second_order_term = in_to_hid * hid_to_hid
            else:
                second_order_term = 0

            # Compute the first order hidden-to-hidden term
            if self.b_g_hid_to_hid is not None:
                f_o_hid_to_hid = self.b_g_hid_to_hid * hid_to_hid

            else:
                f_o_hid_to_hid = 0

            # Compute first order input to hidden term
            if self.b_g_in_to_hid is not None:
                f_o_in_to_hid = self.b_g_in_to_hid * in_to_hid

            else:
                # if all else is None, it will output zeros of the right size
                f_o_in_to_hid = T.zeros_like(in_to_hid)

            hid_pre = second_order_term + f_o_in_to_hid + f_o_hid_to_hid

            if self.b is not None:
                hid_pre = hid_pre + self.b

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=step_fun,
                                  sequences=sequences,
                                  outputs_info=[hid_init],
                                  go_backwards=self.backwards,
                                  non_sequences=non_seqs,
                                  n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(fn=step_fun,
                                  sequences=sequences,
                                  go_backwards=self.backwards,
                                  outputs_info=[hid_init],
                                  non_sequences=non_seqs,
                                  truncate_gradient=self.gradient_steps,
                                  strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[:, ::-1]

        return hid_out
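The hid_init branch above relies on a small trick: multiplying a column of ones by the stored (1, num_units) initial state tiles it across the batch, which the comment claims is faster than T.repeat (for the common 2-D case the dimshuffle is a no-op). A short sketch confirming that the two forms agree, with illustrative names:

    import numpy as np
    import theano
    import theano.tensor as T

    num_batch, num_units = 4, 3
    hid_init = theano.shared(
        np.arange(num_units, dtype=theano.config.floatX).reshape(1, num_units))

    tiled = T.dot(T.ones((num_batch, 1)), hid_init)    # (num_batch, num_units)
    repeated = T.repeat(hid_init, num_batch, axis=0)   # same values

    print(np.allclose(tiled.eval(), repeated.eval()))  # True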
Example #13
    y = data['target']
    P = compute_joint_probabilities(X,
                                    batch_size=batch_size,
                                    d=2,
                                    perplexity=30,
                                    tol=1e-5,
                                    verbose=0)

    x = Input((None, X.shape[1]))
    z = Dense(x, num_units=256, nonlinearity=rectify)
    z = Dense(z, num_units=2, nonlinearity=linear)
    z_pred = get_output(z)
    P_real = T.matrix()
    loss = tsne_loss(P_real, z_pred)

    params = get_all_params(z, trainable=True)
    lr = theano.shared(np.array(0.01, dtype=floatX))
    updates = updates.adam(loss, params, learning_rate=lr)
    train_fn = theano.function([x.input_var, P_real], loss, updates=updates)
    encode = theano.function([x.input_var], z_pred)

    X_train = X
    Y_train = P
    for epoch in range(1000):
        total_loss = 0
        nb = 0
        for xt in iterate_minibatches(X_train,
                                      batch_size=batch_size,
                                      shuffle=False):
            yt = Y_train[nb]
            total_loss += train_fn(xt, yt)
            nb += 1
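The loop above depends on an iterate_minibatches helper that is not shown in this snippet. A hypothetical version, assuming it simply yields successive row blocks of X_train (this is a guess at its behaviour, not the original implementation):

    def iterate_minibatches(inputs, batch_size, shuffle=False):
        # Hypothetical helper: yield consecutive mini-batches of rows.
        indices = np.arange(len(inputs))
        if shuffle:
            np.random.shuffle(indices)
        for start in range(0, len(inputs) - batch_size + 1, batch_size):
            yield inputs[indices[start:start + batch_size]]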
    def get_output_for(self, inputs, **kwargs):
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        #input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)

            hid_pre += input_n

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = T.switch(mask_n, hid, hid_previous)
            return [hid_out]

        if mask is not None:
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        if not isinstance(self.hid_init, Layer):
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True)[0]

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            hid_out = hid_out[-1]
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            #hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

            # if scan is backward reverse the output
            if self.backwards:
                hid_out = hid_out[::-1,:]

        return hid_out
Example #15
 def num_trainable_parameters(self):
     return sum(
         [p.get_value().size for p in get_all_params(self.layers[-1])])
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(PermutationalLayer, self).get_params(**tags)
     # Combine with all parameters from the child layers
     params += helper.get_all_params(self.subnet, **tags)
     return params
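Counting parameters by summing get_value().size, as num_trainable_parameters does above, can also be written with Lasagne's own helper; note that the method as written counts all parameters, not only trainable ones, unless trainable=True is passed to get_all_params. A hedged equivalent on a stand-in network:

    import lasagne

    l_in = lasagne.layers.InputLayer((None, 10))
    l_out = lasagne.layers.DenseLayer(l_in, num_units=5)

    n_params = sum(p.get_value().size
                   for p in lasagne.layers.get_all_params(l_out, trainable=True))
    assert n_params == lasagne.layers.count_params(l_out, trainable=True)  # 55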
Example #17
def main(num_epochs=10, layers=1, load_file=None, batch_size=128, seq_len=96, suffix='', test=False, model_name='model'):
    print "Building network ..."
    print theano.config.floatX

    BATCH_SIZE = batch_size
    SEQ_LENGTH = seq_len

    # Recurrent layers expect input of shape (batch size, SEQ_LENGTH, num_features)
    x = T.imatrix('x')
    mask = T.matrix('mask')
    target_values = T.ivector('target_output')
    
    # We now build a layer for the embeddings.
    U = np.random.randn(vocab_size, char_dims).astype(theano.config.floatX)
    embeddings = theano.shared(U, name='embeddings', borrow=True)
    x_embedded = embeddings[x]

    l_in = lasagne.layers.InputLayer(shape=(BATCH_SIZE, SEQ_LENGTH, char_dims), input_var=x_embedded)
    l_mask = lasagne.layers.InputLayer(shape=(BATCH_SIZE, SEQ_LENGTH), input_var=mask)

    recurrent_type = lasagne.layers.LSTMLayer
    l_forward_1 = recurrent_type(l_in, N_HIDDEN, 
        grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh,
        mask_input=l_mask)
    l_backward_1 = recurrent_type(l_in, N_HIDDEN, 
        grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh,
        backwards=True, 
        mask_input=l_mask)
    if layers == 2:
        l_forward_2 = recurrent_type(l_forward_1, N_HIDDEN, 
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            mask_input=l_mask)
        l_backward_2 = recurrent_type(l_backward_1, N_HIDDEN, 
            grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh,
            backwards=True, 
            mask_input=l_mask)
        l_forward_slice = lasagne.layers.get_output(l_forward_2)[:,-1,:]
        l_backward_slice = lasagne.layers.get_output(l_backward_2)[:,-1,:]
    else:
        l_forward_slice = lasagne.layers.get_output(l_forward_1)[:,-1,:]
        l_backward_slice = lasagne.layers.get_output(l_backward_1)[:,-1,:]

    # Now combine the LSTM layers.  
    _Wf, _Wb = np.random.randn(N_HIDDEN, dim_out).astype(theano.config.floatX), np.random.randn(N_HIDDEN, dim_out).astype(theano.config.floatX)
    _bias = np.random.randn(dim_out).astype(theano.config.floatX)
    wf = theano.shared(_Wf, name='join forward weights', borrow=True)
    wb = theano.shared(_Wb, name='join backward weights', borrow=True)
    bias = theano.shared(_bias, name='join bias', borrow=True)

    joined = T.dot(l_forward_slice, wf) + T.dot(l_backward_slice, wb) + bias
    tmp = lasagne.layers.InputLayer(shape=(BATCH_SIZE, dim_out))
    l_out = lasagne.layers.DenseLayer(tmp, num_units=NUM_TAGS, W = lasagne.init.Normal(), nonlinearity=lasagne.nonlinearities.softmax)

    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = l_out.get_output_for(joined)
    # The loss function is calculated as the mean of the (categorical) cross-entropy between the prediction and target.
    cost = T.nnet.categorical_crossentropy(network_output,target_values).mean()

    # Retrieve all parameters from the network
    if layers == 1:
        all_params = helper.get_all_params(l_forward_1) + helper.get_all_params(l_backward_1) 
    else:
        all_params = helper.get_all_params(l_forward_2) + helper.get_all_params(l_backward_2) 
    all_params += helper.get_all_params(l_out) + [wf, wb, bias, embeddings] 
    print len(all_params)

    grads = T.grad(cost, all_params)
    get_grads = theano.function([x, mask, target_values], grads)

    # Compute AdaGrad updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adam(cost, all_params)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([x, mask, target_values], cost, updates=updates, allow_input_downcast=True)
    compute_cost = theano.function([x, mask, target_values], cost, allow_input_downcast=True) 
    
    pred = T.argmax(network_output, axis=1)
    get_preds = theano.function([x, mask], pred, allow_input_downcast=True)

    errors = T.sum(T.neq(pred, target_values))
    count_errors = theano.function([x, mask, target_values], errors, allow_input_downcast=True)

    def get_data(fname):
        import cPickle
        with open(fname, 'rb') as handle:
            data = cPickle.load(handle)
        xs = [d.astype('int32') for d in data[0]]
        return xs, data[1]

    print 'Loading train'
    train_xs, train_ys = get_data('train%s' % suffix)
    print 'Loading dev'
    dev_xs, dev_ys = get_data('dev%s' % suffix)
    print 'Loading test'
    test_xs, test_ys = get_data('test%s' % suffix)
    print 'Sizes:\tTrain: %d\tDev: %d\tTest: %d\n' % (len(train_xs) * BATCH_SIZE, len(dev_xs) * BATCH_SIZE, len(test_xs) * BATCH_SIZE)


    def get_accuracy(pXs, pYs):
        total = sum([len(batch) for batch in pXs])
        errors = sum([count_errors(tx, get_mask(tx), ty) for tx, ty in zip(pXs, pYs)])
        return float(total-errors)/total

    def save_preds(pXs, pYs):
        preds = [get_preds(tx, get_mask(tx)) for tx, _ in zip(pXs, pYs)]
        with open('pred.pkl', 'wb') as handle:
            cPickle.dump(preds, handle)

    if load_file is not None:
        print 'Loading params...'
        with open(load_file, 'rb') as handle:
            params = cPickle.load(handle)
        print len(params)
        for ix, _ in enumerate(zip(params, all_params)):
            all_params[ix].set_value(params[ix].astype('float32'))

    print("Training ...")
    try:
        if test:
            dev_acc = get_accuracy(dev_xs, dev_ys)
            save_preds(dev_xs, dev_ys)
            print dev_acc
            return

        best_acc = 0.0
        for it in xrange(num_epochs):
            data = zip(train_xs, train_ys)
            random.shuffle(data)
            train_xs, train_ys = zip(*data)

            avg_cost = 0
            total = 0.
            for x, y in zip(train_xs, train_ys):          
                avg_cost += train(x, get_mask(x), y)
                total += 1.

            train_acc = 0.
            #train_acc = get_accuracy(train_xs, train_ys)
            dev_acc = get_accuracy(dev_xs, dev_ys)
            test_acc = get_accuracy(test_xs, test_ys)

            if dev_acc > best_acc:
                params = [np.asarray(p.eval()) for p in all_params]
                with open('%s_%f.pkl' % (model_name, dev_acc), 'wb') as handle:
                    cPickle.dump(params, handle)
                best_acc = dev_acc

            print("Epoch {} average loss = {}".format(it, avg_cost / total))
            print "Accuracies:\t train: %f\tdev: %f\ttest: %f\n" % (train_acc, dev_acc, test_acc) 
            print 
    except KeyboardInterrupt:
        pass
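main() above calls a get_mask helper that is not included in this snippet. A hypothetical sketch of what such a helper might look like, assuming the int32 token batches are padded with zeros (the padding convention is an assumption, not taken from the original code):

    def get_mask(x_batch, pad_value=0):
        # Hypothetical helper, not from the original code: a (batch, seq_len)
        # float mask that is 1 for real tokens and 0 for assumed padding.
        return (x_batch != pad_value).astype(theano.config.floatX)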
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable.

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = inputs[1] if len(inputs) > 1 else None

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        if self.precompute_input:
            # Because the input is given for all time steps, we can precompute
            # the inputs to hidden before scanning. First we need to reshape
            # from (seq_len, batch_size, trailing dimensions...) to
            # (seq_len*batch_size, trailing dimensions...)
            # This strange use of a generator in a tuple was because
            # input.shape[2:] was raising a Theano error
            trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
            input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
            input = helper.get_output(self.input_to_hidden, input, **kwargs)

            # Reshape back to (seq_len, batch_size, trailing dimensions...)
            trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
            input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # We will always pass the hidden-to-hidden layer params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        non_seqs += helper.get_all_params(self.output_to_hidden)
        # When we are not precomputing the input, we also need to pass the
        # input-to-hidden parameters to step
        if not self.precompute_input:
            non_seqs += helper.get_all_params(self.input_to_hidden)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous,
                                        **kwargs)

            # out_layers = helper.get_all_layers(self.output_to_hidden)
            # out_layers[1].incoming_layer = self.hidden_to_hidden
            hid_pre += helper.get_output(self.output_to_hidden, hid_previous,
                                         **kwargs)

            # If the dot product is precomputed then add it, otherwise
            # calculate the input_to_hidden values and add them
            if self.precompute_input:
                hid_pre += input_n
            else:
                hid_pre += helper.get_output(self.input_to_hidden, input_n,
                                             **kwargs)

            # Clip gradients
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(hid_pre,
                                                    -self.grad_clipping,
                                                    self.grad_clipping)

            return self.nonlinearity(hid_pre)

        def step_masked(input_n, mask_n, hid_previous, *args):
            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            hid = step(input_n, hid_previous, *args)
            hid_out = hid * mask_n + hid_previous * (1 - mask_n)
            return [hid_out]

        if mask is not None:
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        # When hid_init is provided as a TensorVariable, use it as-is
        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # The code below simply repeats self.hid_init num_batch times in
            # its first dimension.  Turns out using a dot product and a
            # dimshuffle is faster than T.repeat.
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            hid_out = unroll_scan(fn=step_fun,
                                  sequences=sequences,
                                  outputs_info=[hid_init],
                                  go_backwards=self.backwards,
                                  non_sequences=non_seqs,
                                  n_steps=input_shape[1])[0]
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            hid_out = theano.scan(fn=step_fun,
                                  sequences=sequences,
                                  go_backwards=self.backwards,
                                  outputs_info=[hid_init],
                                  non_sequences=non_seqs,
                                  truncate_gradient=self.gradient_steps,
                                  strict=True)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

        return hid_out
Example #19
 def get_params(self, **tags):
     # Get all parameters from this layer, the master layer
     params = super(ConvTimeStep1DLayer, self).get_params(**tags)
     # Combine with all parameters from the child layers
     params += helper.get_all_params(self.conv1d, **tags)
     return params
Example #20
 def num_trainable_parameters(self):
     return sum(
         [p.get_value().size for p in get_all_params(self.layers[-1])])
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input_p = inputs[0]
        input_q = inputs[1]
        z_init = inputs[2]
        mu_p_init = inputs[3]

        # Retrieve the mask when it is supplied
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input_p = input_p.dimshuffle(1, 0, 2)
        input_q = input_q.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input_p.shape

        # Create single recurrent computation step function
        # input__n is the n'th vector of the input
        def log_sum_exp(a, b):
            return T.log(T.exp(a) + T.exp(b))

        def step(noise_n, input_p_n, input_q_n, z_previous, mu_p_previous,
                 logvar_p_previous, mu_q_previous, logvar_q_previous, *args):

            input_p = T.concatenate([input_p_n, z_previous], axis=1)
            mu_p = get_output(self.mu_p_mlp, input_p)

            logvar_p = get_output(self.logvar_p_mlp, input_p)
            logvar_p = T.log(T.exp(logvar_p) + self.cons)

            q_input_n = T.concatenate([input_q_n, z_previous], axis=1)

            mu_q = get_output(self.q_mu_mlp, q_input_n)
            if self.use_mu_residual_q:
                print "Using residuals for mean_q"
                mu_q += mu_p

            logvar_q = get_output(self.q_logvar_mlp, q_input_n)

            # Numerical stability
            logvar_q = T.log(T.exp(logvar_q) + self.cons)

            z_n = mu_q + T.exp(0.5 * logvar_q) * noise_n

            return z_n, mu_p, logvar_p, mu_q, logvar_q

        def step_masked(noise_n, input_p_n, input_q_n, mask_n, z_previous,
                        mu_p_previous, logvar_p_previous, mu_q_previous,
                        logvar_q_previous, *args):

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.

            z_n, mu_p, logvar_p, mu_q, logvar_q = step(
                noise_n, input_p_n, input_q_n, z_previous, mu_p_previous,
                logvar_p_previous, mu_q_previous, logvar_q_previous, *args)

            z_n = T.switch(mask_n, z_n, z_previous)
            mu_p = T.switch(mask_n, mu_p, mu_p_previous)
            logvar_p = T.switch(mask_n, logvar_p, logvar_p_previous)
            mu_q = T.switch(mask_n, mu_q, mu_q_previous)
            logvar_q = T.switch(mask_n, logvar_q, logvar_q_previous)

            return z_n, mu_p, logvar_p, mu_q, logvar_q

        eps = self._srng.normal(size=(seq_len, num_batch, self.num_units),
                                avg=0.0,
                                std=1.0)
        logvar_init = T.zeros((num_batch, self.num_units))
        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [eps, input_p, input_q, mask]
            step_fun = step_masked
        else:
            sequences = [eps, input_p, input_q]
            step_fun = step

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = helper.get_all_params(self.logvar_p_mlp)
        non_seqs += helper.get_all_params(self.mu_p_mlp)
        non_seqs += helper.get_all_params(self.q_mu_mlp)
        non_seqs += helper.get_all_params(self.q_logvar_mlp)

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            scan_out = unroll_scan(fn=step_fun,
                                   sequences=sequences,
                                   outputs_info=[
                                       z_init, mu_p_init, logvar_init,
                                       mu_p_init, logvar_init
                                   ],
                                   go_backwards=self.backwards,
                                   non_sequences=non_seqs,
                                   n_steps=input_shape[1])
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            scan_out = theano.scan(fn=step_fun,
                                   sequences=sequences,
                                   go_backwards=self.backwards,
                                   outputs_info=[
                                       z_init, mu_p_init, logvar_init,
                                       mu_p_init, logvar_init
                                   ],
                                   non_sequences=non_seqs,
                                   truncate_gradient=self.gradient_steps,
                                   strict=True)[0]

        z, mu_p, logvar_p, mu_q, logvar_q = scan_out

        # When it is requested that we only return the final sequence step,
        # we need to slice it out immediately after scan is applied
        if self.only_return_final:
            assert False
        else:
            # dimshuffle back to (n_batch, n_time_steps, n_features)
            z = z.dimshuffle(1, 0, 2)
            mu_p = mu_p.dimshuffle(1, 0, 2)
            logvar_p = logvar_p.dimshuffle(1, 0, 2)
            mu_q = mu_q.dimshuffle(1, 0, 2)
            logvar_q = logvar_q.dimshuffle(1, 0, 2)

            # if scan is backward reverse the output
            if self.backwards:
                z = z[:, ::-1]
                mu_p = mu_p[:, ::-1]
                logvar_p = logvar_p[:, ::-1]
                mu_q = mu_q[:, ::-1]
                logvar_q = logvar_q[:, ::-1]

        return z, mu_p, logvar_p, mu_q, logvar_q
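The step function above samples the latent state with the standard reparameterization trick, z = mu_q + exp(0.5 * logvar_q) * eps with eps drawn from a unit Gaussian, so gradients can flow through mu_q and logvar_q. A tiny numpy sketch of that sampling step (shapes and names are illustrative):

    import numpy as np

    rng = np.random.RandomState(0)
    mu_q = np.zeros((2, 3), dtype='float32')          # (batch, num_units)
    logvar_q = np.full((2, 3), -1.0, dtype='float32')
    eps = rng.normal(size=(2, 3)).astype('float32')

    # Standard deviation is exp(logvar / 2); adding mu shifts the sample.
    z = mu_q + np.exp(0.5 * logvar_q) * eps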
    def get_output_for(self, inputs, **kwargs):
        input = inputs[0]
        hid_init = None
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]

        # Input should be provided as (n_batch, n_time_steps, n_features)
        # but scan requires the iterable dimension to be first
        # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, *range(2, input.ndim))
        seq_len, num_batch = input.shape[0], input.shape[1]

        # precompute inputs before scanning
        trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
        input = T.reshape(input, (seq_len*num_batch,) + trailing_dims)
        input = helper.get_output(
            self.input_to_hidden, input, **kwargs)

        # Reshape back to (seq_len, batch_size, trailing dimensions...)
        trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
        input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

        # pass params to step
        non_seqs = helper.get_all_params(self.hidden_to_hidden)
        non_seqs += helper.get_all_params(self.post_concat)

        # Create single recurrent computation step function
        def step(input_n, hid_previous, *args):
            # Compute the hidden-to-hidden activation
            hid_pre = helper.get_output(
                self.hidden_to_hidden, hid_previous, **kwargs)
            hid_pre = T.concatenate([hid_pre, input_n], axis=1)
            hid_pre = helper.get_output(self.post_concat, hid_pre, **kwargs)
            if self.grad_clipping:
                hid_pre = theano.gradient.grad_clip(
                    hid_pre, -self.grad_clipping, self.grad_clipping)
            return hid_pre

        sequences = input
        step_fun = step

        if not isinstance(self.hid_init, Layer):
            # repeats self.hid_init num_batch times in first dimension
            dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                        [0, self.hid_init.ndim - 1])
            hid_init = T.dot(T.ones((num_batch, 1)),
                             self.hid_init.dimshuffle(dot_dims))

        hid_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            go_backwards=False,
            outputs_info=[hid_init],
            non_sequences=non_seqs,
            truncate_gradient=-1,
            strict=True)[0]

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        return hid_out