Example #1
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        The input is 3- or 4-dimensional: the first dimension is the time step,
        the second dimension is the sequence, and the third and fourth
        dimensions are the layer input. The fourth dimension is created when
        more than one filter is used.
        """

        if not self._network.mode.minibatch:
            raise RuntimeError("Text generation and lattice decoding are not "
                               "possible with convolution layers.")

        layer_input = self._input_layers[0].output
        num_time_steps = layer_input.shape[0]
        num_sequences = layer_input.shape[1]
        input_size = self._input_size

        # Shift the input right by k - 1 time steps, where k is the filter size,
        # so that the output at any time step does not contain information from
        # future words.
        padding_size = self._filter_size - 1
        padding = tensor.zeros([padding_size, num_sequences, input_size])
        layer_input = tensor.concatenate([padding, layer_input])

        # Compute the linear projection and the gate pre-activation in a single
        # convolution operation.
        preact = self._tensor_conv1d(layer_input, 'input')
        linear = get_submatrix(preact, 0, self.output_size)
        gate = get_submatrix(preact, 1, self.output_size)

        self.output = linear * tensor.nnet.sigmoid(gate)
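All of the examples below index the concatenated pre-activation matrix with get_submatrix(), which is not defined in these snippets. A minimal sketch of what such a helper presumably looks like, inferred only from how it is called (the optional fourth argument selects a range of chunks, as in the GRU examples):

def get_submatrix(matrices, index, size, end_index=None):
    """Hypothetical reimplementation: slices chunk ``index`` (through
    ``end_index``) of width ``size`` from the last dimension of ``matrices``.
    The real helper is defined elsewhere in the code base.
    """
    if end_index is None:
        end_index = index
    start = index * size
    end = (end_index + 1) * size
    return matrices[..., start:end]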
Example #2
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.
        """

        layer_input = tensor.concatenate(
            [x.output for x in self._input_layers], axis=2)
        preact = self._tensor_preact(layer_input, 'input')
        # normal activation (hidden state) and transform gate
        h = tensor.tanh(get_submatrix(preact, 0, self.output_size))
        t = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
        self.output = h * t + layer_input * (1 - t)
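The last line is the highway blend: where the transform gate t is close to one, the output follows the hidden activation h; where it is close to zero, the layer input is carried through unchanged. This also implies that the concatenated input must have the same size as the layer output. A small NumPy illustration with made-up values:

import numpy as np

h = np.array([0.5, -0.2])       # transformed activation
t = np.array([0.9, 0.1])        # transform gate
x = np.array([1.0, 1.0])        # layer input (size must match the output)
output = h * t + x * (1.0 - t)  # -> array([0.55, 0.88])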
Example #3
    def create_structure(self):
        """Creates the symbolic graph of this layer.

        Sets self.output to a symbolic matrix that describes the output of this
        layer.
        """

        layer_input = tensor.concatenate([x.output for x in self.input_layers],
                                         axis=2)
        preact = self._tensor_preact(layer_input, 'input')
        # normal activation (hidden state) and transform gate
        h = tensor.tanh(get_submatrix(preact, 0, self.output_size))
        t = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
        self.output = h * t + layer_input * (1 - t)
Example #4
    def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights):
        """The LSTM step function for theano.scan(). Creates the structure of
        one time step.

        The inputs do not contain the time step dimension. ``mask`` is a vector
        containing a boolean mask for each sequence. ``x_preact`` is a matrix
        containing the preactivations for each sequence. ``C_in`` and ``h_in``,
        as well as the outputs, are matrices containing the state vectors for
        each sequence.

        The required affine transformations have already been applied to the
        input prior to creating the loop. When processing a mini-batch, the
        mask passed to the step function is a vector and the transformed input
        is a matrix - each element (or row) corresponds to the same time step
        in a different sequence.

        :type mask: TensorVariable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type x_preact: TensorVariable
        :param x_preact: concatenation of the input x_(t) pre-activations
                         computed using the gate and candidate state weights and
                         biases; shape is (the number of sequences, state size *
                         4)

        :type C_in: TensorVariable
        :param C_in: C_(t-1), cell state output of the previous time step; shape
                     is (the number of sequences, state size)

        :type h_in: TensorVariable
        :param h_in: h_(t-1), hidden state output of the previous time step;
                     shape is (the number of sequences, state size)

        :type h_weights: TensorVariable
        :param h_weights: concatenation of the gate and candidate state weights
                          to be applied to h_(t-1); shape is (state size, state
                          size * 4)

        :rtype: a tuple of two TensorVariables
        :returns: C_(t) and h_(t), the cell state and hidden state outputs
        """

        # pre-activation of the gates and candidate state
        preact = tensor.dot(h_in, h_weights)
        preact += x_preact

        # input, forget, and output gates
        i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
        f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
        o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

        # cell state and hidden state outputs
        C_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))
        C_out = f * C_in + i * C_candidate
        h_out = o * tensor.tanh(C_out)

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs.
        C_out = tensor.switch(mask[:, None], C_out, C_in)
        h_out = tensor.switch(mask[:, None], h_out, h_in)

        return C_out, h_out
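For context, a step function like this is called once per time step by theano.scan(), which builds the loop over the time dimension. A minimal sketch of the surrounding loop, assuming theano is imported at module level alongside tensor, and using variable names (layer_input, mask, x_weights, x_bias, h_weights, num_sequences) that are not defined in the snippet above:

        # Hypothetical outer loop inside create_structure(); the names and the
        # zero initial states are assumptions, not the author's exact code.
        x_preact = tensor.dot(layer_input, x_weights) + x_bias
        initial_C = tensor.zeros([num_sequences, self.output_size])
        initial_h = tensor.zeros([num_sequences, self.output_size])
        [C_outputs, h_outputs], _ = theano.scan(
            self._create_time_step,
            sequences=[mask, x_preact],
            outputs_info=[initial_C, initial_h],
            non_sequences=[h_weights])
        self.output = h_outputs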
Example #5
    def _create_time_step(self, mask, x_preact, h_in, h_weights):
        """The GRU step function for theano.scan(). Creates the structure of one
        time step.

        The inputs do not contain the time step dimension. ``mask`` is a vector
        containing a boolean mask for each sequence. ``x_preact`` is a matrix
        containing the preactivations for each sequence. ``h_in`` and the
        output are matrices containing the hidden state vectors for each
        sequence.

        The required affine transformations have already been applied to the
        input prior to creating the loop. When processing a mini-batch, the
        mask passed to the step function is a vector and the transformed input
        is a matrix - each element (or row) corresponds to the same time step
        in a different sequence.

        :type mask: TensorVariable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type x_preact: TensorVariable
        :param x_preact: concatenation of the input x_(t) pre-activations
                         computed using the gate and candidate state weights and
                         biases; shape is (the number of sequences, state size *
                         3)

        :type h_in: TensorVariable
        :param h_in: h_(t-1), hidden state output of the previous time step;
                     shape is (the number of sequences, state size)

        :type h_weights: TensorVariable
        :param h_weights: concatenation of the gate and candidate state weights
                          to be applied to h_(t-1); shape is (state size, state
                          size * 3)

        :rtype: TensorVariable
        :returns: h_(t), the hidden state output
        """

        # pre-activation of the gates
        h_preact = tensor.dot(h_in, h_weights)
        preact_gates = get_submatrix(h_preact, 0, self.output_size, 1)
        preact_gates += get_submatrix(x_preact, 0, self.output_size, 1)

        # reset and update gates
        r = tensor.nnet.sigmoid(
            get_submatrix(preact_gates, 0, self.output_size))
        u = tensor.nnet.sigmoid(
            get_submatrix(preact_gates, 1, self.output_size))

        # pre-activation of the candidate state
        preact_candidate = get_submatrix(h_preact, 2, self.output_size)
        preact_candidate *= r
        preact_candidate += get_submatrix(x_preact, 2, self.output_size)

        # hidden state output
        h_candidate = tensor.tanh(preact_candidate)
        h_out = (1.0 - u) * h_in + u * h_candidate

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs.
        h_out = tensor.switch(mask[:, None], h_out, h_in)

        return h_out
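In equation form, the step above computes the usual GRU update, with the detail that the reset gate is applied to the already-transformed previous state U_h h_(t-1) rather than to h_(t-1) itself. Writing W_* and U_* for the input and recurrent weight blocks concatenated in x_preact and h_weights (biases folded into the W_* x_t terms), this notation is introduced here only for illustration:

    r_t = \sigma(W_r x_t + U_r h_{t-1})
    u_t = \sigma(W_u x_t + U_u h_{t-1})
    \tilde{h}_t = \tanh(W_h x_t + r_t \odot (U_h h_{t-1}))
    h_t = (1 - u_t) \odot h_{t-1} + u_t \odot \tilde{h}_t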
Example #6
    def _create_time_step(self, mask, x_preact, h_in, h_weights):
        """The GRU step function for theano.scan(). Creates the structure of one
        time step.

        The inputs do not contain the time step dimension. ``mask`` is a vector
        containing a boolean mask for each sequence. ``x_preact`` is a matrix
        containing the preactivations for each sequence. ``h_in`` and the
        output are matrices containing the hidden state vectors for each
        sequence.

        The required affine transformations have already been applied to the
        input prior to creating the loop. When processing a mini-batch, the
        mask passed to the step function is a vector and the transformed input
        is a matrix - each element (or row) corresponds to the same time step
        in a different sequence.

        :type mask: TensorVariable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type x_preact: TensorVariable
        :param x_preact: concatenation of the input x_(t) pre-activations
                         computed using the gate and candidate state weights and
                         biases; shape is (the number of sequences, state size *
                         3)

        :type h_in: TensorVariable
        :param h_in: h_(t-1), hidden state output of the previous time step;
                     shape is (the number of sequences, state size)

        :type h_weights: TensorVariable
        :param h_weights: concatenation of the gate and candidate state weights
                          to be applied to h_(t-1); shape is (state size, state
                          size * 3)

        :rtype: TensorVariable
        :returns: h_(t), the hidden state output
        """

        # pre-activation of the gates
        h_preact = tensor.dot(h_in, h_weights)
        preact_gates = get_submatrix(h_preact, 0, self.output_size, 1)
        preact_gates += get_submatrix(x_preact, 0, self.output_size, 1)

        # reset and update gates
        r = tensor.nnet.sigmoid(get_submatrix(preact_gates, 0, self.output_size))
        u = tensor.nnet.sigmoid(get_submatrix(preact_gates, 1, self.output_size))

        # pre-activation of the candidate state
        preact_candidate = get_submatrix(h_preact, 2, self.output_size)
        preact_candidate *= r
        preact_candidate += get_submatrix(x_preact, 2, self.output_size)

        # hidden state output
        h_candidate = tensor.tanh(preact_candidate)
        h_out = (1.0 - u) * h_in + u * h_candidate

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs.
        h_out = tensor.switch(mask[:, None], h_out, h_in)

        return h_out
Example #7
    def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights, mem_weights, mem_bias, v_weights, v_bias, q_weights):
        """The LSTM step function for theano.scan(). Creates the structure of
        one time step.

        The inputs do not contain the time step dimension. ``mask`` is a vector
        containing a boolean mask for each sequence. ``x_preact`` is a matrix
        containing the preactivations for each sequence. ``C_in`` and ``h_in``,
        as well as the outputs, are matrices containing the state vectors for
        each sequence.

        The required affine transformations have already been applied to the
        input prior to creating the loop. When processing a mini-batch, the
        mask passed to the step function is a vector and the transformed input
        is a matrix - each element (or row) corresponds to the same time step
        in a different sequence.

        :type mask: Variable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type x_preact: Variable
        :param x_preact: concatenation of the input x_(t) pre-activations
                         computed using the gate and candidate state weights and
                         biases; shape is (the number of sequences, state size *
                         4)

        :type C_in: Variable
        :param C_in: C_(t-n)...C_(t-1), the memory of cell outputs from the
                     previous time steps; shape is (the number of sequences,
                     state size * memory size)

        :type h_in: Variable
        :param h_in: h_(t-1), hidden state output of the previous time step;
                     shape is (the number of sequences, state size)

        :type h_weights: Variable
        :param h_weights: concatenation of the gate and candidate state weights
                          to be applied to h_(t-1); shape is (state size, state
                          size * 4)
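
        :type mem_weights: Variable
        :param mem_weights: weights applied to the previous memory slots when
                            computing the attention scores

        :type mem_bias: Variable
        :param mem_bias: bias added to the transformed memory slots

        :type v_weights: Variable
        :param v_weights: weights that map the combined memory and query
                          representation to a raw attention score

        :type v_bias: Variable
        :param v_bias: bias added to the raw attention scores

        :type q_weights: Variable
        :param q_weights: weights applied to h_(t-1) to form the attention
                          query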

        :rtype: a tuple of two Variables
        :returns: the updated memory and h_(t), the hidden state output
        """

        # pre-activation of the gates and candidate state
        preact = tensor.dot(h_in, h_weights)
        preact += x_preact
        num_sequences = x_preact.shape[0]
        # input, forget, and output gates
        i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
        f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
        o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

        # hidden state outputs candidate
        h_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))

        # calculate the attention weights
        # transforming the memory
        # First reshape C_in into (sequences, memory slots, state size).
        mem = C_in.reshape([num_sequences, self.memory_size, self.output_size])
        hidden = tensor.dot(mem[:,:-1,:], mem_weights) + mem_bias
        hidden_q = (tensor.dot(h_in, q_weights)).reshape([num_sequences, 1, self.output_size])
        # use V to calculate the attention scores for all previous input vectors
        raw_attention = tensor.dot(tensor.tanh(hidden+hidden_q), v_weights) + v_bias

        raw_attention = tensor.swapaxes(raw_attention, 0, 1)
        raw_attention = raw_attention.reshape([num_sequences, self.memory_size-1])

        # with softmax we get the attention scores for each time t
        # shape is (num_sequences, t)
        attentions = tensor.nnet.softmax(raw_attention)
        # apply attention to the memory
        long_memory = tensor.batched_dot(
            attentions.reshape([attentions.shape[0], 1, attentions.shape[1]]),
            mem[:, :-1, :])  # TODO: test
        long_memory = long_memory.reshape(
            [long_memory.shape[0], long_memory.shape[2]])
        h_out = o * self._activation(f * long_memory + i * h_candidate)
        # Drop the oldest slot from the memory and append the new state vector.
        logging.debug("C ndim: %s, h_out ndim: %s", C_in.ndim, h_out.ndim)
        # TODO: check dimensions!
        mem = tensor.concatenate([C_in[:, self.output_size:], h_out], axis=1)

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs.
        #C_out = tensor.switch(mask[:, None], C_out, C_in)
        h_out = tensor.switch(mask[:, None], h_out, h_in)

        return mem, h_out
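The reshaping in this variant is easier to follow with concrete shapes. Assuming, purely for illustration, 16 sequences, memory_size = 5 and output_size = 100, and assuming v_weights maps the state size to a single score, the intermediate values have roughly these shapes:

        # C_in            (16, 5 * 100)  flattened memory of the last 5 cell outputs
        # mem             (16, 5, 100)   after the first reshape
        # mem[:, :-1, :]  (16, 4, 100)   slots the attention can attend to
        # hidden          (16, 4, 100)   memory transformed by mem_weights + mem_bias
        # hidden_q        (16, 1, 100)   query from h_in, broadcast over the 4 slots
        # raw_attention   (16, 4, 1)     one score per slot, reshaped to (16, 4)
        # attentions      (16, 4)        softmax over the previous memory slots
        # long_memory     (16, 1, 100)   attention-weighted memory, reshaped to (16, 100)
        # mem (updated)   (16, 5 * 100)  oldest slot dropped, h_out appended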
Example #8
    def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights):
        """The LSTM step function for theano.scan(). Creates the structure of
        one time step.

        The inputs do not contain the time step dimension. ``mask`` is a vector
        containing a boolean mask for each sequence. ``x_preact`` is a matrix
        containing the preactivations for each sequence. ``C_in`` and ``h_in``,
        as well as the outputs, are matrices containing the state vectors for
        each sequence.

        The required affine transformations have already been applied to the
        input prior to creating the loop. When processing a mini-batch, the
        mask passed to the step function is a vector and the transformed input
        is a matrix - each element (or row) corresponds to the same time step
        in a different sequence.

        :type mask: TensorVariable
        :param mask: a symbolic vector that masks out sequences that are past
                     the last word

        :type x_preact: TensorVariable
        :param x_preact: concatenation of the input x_(t) pre-activations
                         computed using the gate and candidate state weights and
                         biases; shape is (the number of sequences, state size *
                         4)

        :type C_in: TensorVariable
        :param C_in: C_(t-1), cell state output of the previous time step; shape
                     is (the number of sequences, state size)

        :type h_in: TensorVariable
        :param h_in: h_(t-1), hidden state output of the previous time step;
                     shape is (the number of sequences, state size)

        :type h_weights: TensorVariable
        :param h_weights: concatenation of the gate and candidate state weights
                          to be applied to h_(t-1); shape is (state size, state
                          size * 4)

        :rtype: a tuple of two TensorVariables
        :returns: C_(t) and h_(t), the cell state and hidden state outputs
        """

        # pre-activation of the gates and candidate state
        preact = tensor.dot(h_in, h_weights)
        preact += x_preact

        # input, forget, and output gates
        i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
        f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
        o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

        # cell state and hidden state outputs
        C_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))
        C_out = f * C_in + i * C_candidate
        h_out = o * tensor.tanh(C_out)

        # Apply the mask. None creates a new axis with size 1, causing the mask
        # to be broadcast to all the outputs.
        C_out = tensor.switch(mask[:, None], C_out, C_in)
        h_out = tensor.switch(mask[:, None], h_out, h_in)

        return C_out, h_out