def create_structure(self): """Creates the symbolic graph of this layer. The input is 3- or 4-dimensional: the first dimension is the time step, the second dimension are the sequences, and the third and fourth dimension are the layer input. The fourth dimension is created when more than one filter is used. """ if not self._network.mode.minibatch: raise RuntimeError("Text generation and lattice decoding are not " "possible with convolution layers.") layer_input = self._input_layers[0].output num_time_steps = layer_input.shape[0] num_sequences = layer_input.shape[1] input_size = self._input_size # Shift the input right by k - 1 time steps, where k is the filter size, # so that the output at any time step does not contain information from # future words. padding_size = self._filter_size - 1 padding = tensor.zeros([padding_size, num_sequences, input_size]) layer_input = tensor.concatenate([padding, layer_input]) # Compute the linear projection and the gate pre-activation in a single # convolution operation. preact = self._tensor_conv1d(layer_input, 'input') linear = get_submatrix(preact, 0, self.output_size) gate = get_submatrix(preact, 1, self.output_size) self.output = linear * tensor.nnet.sigmoid(gate)
def create_structure(self): """Creates the symbolic graph of this layer. Sets self.output to a symbolic matrix that describes the output of this layer. """ layer_input = tensor.concatenate( [x.output for x in self._input_layers], axis=2) preact = self._tensor_preact(layer_input, 'input') # normal activation (hidden state) and transform gate h = tensor.tanh(get_submatrix(preact, 0, self.output_size)) t = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size)) self.output = h * t + layer_input * (1 - t)
def create_structure(self): """Creates the symbolic graph of this layer. Sets self.output to a symbolic matrix that describes the output of this layer. """ layer_input = tensor.concatenate([x.output for x in self.input_layers], axis=2) preact = self._tensor_preact(layer_input, 'input') # normal activation (hidden state) and transform gate h = tensor.tanh(get_submatrix(preact, 0, self.output_size)) t = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size)) self.output = h * t + layer_input * (1 - t)
def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights):
    """The LSTM step function for theano.scan(). Creates the structure of
    one time step.

    The inputs do not contain the time step dimension. ``mask`` is a
    vector containing a boolean mask for each sequence. ``x_preact`` is a
    matrix containing the preactivations for each sequence. ``C_in`` and
    ``h_in``, as well as the outputs, are matrices containing the state
    vectors for each sequence.

    The required affine transformations have already been applied to the
    input prior to creating the loop. The transformed inputs and the mask
    that will be passed to the step function are vectors when processing
    a mini-batch - each value corresponds to the same time step in a
    different sequence.

    :type mask: TensorVariable
    :param mask: a symbolic vector that masks out sequences that are past
                 the last word

    :type x_preact: TensorVariable
    :param x_preact: concatenation of the input x_(t) pre-activations
                     computed using the gate and candidate state weights
                     and biases; shape is (the number of sequences, state
                     size * 4)

    :type C_in: TensorVariable
    :param C_in: C_(t-1), cell state output of the previous time step;
                 shape is (the number of sequences, state size)

    :type h_in: TensorVariable
    :param h_in: h_(t-1), hidden state output of the previous time step;
                 shape is (the number of sequences, state size)

    :type h_weights: TensorVariable
    :param h_weights: concatenation of the gate and candidate state
                      weights to be applied to h_(t-1); shape is (state
                      size, state size * 4)

    :rtype: a tuple of two TensorVariables
    :returns: C_(t) and h_(t), the cell state and hidden state outputs
    """

    # pre-activation of the gates and candidate state
    preact = tensor.dot(h_in, h_weights)
    preact += x_preact

    # input, forget, and output gates
    i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
    f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
    o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

    # cell state and hidden state outputs
    C_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))
    C_out = f * C_in + i * C_candidate
    h_out = o * tensor.tanh(C_out)

    # Apply the mask. None creates a new axis with size 1, causing the
    # mask to be broadcast to all the outputs.
    C_out = tensor.switch(mask[:, None], C_out, C_in)
    h_out = tensor.switch(mask[:, None], h_out, h_in)

    return C_out, h_out
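# A hedged sketch of how a step function like _create_time_step() is
# typically driven with theano.scan(), assuming the module-level
# ``theano`` and ``tensor`` imports used elsewhere in this file. The
# zero-valued initial states and the argument layout are assumptions for
# illustration; the layer's real create_structure() may differ in detail.
def _scan_sketch(self, mask, x_preact, h_weights, num_sequences):
    state_size = self.output_size
    C_initial = tensor.zeros([num_sequences, state_size])
    h_initial = tensor.zeros([num_sequences, state_size])
    [C_all, h_all], _ = theano.scan(
        self._create_time_step,
        sequences=[mask, x_preact],           # one time step per iteration
        outputs_info=[C_initial, h_initial],  # recurrent state feedback
        non_sequences=[h_weights])
    return h_all  # (num time steps, num sequences, state size)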
def _create_time_step(self, mask, x_preact, h_in, h_weights):
    """The GRU step function for theano.scan(). Creates the structure of
    one time step.

    The inputs do not contain the time step dimension. ``mask`` is a
    vector containing a boolean mask for each sequence. ``x_preact`` is a
    matrix containing the preactivations for each sequence. ``h_in``, as
    well as the output, is a matrix containing the state vectors for each
    sequence.

    The required affine transformations have already been applied to the
    input prior to creating the loop. The transformed inputs and the mask
    that will be passed to the step function are vectors when processing
    a mini-batch - each value corresponds to the same time step in a
    different sequence.

    :type mask: TensorVariable
    :param mask: a symbolic vector that masks out sequences that are past
                 the last word

    :type x_preact: TensorVariable
    :param x_preact: concatenation of the input x_(t) pre-activations
                     computed using the gate and candidate state weights
                     and biases; shape is (the number of sequences, state
                     size * 3)

    :type h_in: TensorVariable
    :param h_in: h_(t-1), hidden state output of the previous time step;
                 shape is (the number of sequences, state size)

    :type h_weights: TensorVariable
    :param h_weights: concatenation of the gate and candidate state
                      weights to be applied to h_(t-1); shape is (state
                      size, state size * 3)

    :rtype: TensorVariable
    :returns: h_(t), the hidden state output
    """

    # pre-activation of the gates
    h_preact = tensor.dot(h_in, h_weights)
    preact_gates = get_submatrix(h_preact, 0, self.output_size, 1)
    preact_gates += get_submatrix(x_preact, 0, self.output_size, 1)

    # reset and update gates
    r = tensor.nnet.sigmoid(
        get_submatrix(preact_gates, 0, self.output_size))
    u = tensor.nnet.sigmoid(
        get_submatrix(preact_gates, 1, self.output_size))

    # pre-activation of the candidate state
    preact_candidate = get_submatrix(h_preact, 2, self.output_size)
    preact_candidate *= r
    preact_candidate += get_submatrix(x_preact, 2, self.output_size)

    # hidden state output
    h_candidate = tensor.tanh(preact_candidate)
    h_out = (1.0 - u) * h_in + u * h_candidate

    # Apply the mask. None creates a new axis with size 1, causing the
    # mask to be broadcast to all the outputs.
    h_out = tensor.switch(mask[:, None], h_out, h_in)

    return h_out
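# A hedged NumPy re-statement of the GRU update above, useful for
# checking the gate arithmetic outside the symbolic graph. All argument
# names are illustrative; the split into gate and candidate blocks
# mirrors the get_submatrix() calls in the step function.
def _gru_sketch(x_preact_gates, x_preact_candidate, h_in,
                h_weights_gates, h_weights_candidate):
    import numpy

    def sigmoid(a):
        return 1.0 / (1.0 + numpy.exp(-a))

    # reset and update gates from the summed pre-activations
    preact_gates = h_in.dot(h_weights_gates) + x_preact_gates
    r, u = numpy.split(sigmoid(preact_gates), 2, axis=1)
    # the reset gate scales the recurrent part of the candidate
    # pre-activation before the nonlinearity
    preact_candidate = r * h_in.dot(h_weights_candidate) + x_preact_candidate
    h_candidate = numpy.tanh(preact_candidate)
    # interpolate between the previous state and the candidate state
    return (1.0 - u) * h_in + u * h_candidate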
def _create_time_step(self, mask, x_preact, C_in, h_in, h_weights,
                      mem_weights, mem_bias, v_weights, v_bias, q_weights):
    """The LSTM step function for theano.scan(). Creates the structure of
    one time step.

    The inputs do not contain the time step dimension. ``mask`` is a
    vector containing a boolean mask for each sequence. ``x_preact`` is a
    matrix containing the preactivations for each sequence. ``C_in`` and
    ``h_in``, as well as the outputs, are matrices containing the state
    vectors for each sequence.

    The required affine transformations have already been applied to the
    input prior to creating the loop. The transformed inputs and the mask
    that will be passed to the step function are vectors when processing
    a mini-batch - each value corresponds to the same time step in a
    different sequence.

    :type mask: Variable
    :param mask: a symbolic vector that masks out sequences that are past
                 the last word

    :type x_preact: Variable
    :param x_preact: concatenation of the input x_(t) pre-activations
                     computed using the gate and candidate state weights
                     and biases; shape is (the number of sequences, state
                     size * 4)

    :type C_in: Variable
    :param C_in: C_(t-1...t-n), memory (cell output) of the previous time
                 steps; shape is (the number of sequences, state size *
                 memory size)

    :type h_in: Variable
    :param h_in: h_(t-1), hidden state output of the previous time step;
                 shape is (the number of sequences, state size)

    :type h_weights: Variable
    :param h_weights: concatenation of the gate and candidate state
                      weights to be applied to h_(t-1); shape is (state
                      size, state size * 4)

    :type mem_weights: Variable
    :param mem_weights: weights applied to the memory slots when
                        computing the attention scores

    :type mem_bias: Variable
    :param mem_bias: bias added to the transformed memory slots

    :type v_weights: Variable
    :param v_weights: projection that maps the attention hidden layer to
                      one raw score per memory slot

    :type v_bias: Variable
    :param v_bias: bias added to the raw attention scores

    :type q_weights: Variable
    :param q_weights: weights applied to h_(t-1) to form the attention
                      query

    :rtype: a tuple of two Variables
    :returns: the updated memory and h_(t), the hidden state output
    """

    # pre-activation of the gates and candidate state
    preact = tensor.dot(h_in, h_weights)
    preact += x_preact
    num_sequences = x_preact.shape[0]

    # input, forget, and output gates
    i = tensor.nnet.sigmoid(get_submatrix(preact, 0, self.output_size))
    f = tensor.nnet.sigmoid(get_submatrix(preact, 1, self.output_size))
    o = tensor.nnet.sigmoid(get_submatrix(preact, 2, self.output_size))

    # hidden state output candidate
    h_candidate = tensor.tanh(get_submatrix(preact, 3, self.output_size))

    # Calculate the attention weights. First reshape C_in so that the
    # memory slots lie on their own axis.
    mem = C_in.reshape([num_sequences, self.memory_size, self.output_size])
    hidden = tensor.dot(mem[:, :-1, :], mem_weights) + mem_bias
    hidden_q = tensor.dot(h_in, q_weights).reshape(
        [num_sequences, 1, self.output_size])

    # Use v to calculate the raw attention scores for all previous input
    # vectors.
    raw_attention = (tensor.dot(tensor.tanh(hidden + hidden_q), v_weights)
                     + v_bias)
    raw_attention = raw_attention.reshape(
        [num_sequences, self.memory_size - 1])

    # Softmax gives the attention scores for each previous time step;
    # shape is (num_sequences, memory_size - 1).
    attentions = tensor.nnet.softmax(raw_attention)

    # apply attention to the memory
    long_memory = tensor.batched_dot(
        attentions.reshape([attentions.shape[0], 1, attentions.shape[1]]),
        mem[:, :-1, :])
    long_memory = long_memory.reshape(
        [long_memory.shape[0], long_memory.shape[2]])
    h_out = o * self._activation(f * long_memory + i * h_candidate)

    # Shift the memory and concatenate the new state vector.
    mem = tensor.concatenate([C_in[:, self.output_size:], h_out], axis=1)

    # Apply the mask. None creates a new axis with size 1, causing the
    # mask to be broadcast to all the outputs.
    mem = tensor.switch(mask[:, None], mem, C_in)
    h_out = tensor.switch(mask[:, None], h_out, h_in)

    return mem, h_out
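# A hedged NumPy sketch of the additive attention computed in the step
# function above: each memory slot is scored through a tanh layer and a
# projection vector, and softmax turns the scores into weights over the
# previous time steps. Shapes and names are assumptions inferred from the
# symbolic code, not a verified part of this layer.
def _attention_sketch(mem, h_in, mem_weights, mem_bias,
                      q_weights, v_weights, v_bias):
    import numpy

    # mem: (num slots, state size), h_in: (state size,)
    hidden = mem.dot(mem_weights) + mem_bias  # (num slots, state size)
    hidden_q = h_in.dot(q_weights)            # (state size,), broadcasts
    scores = numpy.tanh(hidden + hidden_q).dot(v_weights) + v_bias
    scores = scores - scores.max()            # numerically stable softmax
    weights = numpy.exp(scores)
    weights /= weights.sum()                  # attention weights over slots
    return weights.dot(mem)                   # attention-weighted memory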