Example 1
        def step(input_n, cell_previous, hid_previous, r_previous, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            r = r_previous
            if self.attention:
                if self.wordbyword:
                    M_partial = T.dot(hid, self.W_h) + T.dot(r_previous, self.W_r)
                    M_partial = M_partial.dimshuffle(0, 'x', 1)
                    M = T.dot(encoder_hs, self.W_y) + M_partial
                    M = nonlinearities.tanh(M)

                    alpha = T.dot(M, self.w)
                    alpha = T.flatten(alpha, 2)
                    alpha = T.nnet.softmax(alpha)
                    alpha = alpha.dimshuffle(0, 1, 'x')
                    
                    r = T.sum(encoder_hs*alpha, axis=1) + nonlinearities.tanh(T.dot(r_previous, self.W_t))

            return [cell, hid, r]
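Note: a minimal NumPy sketch of the word-by-word attention computed above, with hypothetical shapes and random placeholder weights (the W_r/W_t recurrence on r_previous is omitted for brevity):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

n_batch, n_steps, n_feat = 2, 5, 4
encoder_hs = np.random.randn(n_batch, n_steps, n_feat)   # encoder states
hid = np.random.randn(n_batch, n_feat)                   # current decoder hidden state
W_y, W_h = np.random.randn(n_feat, n_feat), np.random.randn(n_feat, n_feat)
w = np.random.randn(n_feat)

M = np.tanh(encoder_hs @ W_y + (hid @ W_h)[:, None, :])  # (n_batch, n_steps, n_feat)
alpha = softmax(M @ w, axis=1)                           # attention weights over time
r = (encoder_hs * alpha[:, :, None]).sum(axis=1)         # (n_batch, n_feat) weighted sum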
Example 2
 def Ep_Gate(c, m, q, Wb, W1, W2, b1, b2):
     z = T.concatenate([
         c, m, q, c * q, c * m,
         T.abs_(c - q),
         T.abs_(c - m), c * Wb * q, c * Wb * m
     ],
                       axis=2)
     # earlier, buggy version with the dot operands in the wrong order:
     # g = (T.dot(W2, nonlin.tanh(T.dot(z, W1) + b1)) + b2)
     g = (T.dot(nonlin.tanh(T.dot(W1, z) + b1), W2) + b2)
     return g
Example 3
    def get_output_for(self, inputs, **kwargs):

        input = inputs[0]
        mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        (d1, d2, d3) = input.shape

        # out = T.tensordot(input, self.W, axes=[[2], [0]])
        # b_shuffled = self.b.dimshuffle('x', 'x', 0)
        # out += b_shuffled
        # out = tanh(out)
        # out *= mask.dimshuffle(0, 1, 'x')
        # out = T.batched_dot(out, out.dimshuffle(0, 2, 1))
        q = T.tensordot(input, self.W1, axes=[[2], [0]])
        b1_shuffled = self.b1.dimshuffle('x', 'x', 0)
        q += b1_shuffled
        q = tanh(q)

        #        k = T.tensordot(input, self.W2, axes=[[2], [0]])
        # b2_shuffled = self.b2.dimshuffle('x', 'x', 0)
        # k += b2_shuffled
        # k = tanh(k)

        q *= mask.dimshuffle(0, 1, 'x')
        #        k *= mask.dimshuffle(0, 1, 'x')
        out = T.batched_dot(q, q.dimshuffle(0, 2, 1))
        #out /= np.sqrt(self.nu)
        #out *= 0.1

        out *= (1 - T.eye(d2, d2))

        matrix = softmax(out.reshape((d1 * d2, d2))).reshape((d1, d2, d2))
        matrix *= mask.dimshuffle(0, 1, 'x')
        matrix *= mask.dimshuffle(0, 'x', 1)

        return matrix
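Note: a standalone NumPy sketch of the masked pairwise attention matrix built above, with hypothetical shapes and random placeholder weights (softmax over the last axis is equivalent to the reshape/softmax/reshape used in the layer):

import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

d1, d2, d3 = 2, 4, 6                         # batch size, sequence length, features
x = np.random.randn(d1, d2, d3)
mask = np.ones((d1, d2))                     # all positions valid in this toy case
W1, b1 = np.random.randn(d3, d3), np.zeros(d3)

q = np.tanh(np.tensordot(x, W1, axes=([2], [0])) + b1) * mask[:, :, None]
scores = q @ q.transpose(0, 2, 1)            # (d1, d2, d2) pairwise scores
scores *= 1 - np.eye(d2)                     # zero the diagonal (no self-matching)
matrix = softmax(scores, axis=2) * mask[:, :, None] * mask[:, None, :]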
Example 4
        def step(input_n, cell_previous, hid_previous, previous_r, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)
            r = previous_r
            if self.attention and self.word_by_word:
                mh = T.dot(hid, self.W_h_attend) + T.dot(
                    previous_r, self.W_r_attend)
                # mh is (n_batch, 1, n_features)
                mh = mh.dimshuffle(0, 'x', 1)
                M = T.dot(encoder_hs, self.W_y_attend) + mh
                # (n_batch, n_time_steps, n_features)
                M = nonlinearities.tanh(M)
                # alpha is (n_batch, n_time_steps, 1)
                alpha = T.dot(M, self.w_attend)
                # now is (n_batch, n_time_steps)
                alpha = T.flatten(alpha, 2)
                # note: a score of 0 is not 0 after the softmax (an earlier mistake);
                # to mask padded steps, fill alpha_i with -np.inf when i > encoder_seq_len:
                # alpha = T.switch(encoder_mask, alpha, -np.inf)
                alpha = T.nnet.softmax(alpha)
                # alternatively, apply encoder_mask (n_batch, n_time_steps) to alpha
                # so that alpha_i is 0 when i > encoder_seq_len; the mask is not
                # strictly needed here, so it is kept commented out:
                # alpha = alpha * encoder_mask
                alpha = alpha.dimshuffle(0, 1, 'x')
                weighted_encoder = T.sum(encoder_hs * alpha, axis=1)
                r = weighted_encoder + nonlinearities.tanh(
                    T.dot(previous_r, self.W_t_attend))

            return [cell, hid, r]
Example 5
        def step(input_i, cell_previous, hid_previous, *args):
            # word-by-word attention
            mh = T.dot(hid_previous, self.W_a_pointer)
            mh += self.b_a_pointer
            # mh is (n_batch, 1, n_features)
            mh = mh.dimshuffle(0, 'x', 1)
            M = T.dot(passage, self.V_pointer) + mh
            # (n_batch, passage_seq_len, n_features)
            M = nonlinearities.tanh(M)
            # alpha is (n_batch, passage_seq_len, 1)
            alpha = T.dot(M, self.v_pointer)
            # now is (n_batch, passage_seq_len)
            alpha = T.flatten(alpha, 2)
            alpha += self.c_pointer
            # note: a score of 0 is not 0 after the softmax (an earlier mistake), so
            # padded steps (i >= passage_seq_len) are filled with -np.inf using the
            # passage mask, which is (n_batch, passage_seq_len)
            alpha = T.switch(mask, alpha, -np.inf)
            alpha = T.nnet.softmax(alpha)
            # after the softmax, alpha_i is already 0 for i >= passage_seq_len, so
            # multiplying by the mask again is not strictly needed:
            # alpha = alpha * mask
            alpha = alpha.dimshuffle(0, 1, 'x')
            weighted_passage = T.sum(passage * alpha, axis=1)
            # (n_batch, n_features)
            input_n = weighted_passage
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)

            return [cell, hid, alpha]
Example 6
def tanh_temperature(x, temperature=1):
    from lasagne.nonlinearities import tanh
    return tanh(x * temperature)
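Note: tanh_temperature simply rescales the input before squashing, so larger temperatures saturate faster while small ones keep the response closer to linear. A quick standalone check in plain NumPy (the function above operates on Theano expressions via Lasagne's tanh):

import numpy as np

x = np.linspace(-2.0, 2.0, 5)
for temperature in (0.5, 1.0, 4.0):
    print(temperature, np.round(np.tanh(x * temperature), 3))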
Example 7
        def step(input_n, cell_previous, hid_previous, avg_previous, *args):
            x = input_n
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)

            avg_input = T.dot(x, self.W_avg1) + T.dot(hid,
                                                      self.W_avg2) + self.b_avg
            if self.model_type == 1:
                avg = x * nonlinearities.sigmoid(avg_input)
            elif self.model_type == 6:
                avg = nonlinearities.tanh(avg_input)
            elif self.model_type == 7:
                avg_input = T.dot(x, self.W_avg1) * T.dot(
                    hid, self.W_avg2) + self.b_avg
                avg = x * nonlinearities.sigmoid(avg_input)
            elif self.model_type == 2:
                avg = hid * nonlinearities.sigmoid(avg_input)
            elif self.model_type == 3:
                avg_input2 = T.dot(x, self.W_avg12) + T.dot(
                    hid, self.W_avg22) + self.b_avg2
                g1 = nonlinearities.sigmoid(avg_input)
                g2 = nonlinearities.sigmoid(avg_input2)
                avg = avg_previous * g1 + x * g2
            elif self.model_type == 4:
                avg_input = T.dot(
                    x, self.W_avg1) + T.dot(hid, self.W_avg2) + T.dot(
                        avg_previous, self.W_avg3) + self.b_avg
                avg_input2 = T.dot(
                    x, self.W_avg12) + T.dot(hid, self.W_avg22) + T.dot(
                        avg_previous, self.W_avg32) + self.b_avg2
                g1 = nonlinearities.sigmoid(avg_input)
                g2 = nonlinearities.sigmoid(avg_input2)
                avg = avg_previous * g1 + x * g2
            elif self.model_type == 5:
                avg_input2 = T.dot(x, self.W_avg12) + T.dot(
                    hid, self.W_avg22) + self.b_avg2
                g1 = nonlinearities.sigmoid(avg_input)
                g2 = nonlinearities.sigmoid(avg_input2)
                avg = x * g1
                havg = hid * g2
                avg = avg + havg
            return [cell, hid, avg]
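Note: as an illustration of the gated averaging above, here is a NumPy sketch of the model_type == 1 branch with hypothetical shapes and random placeholder weights; the raw input is gated by a sigmoid of a joint projection of the input and the new hidden state.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

n_batch, n_in, n_hid = 2, 6, 4
x = np.random.randn(n_batch, n_in)           # input_n before any projection
hid = np.random.randn(n_batch, n_hid)        # new hidden state from the LSTM step
W_avg1 = np.random.randn(n_in, n_in)
W_avg2 = np.random.randn(n_hid, n_in)
b_avg = np.zeros(n_in)

avg_input = x @ W_avg1 + hid @ W_avg2 + b_avg
avg = x * sigmoid(avg_input)                 # model_type == 1: gated copy of the input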
Example 8
def normalize(x):
    return tanh(x / 4.0)
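Note: dividing by 4 keeps the pre-activation in the near-linear region of tanh for inputs of moderate magnitude; only values well beyond |x| = 4 saturate. A quick check:

import numpy as np

x = np.array([-8.0, -4.0, -1.0, 0.0, 1.0, 4.0, 8.0])
print(np.round(np.tanh(x / 4.0), 3))   # roughly x/4 near zero, approaching +/-1 for large |x|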
Example 9
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When the cell state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the cell state
            to prefill with. When both the cell state and the hidden state are
            being pre-filled `inputs[-2]` is the hidden state, while
            `inputs[-1]` is the cell state.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        encoder_hs = None
        encoder_mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.encoder_mask_incoming_index > 0:
            # (n_batch, n_time_steps)
            encoder_mask = inputs[self.encoder_mask_incoming_index]
            encoder_mask = encoder_mask.astype('float32')
        cell_init = inputs[self.cell_init_incoming_index]
        if self.attention:
            # (n_batch, n_time_steps, n_features)
            encoder_hs = cell_init[0]
            # encoder_mask is dimshuffled below to (n_batch, n_time_steps, 1)
            encoder_hs = encoder_hs * encoder_mask.dimshuffle(0, 1, 'x')
        cell_init = cell_init[1]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_ingate, self.W_in_to_forgetgate,
             self.W_in_to_cell, self.W_in_to_outgate], axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_ingate, self.W_hid_to_forgetgate,
             self.W_hid_to_cell, self.W_hid_to_outgate], axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate,
             self.b_cell, self.b_outgate], axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the dot of the input with the weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # At each call to scan, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extracts the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n*self.num_units:(n+1)*self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, previous_r, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            r = previous_r
            if self.attention and self.word_by_word:
                mh = T.dot(hid, self.W_h_attend) + T.dot(previous_r, self.W_r_attend)
                # mh is (n_batch, 1, n_features)
                mh = mh.dimshuffle(0, 'x', 1)
                M = T.dot(encoder_hs, self.W_y_attend) + mh
                # (n_batch, n_time_steps, n_features)
                M = nonlinearities.tanh(M)
                # alpha is (n_batch, n_time_steps, 1)
                alpha = T.dot(M, self.w_attend)
                # now is (n_batch, n_time_steps)
                alpha = T.flatten(alpha, 2)
                # note: a score of 0 is not 0 after the softmax (an earlier mistake);
                # to mask padded steps, fill alpha_i with -np.inf when i > encoder_seq_len:
                # alpha = T.switch(encoder_mask, alpha, -np.inf)
                alpha = T.nnet.softmax(alpha)
                # alternatively, apply encoder_mask (n_batch, n_time_steps) to alpha
                # so that alpha_i is 0 when i > encoder_seq_len; the mask is not
                # strictly needed here, so it is kept commented out:
                # alpha = alpha * encoder_mask
                alpha = alpha.dimshuffle(0, 1, 'x')
                weighted_encoder = T.sum(encoder_hs * alpha, axis=1)
                r = weighted_encoder + nonlinearities.tanh(T.dot(previous_r, self.W_t_attend))

            return [cell, hid, r]

        def step_masked(input_n, mask_n, cell_previous, hid_previous, previous_r, *args):
            cell, hid, r = step(input_n, cell_previous, hid_previous, previous_r, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            r = T.switch(mask_n, r, previous_r)
            return [cell, hid, r]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [self.W_cell_to_ingate,
                         self.W_cell_to_forgetgate,
                         self.W_cell_to_outgate]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        r_init = T.dot(ones, self.r_init)
        if self.attention and self.word_by_word:
            non_seqs += [self.W_y_attend,
                         self.W_h_attend,
                         self.W_r_attend,
                         self.w_attend,
                         self.W_t_attend,
                         encoder_hs,
                         # encoder_mask
                         ]
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        cell_out, hid_out, r_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init, r_init],
            go_backwards=self.backwards,
            truncate_gradient=self.gradient_steps,
            non_sequences=non_seqs,
            strict=True)[0]
        # (n_batch, n_features)
        hid_N = hid_out[-1]
        out = hid_N
        if self.attention:
            if self.word_by_word:
                r_N = r_out[-1]
            else:
                mh = T.dot(hid_N, self.W_h_attend)
                mh = mh.dimshuffle(0, 'x', 1)
                M = T.dot(encoder_hs, self.W_y_attend) + mh
                # (n_batch, n_time_steps, n_features)
                M = nonlinearities.tanh(M)
                alpha = T.dot(M, self.w_attend)
                # (n_batch, n_time_steps)
                alpha = T.flatten(alpha, 2)
                # to mask padded steps, fill alpha_i with -np.inf when i > encoder_seq_len:
                # alpha = T.switch(encoder_mask, alpha, -np.inf)
                alpha = T.nnet.softmax(alpha)
                # alternatively, apply encoder_mask (n_batch, n_time_steps) to alpha
                # so that alpha_i is 0 when i > encoder_seq_len; the mask is not
                # strictly needed here, so it is kept commented out:
                # alpha = alpha * encoder_mask
                alpha = alpha.dimshuffle(0, 1, 'x')
                # (n_batch, n_features)
                r_N = T.sum(encoder_hs * alpha, axis=1)
            out = nonlinearities.tanh(T.dot(r_N, self.W_p_attend) + T.dot(hid_N, self.W_x_attend))
        return out
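Note: the final combination in the attention branch above is out = tanh(r_N W_p + h_N W_x). A minimal NumPy sketch with hypothetical shapes and random placeholder weights:

import numpy as np

n_batch, n_feat = 2, 4
r_N = np.random.randn(n_batch, n_feat)       # attention-weighted encoder summary
hid_N = np.random.randn(n_batch, n_feat)     # final hidden state of this layer
W_p = np.random.randn(n_feat, n_feat)
W_x = np.random.randn(n_feat, n_feat)

out = np.tanh(r_N @ W_p + hid_N @ W_x)       # (n_batch, n_feat) combined representation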
Example 10
 def get_output_for(self, inputs, **kwargs):
     num_batch, _, _ = inputs.shape
     
     #add padded zeros in front of sequence
     padded_input = T.concatenate([T.zeros((num_batch, self.filter_width - 1, self.original_features)), inputs], axis=1)
     
     #reshape input to include 1 filter dimension
     rs = padded_input.dimshuffle([0, 'x', 1, 2])
     
     #apply convolutions for all "gates" (output = (n_batch, n_filters, n_time_steps, 1))
     Z = nonlinearities.tanh(T.nnet.conv2d(rs, self.Z_W,
                                           input_shape=(None, 1, self.internal_seq_len, self.original_features),
                                           filter_shape=(self.num_units, 1, self.filter_width, self.original_features)))
     F = nonlinearities.sigmoid(T.nnet.conv2d(rs, self.F_W,
                                           input_shape=(None, 1, self.internal_seq_len, self.original_features),
                                           filter_shape=(self.num_units, 1, self.filter_width, self.original_features)))
     
     if self.pooling == 'fo' or self.pooling == 'ifo':
         O = nonlinearities.sigmoid(T.nnet.conv2d(rs, self.O_W,
                                           input_shape=(None, 1, self.internal_seq_len, self.original_features),
                                           filter_shape=(self.num_units, 1, self.filter_width, self.original_features)))
     if self.pooling == 'ifo':
         I = nonlinearities.sigmoid(T.nnet.conv2d(rs, self.I_W,
                                           input_shape=(None, 1, self.internal_seq_len, self.original_features),
                                           filter_shape=(self.num_units, 1, self.filter_width, self.original_features)))
     
     # Because scan iterates over the first dimension we dimshuffle to
     # (n_time_steps, n_batch, n_features)
     Z = Z.flatten(ndim=3)
     Z = Z.dimshuffle([2, 0, 1])
     F = F.flatten(ndim=3)
     F = F.dimshuffle([2, 0, 1])
     if self.pooling == 'fo' or self.pooling == 'ifo':
         O = O.flatten(ndim=3)
         O = O.dimshuffle([2, 0, 1])
     if self.pooling == 'ifo':
         I = I.flatten(ndim=3)
         I = I.dimshuffle([2, 0, 1])
     
     # Dot against a 1s vector to repeat to shape (num_batch, num_units)
     ones = T.ones((num_batch, 1))
     hid_init = T.dot(ones, self.hid_init)
     
     # Create single recurrent computation step function
     # input_n is the n'th vector of the input: (n_batch, n_features)
     def step_f(forget_n, z_n, hid_previous, *args):
         return forget_n * hid_previous + (1.0 - forget_n) * z_n
     def step_fo(forget_n, z_n, o_n, hid_previous, cell_previous, *args):
         cell_current = forget_n * cell_previous + (1.0 - forget_n) * z_n
         hid_current = o_n * cell_current
         return [hid_current, cell_current]
     def step_ifo(forget_n, z_n, o_n, i_n, hid_previous, cell_previous, *args):
         cell_current = forget_n * cell_previous + i_n * z_n
         hid_current = o_n * cell_current
         return [hid_current, cell_current]
     
     if self.pooling == 'f':
         step = step_f
         sequences = [F, Z]
         outputs_info = [hid_init]
     if self.pooling == 'fo':
         step = step_fo
         sequences = [F, Z, O]
         # Note that, below, we use hid_init as the initial /cell/ state!
         # That way we only need to declare one set of weights
         outputs_info = [T.zeros((num_batch, self.num_units)), hid_init]
     if self.pooling == 'ifo':
         step = step_ifo
         sequences = [F, Z, O, I]
         outputs_info = [T.zeros((num_batch, self.num_units)), hid_init]
     
     outputs = theano.scan(
             fn=step,
             sequences=sequences,
             outputs_info=outputs_info,
             strict=True)[0]
     
     hid_out = outputs
     if self.pooling == 'fo' or self.pooling == 'ifo':
         hid_out = outputs[0]
     
     # Shuffle back to (n_batch, n_time_steps, n_features)
     hid_out = hid_out.dimshuffle([1, 0, 2])
     return hid_out
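Note: the scan above resembles quasi-recurrent (QRNN-style) pooling; with 'f' pooling each step computes h_t = f_t * h_{t-1} + (1 - f_t) * z_t. A minimal NumPy sketch of that recurrence, with hypothetical shapes and the gates assumed already computed:

import numpy as np

n_steps, n_batch, n_units = 5, 2, 3
Z = np.tanh(np.random.randn(n_steps, n_batch, n_units))                  # candidate values
F = 1.0 / (1.0 + np.exp(-np.random.randn(n_steps, n_batch, n_units)))    # forget gates

h = np.zeros((n_batch, n_units))
for f_n, z_n in zip(F, Z):
    h = f_n * h + (1.0 - f_n) * z_n   # 'f' pooling step, as in step_f above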
Example 11
 def Ep_Gate(c, m, q, Wb, W1, W2, b1, b2):
     z = T.concatenate([c,m,q,c*q,c*m,T.abs_(c-q),T.abs_(c-m),c*Wb*q,c*Wb*m], axis=2)
     # earlier, buggy version with the dot operands in the wrong order:
     # g = (T.dot(W2, nonlin.tanh(T.dot(z, W1) + b1)) + b2)
     g = (T.dot(nonlin.tanh(T.dot(W1, z) + b1), W2) + b2)
     return g
Example 12
def tanh_add(x, y):
    return tanh(T.add(x, y))
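Note: tanh_add is just a squashed sum of two pre-activations, useful when merging two projections before a nonlinearity. A numerically equivalent check in plain NumPy (the function above works on Theano expressions):

import numpy as np

x = np.array([0.5, -1.0])
y = np.array([0.25, 2.0])
print(np.tanh(x + y))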
Example 13
        def step(input_n, hid_previous_total, *args):
            
            hid_previous_facts = hid_previous_total[0:self.num_hidden_units_h]
            hid_previous_brain = hid_previous_total[self.num_hidden_units_h:]
            
            self.cur_sequence_idx += 1  # Track our position in the sequence
            
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input_facts = T.dot(hid_previous_facts, W_hid_stacked)


            if self.grad_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n, -self.grad_clipping, self.grad_clipping)
                hid_input_facts = theano.gradient.grad_clip(
                    hid_input_facts, -self.grad_clipping, self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                input_n = T.dot(input_n, W_in_stacked) + b_stacked  # DS Note:  accomplishes the multiplication AND adds bias

            # Reset and update gates
            resetgate = slice_w_h(hid_input_facts, 0) + slice_w_h(input_n, 0)
            updategate = slice_w_h(hid_input_facts, 1) + slice_w_h(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)
            
            # DS Edit: DynamMemNet modifiers
            m_dmn = hid_previous_brain  # Note that this should have size 
            c_dmn = input_n  # This is a TensorType<float64, row>
            q_dmn = self.question_layer  # This is a lasagne recurrent GRU layer

            z_dmn = T.concatenate([c_dmn, m_dmn, q_dmn, c_dmn * q_dmn, abs(c_dmn - q_dmn), abs(c_dmn - m_dmn), T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
                         T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))], axis=1)
            G_dmn = nonlinearities.sigmoid(T.dot(self.W_dmn_2, nonlinearities.tanh(T.dot(self.W_dmn_1, z_dmn)) + self.b_dmn_1) + self.b_dmn_2)
            # Note, you also need W_b for the c and q elements.
            #something_else = T.dot(hid_previous_facts, W_hid_stacked)
            hidden_update_in = slice_w_h(input_n, 2)
            hidden_update_hid = slice_w_h(hid_input_facts, 2)
            hidden_update_facts = hidden_update_in + resetgate * hidden_update_hid
            if self.grad_clipping:
                hidden_update_facts = theano.gradient.grad_clip(
                    hidden_update_facts, -self.grad_clipping, self.grad_clipping)
            hidden_update_facts = self.nonlinearity_hid(hidden_update_facts)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous_facts + updategate * hidden_update_facts  # This is the GRU_fact output
            #output_dmn = G_dmn * hid + (1 - G_dmn) * hid_previous_facts  # This is the output of the Dynamic Memory Net modified GRU, Eq. (5)
            output_dmn = hid
                        
#             if self.cur_sequence_idx == self.max_seqlen:
#                 hid_input_brain = T.dot(hid_previous_brain, W_brain_hid_stacked)            
#             
#                 if self.grad_clipping:
#                     input_to_brain = theano.gradient.grad_clip(
#                         output_dmn, -self.grad_clipping, self.grad_clipping)
#                     hid_input_brain = theano.gradient.grad_clip(
#                         hid_input_brain, -self.grad_clipping, self.grad_clipping)
#                 else:
#                     input_to_brain = output_dmn
#                     
#                 if not self.precompute_input:
#                     # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
#                     input_to_brain = T.dot(input_to_brain, W_brain_in_stacked) + b_brain_stacked  # DS Note:  accomplishes the multiplication AND adds bias
#             
#                 # Reset and update gates
#                 resetgate_brain = slice_w_m(hid_input_brain, 0) + slice_w_m(input_to_brain, 0)
#                 updategate_brain = slice_w_m(hid_input_brain, 1) + slice_w_m(input_to_brain, 1)
#                 resetgate_brain = self.nonlinearity_brain_resetgate(resetgate_brain)
#                 updategate_brain = self.nonlinearity_brain_updategate(updategate_brain)
#             
#                 hidden_update_in_brain = slice_w_m(input_to_brain, 2)
#                 hidden_update_brain = slice_w_m(hid_input_brain, 2)
#                 hidden_update_brain = hidden_update_in_brain + resetgate_brain * hidden_update_brain
#                 
#                 if self.grad_clipping:
#                     hidden_update_brain = theano.gradient.grad_clip(hidden_update_brain, -self.grad_clipping, self.grad_clipping)
#                 hidden_update_brain = self.nonlinearity_brain_hid_update(hidden_update_brain)
#                 
#                 hid_brain = (1 - updategate_brain) * hid_previous_brain + updategate_brain * hidden_update_brain
#             
#             else:                
#             
            hid_brain = hid_previous_brain
                              
            return T.concatenate([output_dmn, hid_brain], axis=1)
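Note: the gating sketched (and partly commented out) above follows the Dynamic Memory Network style update output = G * hid + (1 - G) * hid_previous, where G comes from a small two-layer network over the concatenated features z. A standalone NumPy sketch with hypothetical sizes and random placeholder weights:

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

n_z, n_hid = 9, 4                                  # z concatenates several feature blocks
z = np.random.randn(n_z)
W1, b1 = np.random.randn(n_hid, n_z), np.zeros(n_hid)
W2, b2 = np.random.randn(n_hid), 0.0

G = sigmoid(W2 @ np.tanh(W1 @ z + b1) + b2)        # scalar attention gate
hid = np.random.randn(n_hid)                       # candidate GRU output
hid_previous = np.random.randn(n_hid)              # previous hidden state
output = G * hid + (1.0 - G) * hid_previous        # gated episodic update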
Example 14
    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.  When
            this layer has a mask input (i.e. was instantiated with
            `mask_input != None`, indicating that the lengths of sequences in
            each batch vary), `inputs` should have length 2, where `inputs[1]`
            is the `mask`.  The `mask` should be supplied as a Theano variable
            denoting whether each time step in each sequence in the batch is
            part of the sequence or not.  `mask` should be a matrix of shape
            ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
            (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
            of sequence i)``. When the hidden state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the hidden state
            to prefill with. When the cell state of this layer is to be
            pre-filled (i.e. was set to a :class:`Layer` instance) `inputs`
            should have length at least 2, and `inputs[-1]` is the cell state
            to prefill with. When both the cell state and the hidden state are
            being pre-filled `inputs[-2]` is the hidden state, while
            `inputs[-1]` is the cell state.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        # Retrieve the mask when it is supplied
        mask = None
        hid_init = None
        cell_init = None
        encoder_hs = None
        encoder_mask = None
        if self.mask_incoming_index > 0:
            mask = inputs[self.mask_incoming_index]
        if self.hid_init_incoming_index > 0:
            hid_init = inputs[self.hid_init_incoming_index]
        if self.encoder_mask_incoming_index > 0:
            # (n_batch, n_time_steps)
            encoder_mask = inputs[self.encoder_mask_incoming_index]
            encoder_mask = encoder_mask.astype('float32')
        cell_init = inputs[self.cell_init_incoming_index]
        if self.attention:
            # (n_batch, n_time_steps, n_features)
            encoder_hs = cell_init[0]
            # encoder_mask is dimshuffled below to (n_batch, n_time_steps, 1)
            encoder_hs = encoder_hs * encoder_mask.dimshuffle(0, 1, 'x')
        cell_init = cell_init[1]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 4*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate([
            self.W_in_to_ingate, self.W_in_to_forgetgate, self.W_in_to_cell,
            self.W_in_to_outgate
        ],
                                     axis=1)

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate([
            self.W_hid_to_ingate, self.W_hid_to_forgetgate, self.W_hid_to_cell,
            self.W_hid_to_outgate
        ],
                                      axis=1)

        # Stack biases into a (4*num_units) vector
        b_stacked = T.concatenate(
            [self.b_ingate, self.b_forgetgate, self.b_cell, self.b_outgate],
            axis=0)

        if self.precompute_input:
            # Because the input is given for all time steps, we can
            # precompute the dot of the input with the weight matrices before scanning.
            # W_in_stacked is (n_features, 4*num_units). input is then
            # (n_time_steps, n_batch, 4*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # At each call to scan, input_n will be (n_batch, 4*num_units).
        # We define a slicing function that extracts the input to each LSTM gate
        def slice_w(x, n):
            return x[:, n * self.num_units:(n + 1) * self.num_units]

        # Create single recurrent computation step function
        # input_n is the n'th vector of the input
        def step(input_n, cell_previous, hid_previous, previous_r, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(gates, -self.grad_clipping,
                                                  self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous * self.W_cell_to_ingate
                forgetgate += cell_previous * self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate * cell_previous + ingate * cell_input

            if self.peepholes:
                outgate += cell * self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate * self.nonlinearity(cell)
            r = previous_r
            if self.attention and self.word_by_word:
                mh = T.dot(hid, self.W_h_attend) + T.dot(
                    previous_r, self.W_r_attend)
                # mh is (n_batch, 1, n_features)
                mh = mh.dimshuffle(0, 'x', 1)
                M = T.dot(encoder_hs, self.W_y_attend) + mh
                # (n_batch, n_time_steps, n_features)
                M = nonlinearities.tanh(M)
                # alpha is (n_batch, n_time_steps, 1)
                alpha = T.dot(M, self.w_attend)
                # now is (n_batch, n_time_steps)
                alpha = T.flatten(alpha, 2)
                # note: a score of 0 is not 0 after the softmax (an earlier mistake);
                # to mask padded steps, fill alpha_i with -np.inf when i > encoder_seq_len:
                # alpha = T.switch(encoder_mask, alpha, -np.inf)
                alpha = T.nnet.softmax(alpha)
                # alternatively, apply encoder_mask (n_batch, n_time_steps) to alpha
                # so that alpha_i is 0 when i > encoder_seq_len; the mask is not
                # strictly needed here, so it is kept commented out:
                # alpha = alpha * encoder_mask
                alpha = alpha.dimshuffle(0, 1, 'x')
                weighted_encoder = T.sum(encoder_hs * alpha, axis=1)
                r = weighted_encoder + nonlinearities.tanh(
                    T.dot(previous_r, self.W_t_attend))

            return [cell, hid, r]

        def step_masked(input_n, mask_n, cell_previous, hid_previous,
                        previous_r, *args):
            cell, hid, r = step(input_n, cell_previous, hid_previous,
                                previous_r, *args)

            # Skip over any input with mask 0 by copying the previous
            # hidden state; proceed normally for any input with mask 1.
            cell = T.switch(mask_n, cell, cell_previous)
            hid = T.switch(mask_n, hid, hid_previous)
            r = T.switch(mask_n, r, previous_r)
            return [cell, hid, r]

        if mask is not None:
            # mask is given as (batch_size, seq_len). Because scan iterates
            # over first dimension, we dimshuffle to (seq_len, batch_size) and
            # add a broadcastable dimension
            mask = mask.dimshuffle(1, 0, 'x')
            sequences = [input, mask]
            step_fun = step_masked
        else:
            sequences = input
            step_fun = step

        ones = T.ones((num_batch, 1))
        if not isinstance(self.hid_init, Layer):
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(ones, self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [W_hid_stacked]
        # The "peephole" weight matrices are only used when self.peepholes=True
        if self.peepholes:
            non_seqs += [
                self.W_cell_to_ingate, self.W_cell_to_forgetgate,
                self.W_cell_to_outgate
            ]

        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [W_in_stacked, b_stacked]

        r_init = T.dot(ones, self.r_init)
        if self.attention and self.word_by_word:
            non_seqs += [
                self.W_y_attend,
                self.W_h_attend,
                self.W_r_attend,
                self.w_attend,
                self.W_t_attend,
                encoder_hs,
                # encoder_mask
            ]
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        cell_out, hid_out, r_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[cell_init, hid_init, r_init],
            go_backwards=self.backwards,
            truncate_gradient=self.gradient_steps,
            non_sequences=non_seqs,
            strict=True)[0]
        # (n_batch, n_features)
        hid_N = hid_out[-1]
        out = hid_N
        if self.attention:
            if self.word_by_word:
                r_N = r_out[-1]
            else:
                mh = T.dot(hid_N, self.W_h_attend)
                mh = mh.dimshuffle(0, 'x', 1)
                M = T.dot(encoder_hs, self.W_y_attend) + mh
                # (n_batch, n_time_steps, n_features)
                M = nonlinearities.tanh(M)
                alpha = T.dot(M, self.w_attend)
                # (n_batch, n_time_steps)
                alpha = T.flatten(alpha, 2)
                # to mask padded steps, fill alpha_i with -np.inf when i > encoder_seq_len:
                # alpha = T.switch(encoder_mask, alpha, -np.inf)
                alpha = T.nnet.softmax(alpha)
                # alternatively, apply encoder_mask (n_batch, n_time_steps) to alpha
                # so that alpha_i is 0 when i > encoder_seq_len; the mask is not
                # strictly needed here, so it is kept commented out:
                # alpha = alpha * encoder_mask
                alpha = alpha.dimshuffle(0, 1, 'x')
                # (n_batch, n_features)
                r_N = T.sum(encoder_hs * alpha, axis=1)
            out = nonlinearities.tanh(
                T.dot(r_N, self.W_p_attend) + T.dot(hid_N, self.W_x_attend))
        return out
Example 15
        def step(input_n, cell_previous, hid_previous, previous_r, *args):
            if not self.precompute_input:
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Calculate gates pre-activations and slice
            gates = input_n + T.dot(hid_previous, W_hid_stacked)

            # Clip gradients
            if self.grad_clipping:
                gates = theano.gradient.grad_clip(
                    gates, -self.grad_clipping, self.grad_clipping)

            # Extract the pre-activation gate values
            ingate = slice_w(gates, 0)
            forgetgate = slice_w(gates, 1)
            cell_input = slice_w(gates, 2)
            outgate = slice_w(gates, 3)

            if self.peepholes:
                # Compute peephole connections
                ingate += cell_previous*self.W_cell_to_ingate
                forgetgate += cell_previous*self.W_cell_to_forgetgate

            # Apply nonlinearities
            ingate = self.nonlinearity_ingate(ingate)
            forgetgate = self.nonlinearity_forgetgate(forgetgate)
            cell_input = self.nonlinearity_cell(cell_input)

            # Compute new cell value
            cell = forgetgate*cell_previous + ingate*cell_input

            if self.peepholes:
                outgate += cell*self.W_cell_to_outgate
            outgate = self.nonlinearity_outgate(outgate)

            # Compute new hidden unit activation
            hid = outgate*self.nonlinearity(cell)
            r = previous_r
            if self.attention and self.word_by_word:
                mh = T.dot(hid, self.W_h_attend) + T.dot(previous_r, self.W_r_attend)
                # mh is (n_batch, 1, n_features)
                mh = mh.dimshuffle(0, 'x', 1)
                M = T.dot(encoder_hs, self.W_y_attend) + mh
                # (n_batch, n_time_steps, n_features)
                M = nonlinearities.tanh(M)
                # alpha is (n_batch, n_time_steps, 1)
                alpha = T.dot(M, self.w_attend)
                # now is (n_batch, n_time_steps)
                alpha = T.flatten(alpha, 2)
                # note: a score of 0 is not 0 after the softmax (an earlier mistake);
                # to mask padded steps, fill alpha_i with -np.inf when i > encoder_seq_len:
                # alpha = T.switch(encoder_mask, alpha, -np.inf)
                alpha = T.nnet.softmax(alpha)
                # alternatively, apply encoder_mask (n_batch, n_time_steps) to alpha
                # so that alpha_i is 0 when i > encoder_seq_len; the mask is not
                # strictly needed here, so it is kept commented out:
                # alpha = alpha * encoder_mask
                alpha = alpha.dimshuffle(0, 1, 'x')
                weighted_encoder = T.sum(encoder_hs * alpha, axis=1)
                r = weighted_encoder + nonlinearities.tanh(T.dot(previous_r, self.W_t_attend))

            return [cell, hid, r]
Example 16
        def step(input_n, hid_previous_total, *args):
            print("317 into step")
            print(" type input n: ", type(input_n))
            
            hid_previous_facts = hid_previous_total[0:self.num_hidden_units_h]
            hid_previous_brain = hid_previous_total[self.num_hidden_units_h:]
            
            self.cur_sequence_idx += 1  # Track our position in the sequence
                                
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input_facts = T.dot(hid_previous_facts, W_hid_stacked)

            if self.grad_clipping:
                input_n = theano.gradient.grad_clip(
                    input_n, -self.grad_clipping, self.grad_clipping)
                hid_input_facts = theano.gradient.grad_clip(
                    hid_input_facts, -self.grad_clipping, self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                input_n = T.dot(input_n, W_in_stacked) + b_stacked  # DS Note:  accomplishes the multiplication AND adds bias

            # Reset and update gates
            resetgate = slice_w_h(hid_input_facts, 0) + slice_w_h(input_n, 0)
            updategate = slice_w_h(hid_input_facts, 1) + slice_w_h(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # DS Edit: DynamMemNet modifiers
            m_dmn = hid_previous_brain  # Note that this should have size 
            c_dmn = input_n  # This is a TensorType<float64, row>
            q_dmn = self.question_layer  # This is a lasagne recurrent GRU layer
                        
            print(" entering 344")
            # DS Note: I believe this has size 9 x size(m_dmn) == size(c_dmn)
#             z_dmn = [c_dmn, m_dmn, q_dmn, c_dmn * q_dmn, abs(c_dmn - q_dmn), abs(c_dmn - m_dmn), T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)), 
#                         T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))]
#             
            z_dmn = T.concatenate([c_dmn, m_dmn, q_dmn, c_dmn * q_dmn, abs(c_dmn - q_dmn), abs(c_dmn - m_dmn), T.dot(c_dmn.T, T.dot(self.W_dmn_b, q_dmn)),
                        T.dot(c_dmn.T, T.dot(self.W_dmn_b, m_dmn))], axis=1)
            G_dmn = nonlinearities.sigmoid(T.dot(self.W_dmn_2, nonlinearities.tanh(T.dot(self.W_dmn_1, z_dmn)) + self.b_dmn_1) + self.b_dmn_2)
            # Note, you also need W_b for the c and q elements.
            
            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w_h(input_n, 2)
            hidden_update_hid = slice_w_h(hid_input_facts, 2)
            hidden_update_facts = hidden_update_in + resetgate * hidden_update_hid
            if self.grad_clipping:
                hidden_update_facts = theano.gradient.grad_clip(
                    hidden_update_facts, -self.grad_clipping, self.grad_clipping)
            hidden_update_facts = self.nonlinearity_hid(hidden_update_facts)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous_facts + updategate * hidden_update_facts  # This is the GRU_fact output
            output_dmn = G_dmn * hid + (1 - G_dmn) * hid_previous_facts  # This is the output of the Dynamic Memory Net modified GRU, Eq. (5)
            
            # UPDATE THE BRAIN
            # We update the brain parameters if the current idx is equal to the sent len
            if self.cur_sequence_idx == self.max_seqlen:
                hid_input_brain = T.dot(hid_previous_brain, W_brain_hid_stacked)            
            
                if self.grad_clipping:
                    input_to_brain = theano.gradient.grad_clip(
                        output_dmn, -self.grad_clipping, self.grad_clipping)
                    hid_input_brain = theano.gradient.grad_clip(
                        hid_input_brain, -self.grad_clipping, self.grad_clipping)
                else:
                    input_to_brain = output_dmn

                if not self.precompute_input:
                    # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                    input_to_brain = T.dot(input_to_brain, W_brain_in_stacked) + b_brain_stacked  # DS Note:  accomplishes the multiplication AND adds bias
    
                # Reset and update gates
                resetgate_brain = slice_w_m(hid_input_brain, 0) + slice_w_m(input_to_brain, 0)
                updategate_brain = slice_w_m(hid_input_brain, 1) + slice_w_m(input_to_brain, 1)
                resetgate_brain = self.nonlinearity_brain_resetgate(resetgate_brain)
                updategate_brain = self.nonlinearity_brain_updategate(updategate_brain)

                hidden_update_in_brain = slice_w_m(input_to_brain, 2)
                hidden_update_brain = slice_w_m(hid_input_brain, 2)
                
                hidden_update_brain = hidden_update_in_brain + resetgate_brain * hidden_update_brain
                
                if self.grad_clipping:
                    hidden_update_brain = theano.gradient.grad_clip(hidden_update_brain, -self.grad_clipping, self.grad_clipping)
                hidden_update_brain = self.nonlinearity_brain_hid_update(hidden_update_brain)
                
                hid_brain = (1 - updategate_brain) * hid_previous_brain + updategate_brain * hidden_update_brain                
            else:                
                hid_brain = hid_previous_brain
            
            # TODO: DS:  ERROR IS HERE
            output_dmn = T.concatenate([output_dmn, hid_brain], axis=1) 
           
            print(" 412 out of step") 
            return output_dmn
Example 17
 def action_nonlinearity(x):
     return self.scale_action * tanh(x)
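Note: scaling tanh bounds the output to (-scale_action, scale_action), a common way to squash a policy's raw output into a bounded action range. A quick NumPy check with a hypothetical scale:

import numpy as np

scale_action = 2.0
x = np.array([-10.0, -1.0, 0.0, 1.0, 10.0])
print(scale_action * np.tanh(x))   # stays within (-2.0, 2.0)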