def model(x, embedding_size, n_hidden):

    # hidden and input weights
    U = shared_glorot_uniform((embedding_size, n_hidden), name="U")
    W = shared_glorot_uniform((n_hidden, n_hidden), name="W")
    bh = shared_zeros((n_hidden, ), name="bh")

    # output weights
    V = shared_glorot_uniform((n_hidden, embedding_size), name="V")
    by = shared_zeros((embedding_size, ), name="by")

    params = [U, V, W, by, bh]

    def step(x_t, h_tm1):
        h_t = T.tanh(U[x_t] + T.dot(h_tm1, W) + bh)
        y_t = T.dot(h_t, V) + by
        return h_t, y_t

    h0 = shared_zeros((n_hidden, ), name='h0')
    [h, y_pred], _ = theano.scan(step,
                                 sequences=x,
                                 outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
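
A minimal usage sketch for this function, assuming the shared_glorot_uniform/shared_zeros helpers used above are in scope and that x is passed as an int32 vector of symbol indices (the sizes chosen here are arbitrary):

import numpy as np
import theano
import theano.tensor as T

x = T.ivector('x')                       # sequence of symbol indices
probs, params = model(x, embedding_size=50, n_hidden=128)
predict = theano.function([x], probs)

# one softmax distribution per time step: shape (seq_len, embedding_size)
print(predict(np.array([3, 1, 4, 1, 5], dtype='int32')).shape)   # (5, 50)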
Example #2
    def test_gen_cloning_with_shape_change(self):
        data = floatX(np.random.uniform(size=(1000, 10)))
        minibatches = DataSampler(data, batchsize=50)
        gen = generator(minibatches)
        gen_r = tt_rng().normal(size=gen.shape).T
        X = gen.dot(gen_r)
        res, _ = theano.scan(lambda x: x.sum(), X, n_steps=X.shape[0])
        assert res.eval().shape == (50,)
        shared = theano.shared(data)
        res2 = theano.clone(res, {gen: shared**2})
        assert res2.eval().shape == (1000,)
Example #4
    def apply(self, f):
        # f: kernel function for KSD f(histogram) -> (k(x,.), \nabla_x k(x,.))
        X = self.approx.histogram
        t = self.approx.normalizing_constant
        dlogpdx = theano.scan(
            fn=lambda zg: theano.grad(self.logp_norm(zg), zg),
            sequences=[X])[0]  # bottleneck
        Kxy, dxkxy = f(X)
        # scaling factor
        # not needed for Kxy as we already scaled dlogpdx
        dxkxy /= t
        n = X.shape[0].astype('float32') / t
        svgd_grad = (tt.dot(Kxy, dlogpdx) + dxkxy) / n
        return -1 * svgd_grad  # gradient
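
For context, up to the scaling by the normalizing constant t, this apply computes the negated empirical Stein variational gradient of Liu and Wang (2016):

    \phi(x_i) = \frac{1}{n} \sum_{j=1}^{n} \big[ k(x_j, x_i)\, \nabla_{x_j} \log p(x_j) + \nabla_{x_j} k(x_j, x_i) \big]

Here Kxy supplies the kernel matrix k(x_j, x_i), dlogpdx the per-particle scores, and dxkxy the kernel-gradient term; the method returns the negated update -\phi.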
def model(x, embedding_size, n_hidden):

    # Update gate weights
    W_xz = shared_glorot_uniform((embedding_size, n_hidden))
    W_hz = shared_glorot_uniform((n_hidden, n_hidden))
    b_z = shared_zeros((n_hidden, ))

    # Reset gate weights
    W_xr = shared_glorot_uniform((embedding_size, n_hidden))
    W_hr = shared_glorot_uniform((n_hidden, n_hidden))
    b_r = shared_zeros((n_hidden, ))

    # Hidden layer
    W_xh = shared_glorot_uniform((embedding_size, n_hidden))
    W_hh = shared_glorot_uniform((n_hidden, n_hidden))
    b_h = shared_zeros((n_hidden, ))

    # Output weights
    W_y = shared_glorot_uniform((n_hidden, embedding_size), name="W_y")
    b_y = shared_zeros((embedding_size, ), name="b_y")

    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_y, b_y]

    def step(x_t, h_tm1):
        z_t = T.nnet.sigmoid(W_xz[x_t] + T.dot(W_hz, h_tm1) + b_z)
        r_t = T.nnet.sigmoid(W_xr[x_t] + T.dot(W_hr, h_tm1) + b_r)
        can_h_t = T.tanh(W_xh[x_t] + r_t * T.dot(W_hh, h_tm1) + b_h)
        h_t = (1 - z_t) * h_tm1 + z_t * can_h_t
        y_t = T.dot(h_t, W_y) + b_y
        return h_t, y_t

    h0 = shared_zeros((n_hidden, ), name='h0')
    [h, y_pred], _ = theano.scan(step,
                                 sequences=x,
                                 outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
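
For reference, the step function above implements the GRU gating equations, with the row lookup W_x*[x_t] standing in for the input projection of a one-hot x_t:

    z_t = \sigma(W_{xz} x_t + W_{hz} h_{t-1} + b_z)
    r_t = \sigma(W_{xr} x_t + W_{hr} h_{t-1} + b_r)
    \tilde{h}_t = \tanh(W_{xh} x_t + r_t \odot (W_{hh} h_{t-1}) + b_h)
    h_t = (1 - z_t) \odot h_{t-1} + z_t \odot \tilde{h}_t
    y_t = h_t W_y + b_y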
Example #7
    def dlogp(self):
        return theano.scan(
            fn=lambda zg: theano.grad(self.approx.logp_norm(zg), zg),
            sequences=[self.input_matrix])[0]
import theano
import theano.tensor as T
from theano import function, pp


# Computing gradients
x = T.dscalar('x')
y = x ** 2
gy = T.grad(y, x)

print(pp(gy))          # pretty-print the symbolic derivative of x**2
f = function([x], gy)
print(f(4))            # 8.0


x = T.dmatrix('x')
s = T.sum(1 / (1 + T.exp(-x)))
gs = T.grad(s, x)
dlogistic = function([x], gs)
print(dlogistic([[0, 1], [-1, -2]]))   # elementwise logistic gradient s(x) * (1 - s(x))

# Computing the Jacobian
x = T.dvector('x')
y = x ** 2
J, updates = theano.scan(lambda i, y, x: T.grad(y[i], x),
                         sequences=T.arange(y.shape[0]),
                         non_sequences=[y, x])
f = function([x], J, updates=updates)
print(f([4, 4]))   # diag(2 * x) = [[8. 0.], [0. 8.]]
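
The same Jacobian can also be obtained with the built-in theano.gradient.jacobian helper, which builds essentially the same scan internally; a minimal standalone sketch:

import theano
import theano.tensor as T
from theano import function

x = T.dvector('x')
y = x ** 2
J = theano.gradient.jacobian(y, x)   # wraps a scan over the rows of the Jacobian
f = function([x], J)
print(f([4, 4]))                     # [[8. 0.], [0. 8.]]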


    def get_output_for(self, inputs, **kwargs):
        """
        Compute this layer's output function given a symbolic input variable

        Parameters
        ----------
        inputs : list of theano.TensorType
            `inputs[0]` should always be the symbolic input variable.
            `inputs[1]` is the symbolic output of the encoder, of shape
            ``(n_batch, n_time_steps, n_features)``; the attention mechanism
            attends over it at every decoding step.

        Returns
        -------
        layer_output : theano.TensorType
            Symbolic output variable.
        """
        # Retrieve the layer input
        input = inputs[0]
        encoder_output = inputs[1]

        # Treat all dimensions after the second as flattened feature dimensions
        if input.ndim > 3:
            input = T.flatten(input, 3)
            encoder_output = T.flatten(encoder_output, 3)

        # Because scan iterates over the first dimension we dimshuffle to
        # (n_time_steps, n_batch, n_features)
        input = input.dimshuffle(1, 0, 2)
        encoder_output = encoder_output.dimshuffle(1, 0, 2)
        seq_len, num_batch, _ = input.shape

        # Stack input weight matrices into a (num_inputs, 3*num_units)
        # matrix, which speeds up computation
        W_in_stacked = T.concatenate(
            [self.W_in_to_resetgate, self.W_in_to_updategate, self.W_in_to_hidden_update], axis=1
        )

        # Same for hidden weight matrices
        W_hid_stacked = T.concatenate(
            [self.W_hid_to_resetgate, self.W_hid_to_updategate, self.W_hid_to_hidden_update], axis=1
        )

        # Stack gate biases into a (3*num_units) vector
        b_stacked = T.concatenate([self.b_resetgate, self.b_updategate, self.b_hidden_update], axis=0)

        if self.precompute_input:
            # Precompute input*W_in outside of scan. W_in_stacked is
            # (n_features, 3*num_units), so input becomes
            # (n_batch, n_time_steps, 3*num_units).
            input = T.dot(input, W_in_stacked) + b_stacked

        # At each call to scan, input_n is (n_batch, 3*num_units) when the
        # input was precomputed, and (n_batch, n_features) otherwise. slice_w
        # extracts the slice corresponding to each GRU gate.
        def slice_w(x, n):
            return x[:, n * self.num_units : (n + 1) * self.num_units]

        # Create the single recurrent computation step function.
        # input_n is the input at time step n.
        def step(
            input_n,
            hid_previous,
            encoder_output,
            W_hid_stacked,
            W_in_stacked,
            b_stacked,
            W_att_enc,
            W_att_dec,
            W_att_out,
            W_out,
            b_out,
        ):
            # Compute W_{hr} h_{t - 1}, W_{hu} h_{t - 1}, and W_{hc} h_{t - 1}
            hid_input = T.dot(hid_previous, W_hid_stacked)

            if self.grad_clipping is not False:
                input_n = theano.gradient.grad_clip(input_n, -self.grad_clipping, self.grad_clipping)
                hid_input = theano.gradient.grad_clip(hid_input, -self.grad_clipping, self.grad_clipping)

            if not self.precompute_input:
                # Compute W_{xr}x_t + b_r, W_{xu}x_t + b_u, and W_{xc}x_t + b_c
                input_n = T.dot(input_n, W_in_stacked) + b_stacked

            # Reset and update gates
            resetgate = slice_w(hid_input, 0) + slice_w(input_n, 0)
            updategate = slice_w(hid_input, 1) + slice_w(input_n, 1)
            resetgate = self.nonlinearity_resetgate(resetgate)
            updategate = self.nonlinearity_updategate(updategate)

            # Compute W_{xc}x_t + r_t \odot (W_{hc} h_{t - 1})
            hidden_update_in = slice_w(input_n, 2)
            hidden_update_hid = slice_w(hid_input, 2)
            hidden_update = hidden_update_in + resetgate * hidden_update_hid
            if self.grad_clipping is not False:
                hidden_update = theano.gradient.grad_clip(hidden_update, -self.grad_clipping, self.grad_clipping)
            hidden_update = self.nonlinearity_hid(hidden_update)

            # Compute (1 - u_t)h_{t - 1} + u_t c_t
            hid = (1 - updategate) * hid_previous + updategate * hidden_update

            # # Add the attention
            hid += self.attention(encoder_output, hid_previous, W_att_enc, W_att_dec, W_att_out)

            # Compute the probas
            probs = T.nnet.softmax(T.dot(hid, W_out) + b_out)
            return [hid, probs]

        sequences = [input]
        step_fun = step

        if isinstance(self.hid_init, T.TensorVariable):
            hid_init = self.hid_init
        else:
            # Dot against a 1s vector to repeat to shape (num_batch, num_units)
            hid_init = T.dot(T.ones((num_batch, 1)), self.hid_init)

        # The hidden-to-hidden weight matrix is always used in step
        non_seqs = [encoder_output, W_hid_stacked]
        # When we aren't precomputing the input outside of scan, we need to
        # provide the input weights and biases to the step function
        if not self.precompute_input:
            non_seqs += [
                W_in_stacked,
                b_stacked,
                self.W_att_enc,
                self.W_att_dec,
                self.W_att_out,
                self.W_out,
                self.b_out,
            ]
        # theano.scan only allows for positional arguments, so when
        # self.precompute_input is True, we need to supply fake placeholder
        # arguments for the input weights and biases.
        else:
            non_seqs += [(), (), self.W_att_enc, self.W_att_dec, self.W_att_out, self.W_out, self.b_out]

        if self.unroll_scan:
            # Retrieve the dimensionality of the incoming layer
            input_shape = self.input_shapes[0]
            # Explicitly unroll the recurrence instead of using scan
            out, _ = unroll_scan(
                fn=step_fun,
                sequences=sequences,
                outputs_info=[hid_init],
                go_backwards=self.backwards,
                non_sequences=non_seqs,
                n_steps=input_shape[1],
            )
        else:
            # Scan op iterates over first dimension of input and repeatedly
            # applies the step function
            out, _ = theano.scan(
                fn=step_fun,
                sequences=sequences,
                go_backwards=self.backwards,
                outputs_info=[hid_init, None],
                non_sequences=non_seqs,
                truncate_gradient=self.gradient_steps,
                strict=True,
            )

        # dimshuffle back to (n_batch, n_time_steps, n_features)
        # hid_out = hid_out[0].dimshuffle(1, 0, 2)
        s_out = out[1]

        # # if scan is backward reverse the output
        # if self.backwards:
        #     out = out[:, ::-1, :]

        return s_out