Example #1
    def step(self, h, k_pre, w_pre, seq_cond, seq_cond_mask, mask=None):
        # act: (batch_size, 3*n_mixt)
        act = T.exp(T.dot(h, self.w_cond) + self.b_cond)

        a = act[:, :self.n_mixt]
        b = act[:, self.n_mixt:2*self.n_mixt]
        k = k_pre + self.position_gap * act[:, -self.n_mixt:]

        # u: (length_cond_sequence, 1)
        u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 1)
        # temp: (length_cond_sequence, batch_size)
        temp = ((-b[:, 0] * (k[:, 0] - u) ** 2) * seq_cond_mask
                - 1000 * (1 - seq_cond_mask))
        phi = T.nnet.softmax(temp.T).T
        # phi: (length_cond_sequence, batch_size)
        phi *= seq_cond_mask

        # w: (batch_size, condition_n_features)
        w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

        if mask is not None:
            k = mask[:, None]*k + (1-mask[:, None])*k_pre
            w = mask[:, None]*w + (1-mask[:, None])*w_pre

        if self.grad_clip:
            w = grad_clip(w, -self.grad_clip, self.grad_clip)

        return a, k, phi, w
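
Example #1 normalises its attention weights with a masked softmax: padded positions of the conditioning sequence get a large negative score (-1000) so they receive essentially zero weight, and the result is re-masked afterwards. A small standalone sketch of that trick; the shapes and toy inputs are illustrative, not taken from the source:

import numpy as np
import theano
import theano.tensor as T

floatX = theano.config.floatX
scores = T.matrix('scores')   # (batch, length): unnormalised attention scores
mask = T.matrix('mask')       # (batch, length): 1 for real positions, 0 for padding
masked = scores * mask - 1000.0 * (1.0 - mask)
weights = T.nnet.softmax(masked) * mask   # padded positions end up exactly 0

f = theano.function([scores, mask], weights)
print(f(np.ones((1, 3), dtype=floatX),
        np.array([[1, 1, 0]], dtype=floatX)))   # approx. [[0.5, 0.5, 0.]]
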
Example #2
    def get_output_for(self, input, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)
        activation = T.dot(input, self.W)
        if self.b is not None:
            activation = activation + self.b.dimshuffle('x', 0)
        if self.grad_clipping:
            activation = grad_clip(activation, -self.grad_clipping,
                                   self.grad_clipping)
        return self.nonlinearity(activation)


# def sgd_with_grad_clipping(loss_or_grads, params, learning_rate, rescale):
#     grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params)
#     updates = OrderedDict()
#
#     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
#     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
#     grad_norm = T.sqrt(grad_norm)
#     scaling_num = rescale
#     scaling_den = T.maximum(rescale, grad_norm)
#     for n, (param, grad) in enumerate(zip(params, grads)):
#         grad = T.switch(not_finite, 0.1 * param,
#             grad * (scaling_num / scaling_den))
#         updates[param] = param - learning_rate * grad
#     return updates

# max_norm = 5.0
# grads = theano.gradient(loss, params)
# grads = [lasagne.updates.norm_constraint(grad, max_norm, range(grad.ndim))
#          for grad in grads]
# updates = lasagne.updates.whatever(grads, params)
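
Example #2 wraps a dense layer's pre-activation in grad_clip. A minimal standalone sketch (only Theano is assumed) of what theano.gradient.grad_clip does: the forward value passes through unchanged, while the gradient flowing back through the node is clipped to the given bounds, which is what the test in Example #6 below asserts.

import theano
import theano.tensor as T
from theano.gradient import grad_clip

x = T.scalar('x')
y = grad_clip(x, -1.0, 1.0) ** 2     # forward value is still x ** 2
g = T.grad(y, x)                     # d(x**2)/dx = 2x, clipped to [-1, 1]

f = theano.function([x], [y, g])
print(f(2.0))                        # -> [array(4.0), array(1.0)]
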
Example #3
    def define_train_test_funcs(self):
        # pYs = T.reshape(self.activation, (self.mask_y.shape[0] * self.batch_size, self.out_size))
        pYs_y = T.reshape(self.activation_dim_y, (self.mask_y.shape[0] * self.batch_size, self.dim_y))
        pYs_pos = T.reshape(self.activation_dim_pos, (self.mask_y.shape[0] * self.batch_size, self.dim_pos))
        # tYs =  T.reshape(self.X, (self.mask.shape[0] * self.batch_size, self.out_size))
        tYs_y = T.reshape(self.Y_y, (self.mask_y.shape[0] * self.batch_size, self.dim_y))
        tYs_pos = T.reshape(self.Y_pos, (self.mask_y.shape[0] * self.batch_size, self.dim_pos))
        # tYs =  T.reshape(self.Y, (self.mask_y.shape[0] * self.batch_size, self.out_size))
        cost_y = self.categorical_crossentropy(pYs_y, tYs_y)
        cost_pos = self.categorical_crossentropy(pYs_pos, tYs_pos)

        cost = cost_y + cost_pos

        gparams = []
        for param in self.params:
            gparam = T.grad(grad_clip(cost, -5.0, +5.0), param)
            gparams.append(gparam)

        lr = T.scalar("lr")
        # eval(): string to function
        optimizer = eval(self.optimizer)
        updates = optimizer(self.params, gparams, lr)

        #updates = sgd(self.params, gparams, lr)
        #updates = momentum(self.params, gparams, lr)
        #updates = rmsprop(self.params, gparams, lr)
        #updates = adagrad(self.params, gparams, lr)
        #updates = adadelta(self.params, gparams, lr)
        #updates = adam(self.params, gparams, lr)
        
        self.train = theano.function(inputs = [self.X, self.Y_y, self.Y_pos, self.mask, self.mask_y, lr, self.batch_size],
                                               givens = {self.is_train : np.cast['int32'](1)},
                                               outputs = [cost,cost_y,cost_pos, self.activation_dim_y, self.activation_dim_pos],
                                               updates = updates)
    def get_output_for(self, input, **kwargs):
        if input.ndim > 2:
            # if the input has more than two dimensions, flatten it into a
            # batch of feature vectors.
            input = input.flatten(2)
        activation = T.dot(input, self.W)
        if self.b is not None:
            activation = activation + self.b.dimshuffle('x', 0)
        if self.grad_clipping:
            activation = grad_clip(activation, -self.grad_clipping, self.grad_clipping)
        return self.nonlinearity(activation)
# def sgd_with_grad_clipping(loss_or_grads, params, learning_rate, rescale):
#     grads = lasagne.updates.get_or_compute_grads(loss_or_grads, params)
#     updates = OrderedDict()
#
#     grad_norm = T.sqrt(sum(map(lambda x: T.sqr(x).sum(), grads)))
#     not_finite = T.or_(T.isnan(grad_norm), T.isinf(grad_norm))
#     grad_norm = T.sqrt(grad_norm)
#     scaling_num = rescale
#     scaling_den = T.maximum(rescale, grad_norm)
#     for n, (param, grad) in enumerate(zip(params, grads)):
#         grad = T.switch(not_finite, 0.1 * param,
#             grad * (scaling_num / scaling_den))
#         updates[param] = param - learning_rate * grad
#     return updates

# max_norm = 5.0
# grads = theano.gradient(loss, params)
# grads = [lasagne.updates.norm_constraint(grad, max_norm, range(grad.ndim))
#          for grad in grads]
# updates = lasagne.updates.whatever(grads, params)
Example #5
    def step(self,
             inputs,
             h_pre,
             k_pre,
             w_pre,
             seq_cond,
             seq_cond_mask,
             mask=None):
        """
        A single timestep.

        Parameters
        ----------
        inputs: (batch_size, n_in)
        h_pre: (batch_size, n_hidden)
        mask: (batch_size,)

        k_pre: (batch_size, n_mixt)
        w_pre: (batch_size, n_in_cond)

        seq_cond: (length_cond_sequence, batch_size, n_in_cond)
        seq_cond_mask: (length_cond_sequence, batch_size)
        """
        # inputs: (batch_size, n_in + n_in_cond)
        inputs = T.concatenate([inputs, w_pre], axis=1)

        # h: (batch_size, n_hidden)
        h = self.layer.step(inputs, h_pre, mask=mask, process_inputs=True)

        # act: (batch_size, 3*n_mixt)
        act = T.exp(T.dot(h, self.w_cond) + self.b_cond)

        a = act[:, :self.n_mixt]
        b = act[:, self.n_mixt:2 * self.n_mixt]
        k = k_pre + 0.1 * act[:, -self.n_mixt:]

        # u: (length_cond_sequence, 1, 1)
        u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 2)
        # phi: (length_cond_sequence, batch_size, n_mixt)
        phi = T.sum(a * T.exp(-b * (k - u)**2), axis=-1)
        # phi: (length_cond_sequence, batch_size)
        phi = phi * seq_cond_mask

        # w: (batch_size, n_chars)
        w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

        if mask is not None:
            k = mask[:, None] * k + (1 - mask[:, None]) * k_pre
            w = mask[:, None] * w + (1 - mask[:, None]) * w_pre

        w = grad_clip(w, -100, 100)

        return h, a, k, phi, w
Example #6
def test_grad_clip():
    x = theano.tensor.scalar()

    z = theano.tensor.grad(gradient.grad_clip(x, -1, 1) ** 2, x)
    z2 = theano.tensor.grad(x ** 2, x)

    f = theano.function([x], outputs=[z, z2])

    if theano.config.mode != "FAST_COMPILE":
        topo = f.maker.fgraph.toposort()
        assert not any([isinstance(node.op, gradient.GradClip) for node in topo])
    out = f(2.0)
    assert np.allclose(out, (1, 4))
    assert not np.allclose(out[0], out[1])
def test_grad_clip():
    x = theano.tensor.scalar()

    z = theano.tensor.grad(gradient.grad_clip(x, -1, 1)**2, x)
    z2 = theano.tensor.grad(x**2, x)

    f = theano.function([x], outputs=[z, z2])

    if config.mode != "FAST_COMPILE":
        topo = f.maker.fgraph.toposort()
        assert not any(
            [isinstance(node.op, gradient.GradClip) for node in topo])
    out = f(2.0)
    assert np.allclose(out, (1, 4))
    assert not np.allclose(out[0], out[1])
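
The tests above exercise grad_clip placed inside the computation graph. The commented-out sgd_with_grad_clipping earlier on this page instead rescales the gradients by their norm after T.grad; a simpler element-wise variant is sketched here, assuming only a scalar loss expression and a list of shared parameters. The helper name is hypothetical.

import theano.tensor as T

def clipped_sgd_updates(loss, params, learning_rate=0.01, bound=5.0):
    # hypothetical helper: clip each gradient element-wise, then take a plain SGD step
    grads = T.grad(loss, params)
    clipped = [T.clip(g, -bound, bound) for g in grads]
    return [(p, p - learning_rate * g) for p, g in zip(params, clipped)]
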
Example #8
    def step(self, inputs, h_pre, k_pre, w_pre, seq_cond, seq_cond_mask,
             mask=None):
        """
        A single timestep.

        Parameters
        ----------
        inputs: (batch_size, n_in)
        h_pre: (batch_size, n_hidden)
        mask: (batch_size,)

        k_pre: (batch_size, n_mixt)
        w_pre: (batch_size, n_in_cond)

        seq_cond: (length_cond_sequence, batch_size, n_in_cond)
        seq_cond_mask: (length_cond_sequence, batch_size)
        """
        # inputs: (batch_size, n_in + n_in_cond)
        inputs = T.concatenate([inputs, w_pre], axis=1)

        # h: (batch_size, n_hidden)
        h = self.layer.step(inputs, h_pre, mask=mask, process_inputs=True)

        # act: (batch_size, 3*n_mixt)
        act = T.exp(T.dot(h, self.w_cond) + self.b_cond)

        a = act[:, :self.n_mixt]
        b = act[:, self.n_mixt:2*self.n_mixt]
        k = k_pre + 0.1*act[:, -self.n_mixt:]

        # u: (length_cond_sequence, 1, 1)
        u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 2)
        # phi: (length_cond_sequence, batch_size, n_mixt)
        phi = T.sum(a * T.exp(-b * (k-u)**2), axis=-1)
        # phi: (length_cond_sequence, batch_size)
        phi = phi * seq_cond_mask

        # w: (batch_size, n_chars)
        w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

        if mask is not None:
            k = mask[:, None]*k + (1-mask[:, None])*k_pre
            w = mask[:, None]*w + (1-mask[:, None])*w_pre

        w = grad_clip(w, -100, 100)

        return h, a, k, phi, w
Example #9
    def define_train_test_funcs(self):
        # pYs = T.reshape(self.activation, (self.mask_y.shape[0] * self.batch_size, self.out_size))
        pYs_y = T.reshape(self.activation_dim_y,
                          (self.mask_y.shape[0] * self.batch_size, self.dim_y))
        pYs_pos = T.reshape(
            self.activation_dim_pos,
            (self.mask_y.shape[0] * self.batch_size, self.dim_pos))
        # tYs =  T.reshape(self.X, (self.mask.shape[0] * self.batch_size, self.out_size))
        tYs_y = T.reshape(self.Y_y,
                          (self.mask_y.shape[0] * self.batch_size, self.dim_y))
        tYs_pos = T.reshape(
            self.Y_pos, (self.mask_y.shape[0] * self.batch_size, self.dim_pos))
        # tYs =  T.reshape(self.Y, (self.mask_y.shape[0] * self.batch_size, self.out_size))
        cost_y = self.categorical_crossentropy(pYs_y, tYs_y)
        cost_pos = self.categorical_crossentropy(pYs_pos, tYs_pos)

        cost = cost_y + cost_pos

        gparams = []
        for param in self.params:
            gparam = T.grad(grad_clip(cost, -5.0, +5.0), param)
            gparams.append(gparam)

        lr = T.scalar("lr")
        # eval(): string to function
        optimizer = eval(self.optimizer)
        updates = optimizer(self.params, gparams, lr)

        #updates = sgd(self.params, gparams, lr)
        #updates = momentum(self.params, gparams, lr)
        #updates = rmsprop(self.params, gparams, lr)
        #updates = adagrad(self.params, gparams, lr)
        #updates = adadelta(self.params, gparams, lr)
        #updates = adam(self.params, gparams, lr)

        self.train = theano.function(
            inputs=[
                self.X, self.Y_y, self.Y_pos, self.mask, self.mask_y, lr,
                self.batch_size
            ],
            givens={self.is_train: np.cast['int32'](1)},
            outputs=[
                cost, cost_y, cost_pos, self.activation_dim_y,
                self.activation_dim_pos
            ],
            updates=updates)
def clipped_nesterov_momentum(loss, all_params, learning_rate, 
                              clip_range, momentum=0.9):
    # Adapted from Lasagne/lasagne/updates.py
    all_grads = theano.grad(grad_clip(loss, clip_range[0], clip_range[1]),
                            all_params)

    updates = []

    for param_i, grad_i in zip(all_params, all_grads):
        mparam_i = theano.shared(np.zeros(param_i.get_value().shape,
                                          dtype=theano.config.floatX))
        v = momentum * mparam_i - learning_rate * grad_i  # new momentum
        w = param_i + momentum * v - learning_rate * grad_i  # new param values
        updates.append((mparam_i, v))
        updates.append((param_i, w))

    return updates
def clipped_nesterov_momentum(loss,
                              all_params,
                              learning_rate,
                              clip_range,
                              momentum=0.9):
    # Adapted from Lasagne/lasagne/updates.py
    all_grads = theano.grad(grad_clip(loss, clip_range[0], clip_range[1]),
                            all_params)

    updates = []

    for param_i, grad_i in zip(all_params, all_grads):
        mparam_i = theano.shared(
            np.zeros(param_i.get_value().shape, dtype=theano.config.floatX))
        v = momentum * mparam_i - learning_rate * grad_i  # new momentum
        w = param_i + momentum * v - learning_rate * grad_i  # new param values
        updates.append((mparam_i, v))
        updates.append((param_i, w))

    return updates
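
A possible way to wire clipped_nesterov_momentum into a training function; everything here except the helper itself and theano is a placeholder for whatever the surrounding model defines (`loss` a scalar cost expression, `params` its shared variables, `x` and `y` its symbolic inputs):

# hypothetical usage of the helper above
updates = clipped_nesterov_momentum(loss, params, learning_rate=0.01,
                                    clip_range=(-5.0, 5.0), momentum=0.9)
train_fn = theano.function([x, y], loss, updates=updates)
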
Example #12
    def step(self, inputs, h_pre, mask=None, process_inputs=False):
        """
        A single timestep.

        Parameters
        ----------
        inputs: (batch_size, n_in)
        h_pre: (batch_size, n_hidden)
        mask: (batch_size,)

        process_inputs: bool
            If true, will process the input.
            If possible, it is better to process the whole input sequence
            beforehand. But sometimes this is not suitable, for example at
            prediction time.
        """
        n_out = h_pre.shape[1]

        if process_inputs:
            inputs = self.precompute_inputs(inputs)

        h_input = T.dot(h_pre, self.w_hid)

        sig = T.nnet.sigmoid

        gates = sig(inputs[:, :2 * n_out] + h_input[:, :2 * n_out])
        r_gate = gates[:, :n_out]
        u_gate = gates[:, n_out:2 * n_out]
        h_new = T.tanh(inputs[:, 2 * n_out:] + r_gate * h_input[:, 2 * n_out:])

        h = (1 - u_gate) * h_pre + u_gate * h_new

        if mask is not None:
            h = T.switch(mask[:, None], h, h_pre)

        if self.grad_clipping:
            h = grad_clip(h, -self.grad_clipping, self.grad_clipping)

        return h
Example #13
    def step(self, inputs, h_pre, mask=None, process_inputs=False):
        """
        A single timestep.

        Parameters
        ----------
        inputs: (batch_size, n_in)
        h_pre: (batch_size, n_hidden)
        mask: (batch_size,)

        process_inputs: bool
            If true, will process the input.
            If possible, it is better to process the whole input sequence
            beforehand. But sometimes this is not suitable, for example at
            prediction time.
        """
        n_out = h_pre.shape[1]

        if process_inputs:
            inputs = self.precompute_inputs(inputs)

        h_input = T.dot(h_pre, self.w_hid)

        sig = T.nnet.sigmoid

        gates = sig(inputs[:, :2*n_out] + h_input[:, :2*n_out])
        r_gate = gates[:, :n_out]
        u_gate = gates[:, n_out:2*n_out]
        h_new = T.tanh(inputs[:, 2*n_out:] + r_gate * h_input[:, 2*n_out:])

        h = (1-u_gate)*h_pre + u_gate*h_new

        if mask is not None:
            h = T.switch(mask[:, None], h, h_pre)

        h = grad_clip(h, -self.grad_clipping, self.grad_clipping)

        return h
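
Examples #12 and #13 clip the recurrent hidden state inside a single step function. A small self-contained sketch, not taken from either example, of how such a step is typically driven by theano.scan, with grad_clip bounding only the gradient that flows through the recurrence:

import numpy as np
import theano
import theano.tensor as T
from theano.gradient import grad_clip

floatX = theano.config.floatX
n_in, n_hid = 3, 4
W_in = theano.shared(0.01 * np.random.randn(n_in, n_hid).astype(floatX), name='W_in')
W_hid = theano.shared(0.01 * np.random.randn(n_hid, n_hid).astype(floatX), name='W_hid')

X = T.tensor3('X')                                  # (time, batch, n_in)
h0 = T.zeros((X.shape[1], n_hid), dtype=floatX)     # initial hidden state, (batch, n_hid)

def step(x_t, h_pre):
    h = T.tanh(T.dot(x_t, W_in) + T.dot(h_pre, W_hid))
    # forward value of h is unchanged; only the gradient through the recurrence is clipped
    return grad_clip(h, -1.0, 1.0)

H, _ = theano.scan(step, sequences=X, outputs_info=h0)
loss = H[-1].sum()
grads = T.grad(loss, [W_in, W_hid])                 # gradients now pass through GradClip nodes
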
Example #14
    def step(self, h, k_pre, w_pre, seq_cond, seq_cond_mask, mask=None):
        # act: (batch_size, 3*n_mixt)
        act = T.dot(h, self.w_cond) + self.b_cond

        if not self.normalize_att:
            act = T.exp(act)
            a = act[:, :self.n_mixt]
            b = act[:, self.n_mixt:2*self.n_mixt]
            k = k_pre + self.position_gap * act[:, -self.n_mixt:]

        else:
            a = T.nnet.softmax(act[:, :self.n_mixt])
            b = 2. + 2. * T.tanh(act[:, self.n_mixt:2 * self.n_mixt])
            k = k_pre + self.position_gap * (
                2. + 2. * T.tanh(act[:, self.n_mixt:2 * self.n_mixt]))

        # u: (length_cond_sequence, 1, 1)
        u = T.shape_padright(T.arange(seq_cond.shape[0], dtype=floatX), 2)
        # phi: (length_cond_sequence, batch_size, n_mixt)
        phi = T.sum(a * T.exp(-b * (k-u)**2), axis=-1)
        # phi: (length_cond_sequence, batch_size)
        phi *= seq_cond_mask

        # # TODO (not in Graves)
        # phi = phi * seq_cond_mask + -1000*(1-seq_cond_mask)
        # phi = T.nnet.softmax(phi.T).T * seq_cond_mask

        # w: (batch_size, condition_n_features)
        w = T.sum(T.shape_padright(phi) * seq_cond, axis=0)

        if mask is not None:
            k = mask[:, None]*k + (1-mask[:, None])*k_pre
            w = mask[:, None]*w + (1-mask[:, None])*w_pre

        if self.grad_clip:
            w = grad_clip(w, -self.grad_clip, self.grad_clip)

        return a, k, phi, w
Example #15
    def __init__(self,
                 inputs=None,
                 hiddens=None,
                 params=None,
                 outdir='outputs/lstm/',
                 activation='relu',
                 gate_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform',
                 weights_interval='montreal',
                 weights_mean=0,
                 weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity',
                 r_weights_interval='montreal',
                 r_weights_mean=0,
                 r_weights_std=5e-3,
                 r_bias_init=0.0,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize an LSTM.

        Parameters
        ----------
        inputs : List of [tuple(shape, `Theano.TensorType`)]
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. `inputs` variables are expected to be of the form (timesteps, batch, data).
            `shape` will be a monad tuple representing known
            sizes for each dimension in the `Theano.TensorType`. The length of `shape` should be equal to number of
            dimensions in `Theano.TensorType`, where the shape element is an integer representing the size for its
            dimension, or None if the shape isn't known. For example, if you have a matrix with unknown batch size
            but fixed feature size of 784, `shape` would be: (None, 784). The full form of `inputs` would be:
            [((None, 784), <TensorType(float32, matrix)>)].
        hiddens : int or Tuple of (shape, `Theano.TensorType`)
            Int for the number of hidden units to use, or a tuple of shape, expression to route the starting
            hidden values from elsewhere.
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The location to produce outputs from training or running the :class:`LSTM`. If None, nothing will be saved.
        activation : str or callable
            The nonlinear (or linear) activation to perform for the hidden units.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        gate_activation : str or callable
            The activation to perform for the hidden gates (default sigmoid).
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing input-hidden model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent hidden-hidden model weights.
            See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        direction : str
            The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
            'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
            computing two sets of hiddens and adding them together.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.
        """
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(LSTM, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        backward = direction.lower() == 'backward'
        bidirectional = direction.lower() == 'bidirectional'

        ########################
        # activation functions #
        ########################
        # recurrent hidden activation functions!
        self.hidden_activation_func = get_activation_function(activation)
        self.gate_activation_func = get_activation_function(gate_activation)

        ##########
        # inputs #
        ##########
        # inputs are expected to have the shape (n_timesteps, batch_size, data)
        if len(self.inputs) > 1:
            raise NotImplementedError(
                "Expected 1 input, found %d. Please merge inputs before passing "
                "to the model!" % len(self.inputs))
        # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
        input_shape, self.input = self.inputs[0]
        if isinstance(input_shape, int):
            self.input_size = ((None, ) *
                               (self.input.ndim - 1)) + (input_shape, )
        else:
            self.input_size = input_shape
        assert self.input_size is not None, "Need to specify the shape for at least the last dimension of the input!"
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.input.ndim == 1:
            self.input = unbroadcast(self.input.dimshuffle(0, 'x', 'x'),
                                     [1, 2])

        elif self.input.ndim == 2:
            self.input = unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

        elif self.input.ndim > 3:
            self.input = self.input.flatten(3)
            self.input_size = self.input_size[:2] + (prod(self.input_size[2:]),)

        ###########
        # hiddens #
        ###########
        # have only 1 hiddens
        assert len(
            self.hiddens) == 1, "Expected 1 `hiddens` param, found %d" % len(
                self.hiddens)
        self.hiddens = self.hiddens[0]
        # if hiddens is an int (hidden size parameter, not routing info)
        h_init = None
        if isinstance(self.hiddens, int):
            self.hidden_size = self.hiddens
        elif isinstance(self.hiddens, tuple):
            hidden_shape, h_init = self.hiddens
            if isinstance(hidden_shape, int):
                self.hidden_size = hidden_shape
            else:
                self.hidden_size = hidden_shape[-1]
        else:
            raise AssertionError(
                "Hiddens need to be an int or tuple of (shape, theano_expression), found %s"
                % type(self.hiddens))

        # output shape is going to be 3D with (timesteps, batch_size, hidden_size)
        self.output_size = (None, None, self.hidden_size)

        ##########################################################
        # parameters - make sure to deal with params dict input! #
        ##########################################################
        # all input-to-hidden weights
        W_c, W_i, W_f, W_o = [
            self.params.get(
                "W_%s" % sub,
                get_weights(
                    weights_init=weights_init,
                    shape=(self.input_size[-1], self.hidden_size),
                    name="W_%s" % sub,
                    # if gaussian
                    mean=weights_mean,
                    std=weights_std,
                    # if uniform
                    interval=weights_interval))
            for sub in ['c', 'i', 'f', 'o']
        ]
        # all hidden-to-hidden weights
        U_c, U_i, U_f, U_o = [
            self.params.get(
                "U_%s" % sub,
                get_weights(
                    weights_init=r_weights_init,
                    shape=(self.hidden_size, self.hidden_size),
                    name="U_%s" % sub,
                    # if gaussian
                    mean=r_weights_mean,
                    std=r_weights_std,
                    # if uniform
                    interval=r_weights_interval))
            for sub in ['c', 'i', 'f', 'o']
        ]
        # if bidirectional, make hidden-to-hidden weights again to go the opposite direction
        U_c_b, U_i_b, U_f_b, U_o_b = None, None, None, None
        if bidirectional:
            U_c_b, U_i_b, U_f_b, U_o_b = [
                self.params.get(
                    "U_%s_b" % sub,
                    get_weights(
                        weights_init=r_weights_init,
                        shape=(self.hidden_size, self.hidden_size),
                        name="U_%s_b" % sub,
                        # if gaussian
                        mean=r_weights_mean,
                        std=r_weights_std,
                        # if uniform
                        interval=r_weights_interval))
                for sub in ['c', 'i', 'f', 'o']
            ]
        # biases
        b_c, b_i, b_f, b_o = [
            self.params.get(
                "b_%s" % sub,
                get_bias(shape=(self.hidden_size, ),
                         name="b_%s" % sub,
                         init_values=r_bias_init))
            for sub in ['c', 'i', 'f', 'o']
        ]
        # clip gradients if we are doing that
        recurrent_params = [U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b]
        if clip_recurrent_grads:
            clip = abs(clip_recurrent_grads)
            U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b = [
                grad_clip(param, -clip, clip) if param is not None else None
                for param in recurrent_params
            ]

        # put all the parameters into our dictionary
        self.params = {
            "W_c": W_c,
            "W_i": W_i,
            "W_f": W_f,
            "W_o": W_o,
            "U_c": U_c,
            "U_i": U_i,
            "U_f": U_f,
            "U_o": U_o,
            "b_c": b_c,
            "b_i": b_i,
            "b_f": b_f,
            "b_o": b_o,
        }
        if bidirectional:
            self.params.update({
                "U_c_b": U_c_b,
                "U_i_b": U_i_b,
                "U_f_b": U_f_b,
                "U_o_b": U_o_b,
            })

        # make h_init the right sized tensor
        if h_init is None:
            h_init = zeros_like(dot(self.input[0], W_c))

        c_init = zeros_like(dot(self.input[0], W_c))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_c = dot(self.input, W_c) + b_c
        x_i = dot(self.input, W_i) + b_i
        x_f = dot(self.input, W_f) + b_f
        x_o = dot(self.input, W_o) + b_o

        # now do the recurrent stuff
        (self.hiddens,
         _), self.updates = scan(fn=self.recurrent_step,
                                 sequences=[x_c, x_i, x_f, x_o],
                                 outputs_info=[h_init, c_init],
                                 non_sequences=[U_c, U_i, U_f, U_o],
                                 go_backwards=backward,
                                 name="lstm_scan",
                                 strict=True)

        # if bidirectional, do the same in reverse!
        if bidirectional:
            (hiddens_b,
             _), updates_b = scan(fn=self.recurrent_step,
                                  sequences=[x_c, x_i, x_f, x_o],
                                  outputs_info=[h_init, c_init],
                                  non_sequences=[U_c_b, U_i_b, U_f_b, U_o_b],
                                  go_backwards=not backward,
                                  name="lstm_scan_back",
                                  strict=True)
            # flip the hiddens to be the right direction
            hiddens_b = hiddens_b[::-1]
            # update stuff
            self.updates.update(updates_b)
            self.hiddens += hiddens_b

        log.info("Initialized an LSTM!")
Example #16
    # Calculate final hidden state value
    h = (1.0 - z) * h_t_1 + z * h_d
    y = (th.dot(Y, h) + b_y)
    return h, softmax(y)

    #def predict(, x_vec):
        # return symbolic output of th pass
[h, out], _ = th.scan(gru_step, sequences=x, outputs_info=[h0, None])

#error = ((out - y)**2).sum()
error = t.nnet.categorical_crossentropy(out, y).sum()

# Define the symbolic updates as a list of (param, new_value) tuples
# (an adagrad variant is left commented out below)
grads = t.grad(error, params)
#param_grads = grads
# note: theano.gradient.grad_clip only affects gradients *of* this expression,
# not its value; t.clip(grad, -5, 5) would bound the gradient values themselves
param_grads = [grad_clip(grad, -5, 5) for grad in grads]

# new_grad_hists = [g_hist + g ** 2 for g_hist, g in zip(grad_hists, param_grads)]

# param_updates = [
#     (param, param - gamma * param_grad/t.sqrt(g_hist + 1e-8))
#     for param, param_grad, g_hist in zip(params, param_grads, grad_hists)
# ]

# Implementing gradient clipping here
param_updates = [
    (param, param - 0.01 * param_grad)
    for param, param_grad in zip(params, param_grads)
]
updates = param_updates
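
A hedged continuation of this fragment, not part of the original snippet: compiling the updates above into a callable training step, reusing the snippet's own aliases (`th` for theano) and its symbolic `x`, `y` and `error`.

# hypothetical: x and y are the symbolic input/target sequences defined earlier in this code
train_step = th.function([x, y], error, updates=updates,
                         allow_input_downcast=True)
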
Example #17
    def __init__(self, inputs=None, hiddens=None, params=None, outdir='outputs/lstm/',
                 activation='relu', gate_activation='sigmoid',
                 mrg=RNG_MRG.MRG_RandomStreams(1),
                 weights_init='uniform', weights_interval='glorot', weights_mean=0, weights_std=5e-3,
                 bias_init=0.0,
                 r_weights_init='identity', r_weights_interval='glorot', r_weights_mean=0, r_weights_std=5e-3,
                 r_bias_init=0.0,
                 direction='forward',
                 clip_recurrent_grads=False):
        """
        Initialize an LSTM.

        Parameters
        ----------
        inputs : List of [tuple(shape, `Theano.TensorType`)]
            The dimensionality of the inputs for this model, and the routing information for the model
            to accept inputs from elsewhere. `inputs` variables are expected to be of the form (timesteps, batch, data).
            `shape` will be a monad tuple representing known
            sizes for each dimension in the `Theano.TensorType`. The length of `shape` should be equal to number of
            dimensions in `Theano.TensorType`, where the shape element is an integer representing the size for its
            dimension, or None if the shape isn't known. For example, if you have a matrix with unknown batch size
            but fixed feature size of 784, `shape` would be: (None, 784). The full form of `inputs` would be:
            [((None, 784), <TensorType(float32, matrix)>)].
        hiddens : int or Tuple of (shape, `Theano.TensorType`)
            Int for the number of hidden units to use, or a tuple of shape, expression to route the starting
            hidden values from elsewhere.
        params : Dict(string_name: theano SharedVariable), optional
            A dictionary of model parameters (shared theano variables) that you should use when constructing
            this model (instead of initializing your own shared variables). This parameter is useful when you want to
            have two versions of the model that use the same parameters - such as siamese networks or pretraining some
            weights.
        outdir : str
            The location to produce outputs from training or running the :class:`LSTM`. If None, nothing will be saved.
        activation : str or callable
            The nonlinear (or linear) activation to perform for the hidden units.
            This activation function should be appropriate for the output unit types, i.e. 'sigmoid' for binary.
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        gate_activation : str or callable
            The activation to perform for the hidden gates (default sigmoid).
            See opendeep.utils.activation for a list of available activation functions. Alternatively, you can pass
            your own function to be used as long as it is callable.
        mrg : random
            A random number generator that is used when adding noise.
            I recommend using Theano's sandbox.rng_mrg.MRG_RandomStreams.
        weights_init : str
            Determines the method for initializing input-hidden model weights. See opendeep.utils.nnet for options.
        weights_interval : str or float
            If Uniform `weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        weights_mean : float
            If Gaussian `weights_init`, the mean value to use.
        weights_std : float
            If Gaussian `weights_init`, the standard deviation to use.
        bias_init : float
            The initial value to use for the bias parameter. Most often, the default of 0.0 is preferred.
        r_weights_init : str
            Determines the method for initializing recurrent hidden-hidden model weights.
            See opendeep.utils.nnet for options.
        r_weights_interval : str or float
            If Uniform `r_weights_init`, the +- interval to use. See opendeep.utils.nnet for options.
        r_weights_mean : float
            If Gaussian `r_weights_init`, the mean value to use.
        r_weights_std : float
            If Gaussian `r_weights_init`, the standard deviation to use.
        r_bias_init : float
            The initial value to use for the recurrent bias parameter. Most often, the default of 0.0 is preferred.
        direction : str
            The direction this recurrent model should go over its inputs. Can be 'forward', 'backward', or
            'bidirectional'. In the case of 'bidirectional', it will make two passes over the sequence,
            computing two sets of hiddens and adding them together.
        clip_recurrent_grads : False or float, optional
            Whether to clip the gradients for the parameters that unroll over timesteps (such as the weights
            connecting previous hidden states to the current hidden state, and not the weights from current
            input to hiddens). If it is a float, the gradients for the weights will be hard clipped to the range
            `+-clip_recurrent_grads`.
        """
        initial_parameters = locals().copy()
        initial_parameters.pop('self')
        super(LSTM, self).__init__(**initial_parameters)

        ##################
        # specifications #
        ##################
        backward = direction.lower() == 'backward'
        bidirectional = direction.lower() == 'bidirectional'

        ########################
        # activation functions #
        ########################
        # recurrent hidden activation functions!
        self.hidden_activation_func = get_activation_function(activation)
        self.gate_activation_func = get_activation_function(gate_activation)

        ##########
        # inputs #
        ##########
        # inputs are expected to have the shape (n_timesteps, batch_size, data)
        if len(self.inputs) > 1:
            raise NotImplementedError("Expected 1 input, found %d. Please merge inputs before passing "
                                      "to the model!" % len(self.inputs))
        # self.inputs is a list of all the input expressions (we enforce only 1, so self.inputs[0] is the input)
        input_shape, self.input = self.inputs[0]
        if isinstance(input_shape, int):
            self.input_size = ((None,) * (self.input.ndim - 1)) + (input_shape,)
        else:
            self.input_size = input_shape
        assert self.input_size is not None, "Need to specify the shape for at least the last dimension of the input!"
        # input is 3D tensor of (timesteps, batch_size, data_dim)
        # if input is 2D tensor, assume it is of the form (timesteps, data_dim) i.e. batch_size is 1. Convert to 3D.
        # if input is > 3D tensor, assume it is of form (timesteps, batch_size, data...) and flatten to 3D.
        if self.input.ndim == 1:
            self.input = unbroadcast(self.input.dimshuffle(0, 'x', 'x'), [1, 2])

        elif self.input.ndim == 2:
            self.input = unbroadcast(self.input.dimshuffle(0, 'x', 1), 1)

        elif self.input.ndim > 3:
            flat_in = Flatten((self.input_size, self.input), ndim=3)
            self.input = flat_in.get_outputs()
            self.input_size = flat_in.output_size

        ###########
        # hiddens #
        ###########
        # have only 1 hiddens
        assert len(self.hiddens) == 1, "Expected 1 `hiddens` param, found %d" % len(self.hiddens)
        self.hiddens = self.hiddens[0]
        # if hiddens is an int (hidden size parameter, not routing info)
        h_init = None
        if isinstance(self.hiddens, int):
            self.hidden_size = self.hiddens
        elif isinstance(self.hiddens, tuple):
            hidden_shape, h_init = self.hiddens
            if isinstance(hidden_shape, int):
                self.hidden_size = hidden_shape
            else:
                self.hidden_size = hidden_shape[-1]
        else:
            raise AssertionError("Hiddens need to be an int or tuple of (shape, theano_expression), found %s" %
                                 type(self.hiddens))

        # output shape is going to be 3D with (timesteps, batch_size, hidden_size)
        self.output_size = (None, None, self.hidden_size)

        ##########################################################
        # parameters - make sure to deal with params dict input! #
        ##########################################################
        # all input-to-hidden weights
        W_c, W_i, W_f, W_o = [
            self.params.get(
                "W_%s" % sub,
                get_weights(weights_init=weights_init,
                            shape=(self.input_size[-1], self.hidden_size),
                            name="W_%s" % sub,
                            # if gaussian
                            mean=weights_mean,
                            std=weights_std,
                            # if uniform
                            interval=weights_interval)
            )
            for sub in ['c', 'i', 'f', 'o']
        ]
        # all hidden-to-hidden weights
        U_c, U_i, U_f, U_o = [
            self.params.get(
                "U_%s" % sub,
                get_weights(weights_init=r_weights_init,
                            shape=(self.hidden_size, self.hidden_size),
                            name="U_%s" % sub,
                            # if gaussian
                            mean=r_weights_mean,
                            std=r_weights_std,
                            # if uniform
                            interval=r_weights_interval)
            )
            for sub in ['c', 'i', 'f', 'o']
        ]
        # if bidirectional, make hidden-to-hidden weights again to go the opposite direction
        U_c_b, U_i_b, U_f_b, U_o_b = None, None, None, None
        if bidirectional:
            U_c_b, U_i_b, U_f_b, U_o_b = [
                self.params.get(
                    "U_%s_b" % sub,
                    get_weights(weights_init=r_weights_init,
                                shape=(self.hidden_size, self.hidden_size),
                                name="U_%s_b" % sub,
                                # if gaussian
                                mean=r_weights_mean,
                                std=r_weights_std,
                                # if uniform
                                interval=r_weights_interval)
                )
                for sub in ['c', 'i', 'f', 'o']
            ]
        # biases
        b_c, b_i, b_f, b_o = [
            self.params.get(
                "b_%s" % sub,
                get_bias(shape=(self.hidden_size,),
                         name="b_%s" % sub,
                         init_values=r_bias_init)
            )
            for sub in ['c', 'i', 'f', 'o']
        ]
        # clip gradients if we are doing that
        recurrent_params = [U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b]
        if clip_recurrent_grads:
            clip = abs(clip_recurrent_grads)
            U_c, U_i, U_f, U_o, U_c_b, U_i_b, U_f_b, U_o_b = [
                grad_clip(param, -clip, clip) if param is not None
                else None
                for param in recurrent_params
            ]

        # put all the parameters into our dictionary
        self.params = {
            "W_c": W_c,
            "W_i": W_i,
            "W_f": W_f,
            "W_o": W_o,

            "U_c": U_c,
            "U_i": U_i,
            "U_f": U_f,
            "U_o": U_o,

            "b_c": b_c,
            "b_i": b_i,
            "b_f": b_f,
            "b_o": b_o,
        }
        if bidirectional:
            self.params.update(
                {
                    "U_c_b": U_c_b,
                    "U_i_b": U_i_b,
                    "U_f_b": U_f_b,
                    "U_o_b": U_o_b,
                }
            )

        # make h_init the right sized tensor
        if h_init is None:
            h_init = zeros_like(dot(self.input[0], W_c))

        c_init = zeros_like(dot(self.input[0], W_c))

        ###############
        # computation #
        ###############
        # move some computation outside of scan to speed it up!
        x_c = dot(self.input, W_c) + b_c
        x_i = dot(self.input, W_i) + b_i
        x_f = dot(self.input, W_f) + b_f
        x_o = dot(self.input, W_o) + b_o

        # now do the recurrent stuff
        (self.hiddens, _), self.updates = scan(
            fn=self.recurrent_step,
            sequences=[x_c, x_i, x_f, x_o],
            outputs_info=[h_init, c_init],
            non_sequences=[U_c, U_i, U_f, U_o],
            go_backwards=backward,
            name="lstm_scan",
            strict=True
        )

        # if bidirectional, do the same in reverse!
        if bidirectional:
            (hiddens_b, _), updates_b = scan(
                fn=self.recurrent_step,
                sequences=[x_c, x_i, x_f, x_o],
                outputs_info=[h_init, c_init],
                non_sequences=[U_c_b, U_i_b, U_f_b, U_o_b],
                go_backwards=not backward,
                name="lstm_scan_back",
                strict=True
            )
            # flip the hiddens to be the right direction
            hiddens_b = hiddens_b[::-1]
            # update stuff
            self.updates.update(updates_b)
            self.hiddens += hiddens_b

        log.info("Initialized an LSTM!")
Example #18
    def step(self,
             input,
             h_left,
             c_left,
             x_pos,
             h_buffer,
             c_buffer,
             width,
             mask=None,
             process_input=False):
        """
        One time step. This function can be used as LSTMCell by setting `process_input=True`.
        :param input:   (B, input_dim)
        :param h_left:  (B, hidden_dim)
        :param c_left:  (B, hidden_dim)
        :param x_pos:   int64 scalar, width dimension
        :param h_buffer: (W, B, hidden_dim)
        :param c_buffer: (W, B, hidden_dim)
        :param width:   width for x_pos rounding
        :param mask:    (B,)
        :param process_input: If possible, it is better to process the whole input sequence beforehand.
                              But sometimes this is not suitable, for example at prediction time.
        :return: h, c, both (B, hidden_dim)
        """
        if process_input:
            input = self._precompute_input(input)  # (B, 4*hidden_dim)

        h_up = h_buffer[x_pos, :, :]  # (B, hidden_dim)
        c_up = c_buffer[x_pos, :, :]  # (B, hidden_dim)

        gates = input + tensor.dot(h_left, self.W_h_left) + tensor.dot(
            h_up, self.W_h_up)  # (B, 4*hidden_dim)
        if self.grad_clipping > 0:
            gates = grad_clip(gates, -self.grad_clipping, self.grad_clipping)

        i_gate = gates[:, :self.hidden_dim]                           # input gate, (B, hidden_dim)
        f_gate = gates[:, self.hidden_dim:2 * self.hidden_dim]        # forget gate, (B, hidden_dim)
        c_input = gates[:, 2 * self.hidden_dim:3 * self.hidden_dim]   # cell input, (B, hidden_dim)
        o_gate = gates[:, 3 * self.hidden_dim:]                       # output gate, (B, hidden_dim)

        if self.peephole:
            i_gate += (c_left * self.w_cell_to_igate_left +
                       c_up * self.w_cell_to_igate_up)
            f_gate += (c_left * self.w_cell_to_fgate_left +
                       c_up * self.w_cell_to_fgate_up)

        i_gate = sigmoid(i_gate)
        f_gate = sigmoid(f_gate)
        c_input = tanh(c_input)
        # 0.5 coefficient added for numerical stability
        c = f_gate * (c_up + c_left) * 0.5 + i_gate * c_input

        if self.peephole:
            o_gate += c * self.w_cell_to_ogate
        o_gate = sigmoid(o_gate)
        h = o_gate * self.hidden_activation(c)

        if mask is not None:
            h = tensor.switch(mask[:, None], h, h_left)
            c = tensor.switch(mask[:, None], c, c_left)

        h_buffer = tensor.set_subtensor(h_buffer[x_pos, :, :], h)
        c_buffer = tensor.set_subtensor(c_buffer[x_pos, :, :], c)
        x_pos = x_pos + 1
        x_pos = tensor.mod(x_pos, width)
        return h, c, x_pos, h_buffer, c_buffer
    def __theano_build__(self):
        #Just making things more legible
        E, U, W, b, V, d = self.E, self.U, self.W, self.b, self.V, self.d

        x = T.ivector('x')  #Input sequence stored as theano variable x
        y = T.ivector('y')  #Target output value stored as theano variable y
        learnRate = T.scalar('learnRate')
        decayRate = T.scalar('decayRate')

        print("Loading forward_step")
        [out, s, C], updates = theano.scan(
            self.forward_step,
            sequences=x,
            truncate_gradient=4,
            outputs_info=[
                None,
                dict(initial=theano.shared(value=np.zeros(
                    self.hidden_dim).astype(theano.config.floatX))),
                dict(initial=theano.shared(value=np.ones(
                    self.hidden_dim).astype(theano.config.floatX)))
            ])

        pred = T.argmax(out, axis=1)

        # Compute the output error using categorical cross-entropy
        pred_error = T.sum(T.nnet.categorical_crossentropy(out, y))

        print("Loading f_pred")
        self.f_pred = theano.function([x], out)  # returns the class probabilities
        self.f_pred_class = theano.function([x], pred)  # returns the predicted class

        #Define function for calculating error
        print("Loading ce_error")
        self.ce_error = theano.function([x, y],
                                        pred_error,
                                        allow_input_downcast=True)

        print("Loading gradients")
        #Gradients
        dE = grad_clip(T.grad(pred_error, E))
        dW = grad_clip(T.grad(pred_error, W))
        dU = grad_clip(T.grad(pred_error, U))
        dV = grad_clip(T.grad(pred_error, V))
        db = grad_clip(T.grad(pred_error, b))
        dd = grad_clip(T.grad(pred_error, d))

        # Adam cache updates
        beta1 = .9
        beta2 = .999
        eps = 1e-8
        mE = grad_clip(beta1 * self.mE + (1 - beta1) * dE)
        mU = grad_clip(beta1 * self.mU + (1 - beta1) * dU)
        mW = grad_clip(beta1 * self.mW + (1 - beta1) * dW)
        mV = grad_clip(beta1 * self.mV + (1 - beta1) * dV)
        mb = grad_clip(beta1 * self.mb + (1 - beta1) * db)
        md = grad_clip(beta1 * self.md + (1 - beta1) * dd)
        vE = grad_clip(beta2 * self.vE + (1 - beta2) * (dE**2))
        vU = grad_clip(beta2 * self.vU + (1 - beta2) * (dU**2))
        vW = grad_clip(beta2 * self.vW + (1 - beta2) * (dW**2))
        vV = grad_clip(beta2 * self.vV + (1 - beta2) * (dV**2))
        vb = grad_clip(beta2 * self.vb + (1 - beta2) * (db**2))
        vd = grad_clip(beta2 * self.vd + (1 - beta2) * (dd**2))

        print("Loading adam_step")
        self.adam_step = theano.function(
            [x, y, learnRate], [],
            updates=[
                (E, E - learnRate * mE / (T.sqrt(vE) + eps)),
                (U, U - learnRate * mU / (T.sqrt(vU) + eps)),
                (W, W - learnRate * mW / (T.sqrt(vW) + eps)),
                (V, V - learnRate * mV / (T.sqrt(vV) + eps)),
                (b, b - learnRate * mb / (T.sqrt(vb) + eps)),
                (d, d - learnRate * md / (T.sqrt(vd) + eps)), (self.mE, mE),
                (self.mU, mU), (self.mW, mW), (self.mV, mV), (self.mb, mb),
                (self.md, md), (self.vE, vE), (self.vU, vU), (self.vW, vW),
                (self.vV, vV), (self.vb, vb), (self.vd, vd)
            ],
            allow_input_downcast=True)
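
The gradient and Adam expressions above pass a single argument to grad_clip, whereas theano.gradient.grad_clip takes explicit lower and upper bounds, so this project presumably defines its own wrapper. A hypothetical wrapper consistent with that call pattern (the name, default bound and delegation are assumptions, not taken from the source):

from theano import gradient

def grad_clip(x, bound=1.0):
    # hypothetical wrapper: delegate to Theano's grad_clip with symmetric bounds
    return gradient.grad_clip(x, -bound, bound)
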