Example 1
    def recurrence(self, state_below, mask=None):
        # state_below: (n_steps, n_samples, input_dim) for a batch, or
        # (n_steps, input_dim) for a single sequence.
        nsteps = state_below.shape[0]
        if state_below.ndim == 3:
            n_samples = state_below.shape[1]
        else:
            n_samples = 1

        assert mask is not None

        def _slice(_x, n, dim):
            # Pull gate n's block out of the concatenated pre-activations.
            if _x.ndim == 3:
                return _x[:, :, n * dim:(n + 1) * dim]
            return _x[:, n * dim:(n + 1) * dim]

        def _step(m_, x_, h_, c_):
            # Pre-activations of all four gates at once: the projected input
            # x_ plus the recurrent contribution of the previous hidden state.
            preact = tensor.dot(h_, self.U)
            preact += x_

            dim = self.option[Option.OUTPUT_DIM]
            i = tensor.nnet.sigmoid(_slice(preact, 0, dim))  # input gate
            f = tensor.nnet.sigmoid(_slice(preact, 1, dim))  # forget gate
            o = tensor.nnet.sigmoid(_slice(preact, 2, dim))  # output gate
            c = tensor.tanh(_slice(preact, 3, dim))          # candidate cell

            c = f * c_ + i * c
            # On padded steps (mask == 0) carry the previous cell state
            # forward unchanged; mask is asserted non-None above, so the
            # original `if m_ is not None` guards were always true.
            c = m_[:, None] * c + (1. - m_)[:, None] * c_

            h = o * tensor.tanh(c)
            h = m_[:, None] * h + (1. - m_)[:, None] * h_

            return tensor.cast(h, th.config.floatX), tensor.cast(
                c, th.config.floatX)

        # Project the whole input sequence with one batched dot product;
        # scan then consumes it one time step at a time.
        state_below = tensor.dot(state_below, self.W) + self.b

        dim_proj = self.option[Option.OUTPUT_DIM]
        rval, updates = th.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 dim_proj),
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 dim_proj)
                                ],
                                name=_p(self.id, '_layers'),
                                n_steps=nsteps)
        return rval[0]  # return hidden values
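
For reference, here is a minimal, self-contained sketch of the same masked-LSTM scan pattern, with hypothetical sizes and standalone W, U, b in place of the layer's parameters; it illustrates the technique above, not the original class:

import numpy as np
import theano as th
import theano.tensor as tensor

def numpy_floatX(data):
    return np.asarray(data, dtype=th.config.floatX)

input_dim, dim = 4, 8  # hypothetical sizes
rng = np.random.RandomState(0)
W = th.shared(numpy_floatX(0.01 * rng.standard_normal((input_dim, 4 * dim))), name='W')
U = th.shared(numpy_floatX(0.01 * rng.standard_normal((dim, 4 * dim))), name='U')
b = th.shared(numpy_floatX(np.zeros(4 * dim)), name='b')

x = tensor.tensor3('x')       # (n_steps, n_samples, input_dim)
mask = tensor.matrix('mask')  # (n_steps, n_samples): 1 = real step, 0 = padding

def _slice(_x, n):
    return _x[:, n * dim:(n + 1) * dim]

def _step(m_, x_, h_, c_):
    preact = tensor.dot(h_, U) + x_
    i = tensor.nnet.sigmoid(_slice(preact, 0))   # input gate
    f = tensor.nnet.sigmoid(_slice(preact, 1))   # forget gate
    o = tensor.nnet.sigmoid(_slice(preact, 2))   # output gate
    c = f * c_ + i * tensor.tanh(_slice(preact, 3))
    c = m_[:, None] * c + (1. - m_)[:, None] * c_  # hold state on padded steps
    h = o * tensor.tanh(c)
    h = m_[:, None] * h + (1. - m_)[:, None] * h_
    return h, c

n_samples = x.shape[1]
state_below = tensor.dot(x, W) + b  # project the whole sequence at once
(h_seq, c_seq), _ = th.scan(_step,
                            sequences=[mask, state_below],
                            outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, dim),
                                          tensor.alloc(numpy_floatX(0.), n_samples, dim)])

f = th.function([x, mask], h_seq)
print(f(numpy_floatX(rng.standard_normal((5, 3, input_dim))),
        numpy_floatX(np.ones((5, 3)))).shape)  # (5, 3, 8)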
Example 2
    def compile(self):
        #####
        if self.mask is not None:
            self.core_layer.mask = self.mask
        self.core_layer.id = self.id
        self.core_layer.init_params()
        #####
        if isinstance(self.input, list):
            time_steps = len(self.input)
        else:
            time_steps = self.input.shape[0]

        def _step(_m, prev):
            # Re-wire the core layer to the current time step and rebuild its
            # graph; prev is required by outputs_info but is never fed back.
            self.core_layer.input = _m
            self.core_layer.compile(init_params=False)
            return self.core_layer.output

        result, updates = th.scan(
            fn=_step,
            outputs_info=tensor.alloc(
                numpy_floatX(0.), self.input.shape[1],
                self.core_layer.option[Option.OUTPUT_DIM]),
            sequences=[self.input],
            n_steps=time_steps)

        # self.output = result.swapaxes(0, 1)  # uncomment for batch-major output
        self.output = result
        self.params = self.core_layer.params
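
The wrapper above rebuilds the core layer's graph inside scan so it is applied at every time step. Below is a minimal, self-contained sketch of the same per-timestep pattern, using a hypothetical affine-plus-tanh stand-in for core_layer; since the step output is never fed back, outputs_info can simply be omitted:

import numpy as np
import theano as th
import theano.tensor as tensor

in_dim, out_dim = 4, 6  # hypothetical sizes
rng = np.random.RandomState(0)
W = th.shared(np.asarray(0.01 * rng.standard_normal((in_dim, out_dim)),
                         dtype=th.config.floatX), name='W')
b = th.shared(np.zeros(out_dim, dtype=th.config.floatX), name='b')

x = tensor.tensor3('x')  # (n_steps, n_samples, in_dim)

def _step(x_t):
    # Apply the same core transformation independently at each time step.
    return tensor.tanh(tensor.dot(x_t, W) + b)

result, updates = th.scan(fn=_step, sequences=[x])

f = th.function([x], result)
print(f(np.zeros((5, 3, in_dim), dtype=th.config.floatX)).shape)  # (5, 3, 6)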
Example 3
    def add_layer(self, l):
        if isinstance(l, list):
            for ll in l:
                self.add_layer(ll)
        else:
            self.layers.append(l)
            if isinstance(l, Dropout):
                self.use_noise = th.shared(numpy_floatX(0.))
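
A tiny, self-contained illustration of the recursive flattening (hypothetical stand-in class, with the Dropout branch dropped):

class Net:  # hypothetical minimal container
    def __init__(self):
        self.layers = []

    def add_layer(self, l):
        if isinstance(l, list):
            for ll in l:
                self.add_layer(ll)
        else:
            self.layers.append(l)

net = Net()
net.add_layer(['embed', ['lstm', 'dense']])
print(net.layers)  # ['embed', 'lstm', 'dense'] -- nesting is flattened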
Example 4
    def __init__(self,
                 idx="0",
                 input_value_type=config.floatX,
                 prediction_type='vector',
                 prediction_value_type=config.floatX,
                 use_mask=False,
                 use_noise=False):
        '''
        :param idx: string id of the model
        :param input_value_type: dtype of the symbolic input matrix
        :param prediction_type: 'vector' for integer targets (one class id
            per sample), 'matrix' for dense targets
        :param prediction_value_type: dtype of the prediction/gold matrices
            (used when prediction_type is 'matrix')
        :param use_mask: can be set to True when working with sequences of
            different sizes
        :param use_noise: if True, create the shared on/off switch used by
            Dropout layers
        '''
        self.input = tensor.matrix('x', dtype=input_value_type)
        self.output = None
        self.prediction_type = prediction_type
        if prediction_type == 'vector':
            self.prediction = tensor.ivector('y')
            self.gold = tensor.ivector('g')
        if prediction_type == 'matrix':
            self.prediction = tensor.matrix('y', dtype=prediction_value_type)
            self.gold = tensor.matrix('g', dtype=prediction_value_type)
        self.params = OrderedDict()
        self.layers = []
        self.optimizer = None
        self.cost = None
        self.id = idx
        self.option = Option()  # options for the model
        if use_noise:
            self.use_noise = th.shared(numpy_floatX(0.))
        else:
            self.use_noise = None
        self.use_mask = use_mask  # can be set to True when working with sequences of different sizes

        self.input_mask = None
        self.output_mask = None

        self.f_cost = None
        self.f_grad = None
        self.lr = tensor.scalar(name='lr')
        self.f_grad_shared = None
        self.f_update = None
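
Since use_mask is the least obvious flag: below is a hypothetical helper sketching how a padded batch and its 0/1 mask are typically built for sequences of different lengths, matching the shapes the recurrence in Example 1 expects:

import numpy as np
import theano as th

def pad_batch(seqs, dim):
    """Pack variable-length sequences (a list of (len_i, dim) arrays) into a
    (max_len, n_samples, dim) batch plus its (max_len, n_samples) 0/1 mask."""
    n_steps = max(len(s) for s in seqs)
    x = np.zeros((n_steps, len(seqs), dim), dtype=th.config.floatX)
    mask = np.zeros((n_steps, len(seqs)), dtype=th.config.floatX)
    for j, s in enumerate(seqs):
        x[:len(s), j] = s       # real steps keep their values
        mask[:len(s), j] = 1.   # and are marked as valid
    return x, mask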
Example 5
def adadelta(lr,
             tparams,
             grads,
             x,
             y,
             cost,
             input_mask=None,
             output_mask=None):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tpramas: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t to parameres
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective fucntion to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    zipped_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
        for k, p in tparams.items()
    ]
    running_up2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rup2' % k)
        for k, p in tparams.items()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
        for k, p in tparams.items()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]
    # Any supplied masks (variable-length sequences) become extra inputs of
    # the compiled function.
    if input_mask is not None and output_mask is not None:
        f_grad_shared = theano.function([x, y, input_mask, output_mask],
                                        cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')
    else:
        f_grad_shared = theano.function([x, y],
                                        cost,
                                        updates=zgup + rg2up,
                                        name='adadelta_f_grad_shared')

    updir = [
        -tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
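
A toy, self-contained usage sketch on a hypothetical linear-regression problem (numpy_floatX as defined in this codebase): f_grad_shared computes the cost and accumulates the gradient statistics, f_update applies the parameter step; the lr argument is ignored by ADADELTA's update rule, hence on_unused_input='ignore':

import numpy as np
import theano
import theano.tensor as tensor
from collections import OrderedDict

def numpy_floatX(data):
    return np.asarray(data, dtype=theano.config.floatX)

rng = np.random.RandomState(0)
x = tensor.matrix('x')
y = tensor.vector('y')
w = theano.shared(numpy_floatX(np.zeros(2)), name='w')
tparams = OrderedDict([('w', w)])

cost = tensor.mean((tensor.dot(x, w) - y) ** 2)
grads = tensor.grad(cost, wrt=list(tparams.values()))
lr = tensor.scalar(name='lr')

f_grad_shared, f_update = adadelta(lr, tparams, grads, x, y, cost)

x_val = numpy_floatX(rng.standard_normal((256, 2)))
y_val = x_val.dot(numpy_floatX([2., -3.]))  # targets from a known weight vector
for _ in range(500):
    c = f_grad_shared(x_val, y_val)  # forward pass + statistics updates
    f_update(1.0)                    # lr value is ignored by ADADELTA
print(w.get_value())                 # approaches [2., -3.]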
Example 6
def rmsprop(lr, tparams, grads, x, y, cost, input_mask=None, output_mask=None):
    """
    A variant of  SGD that scales the step size by running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tpramas: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t to parameres
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective fucntion to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_grad' % k)
        for k, p in tparams.items()
    ]
    running_grads = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad' % k)
        for k, p in tparams.items()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_rgrad2' % k)
        for k, p in tparams.items()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]
    # As in adadelta above, any supplied masks become extra function inputs.
    if input_mask is not None and output_mask is not None:
        f_grad_shared = theano.function([x, y, input_mask, output_mask],
                                        cost,
                                        updates=zgup + rgup + rg2up,
                                        name='rmsprop_f_grad_shared')
    else:
        f_grad_shared = theano.function([x, y],
                                        cost,
                                        updates=zgup + rgup + rg2up,
                                        name='rmsprop_f_grad_shared')

    updir = [
        theano.shared(p.get_value() * numpy_floatX(0.), name='%s_updir' % k)
        for k, p in tparams.items()
    ]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg**2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
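
rmsprop is call-compatible with adadelta, so the toy fit sketched after Example 5 works unchanged; lr is likewise unused here, since the step size 1e-4 is hard-coded in updir_new:

# Drop-in swap in the adadelta sketch above (same hypothetical toy problem):
f_grad_shared, f_update = rmsprop(lr, tparams, grads, x, y, cost)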