Example #1
def rmsprop(lr, tparams, grads, x, mask, y, cost):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_grad" % k) for k, p in tparams.iteritems()
    ]
    running_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_rgrad" % k) for k, p in tparams.iteritems()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_rgrad2" % k) for k, p in tparams.iteritems()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2)) for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rgup + rg2up, name="rmsprop_f_grad_shared")

    updir = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.0), name="%s_updir" % k) for k, p in tparams.iteritems()
    ]
    updir_new = [
        (ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
        for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads, running_grads2)
    ]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function(
        [lr], [], updates=updir_new + param_up, on_unused_input="ignore", name="rmsprop_f_update"
    )

    return f_grad_shared, f_update
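A minimal usage sketch (not part of the example above; the minibatch source, epoch count, and learning-rate value are assumptions): the first compiled function accumulates the running averages and returns the cost for a numeric minibatch, the second applies the parameter update.

# Assumes the symbolic graph (x, mask, y, cost), tparams and grads already exist.
f_grad_shared, f_update = rmsprop(lr, tparams, grads, x, mask, y, cost)
for epoch in range(max_epochs):                  # max_epochs: assumed setting
    for x_np, mask_np, y_np in minibatches:      # assumed iterable of numpy arrays
        batch_cost = f_grad_shared(x_np, mask_np, y_np)
        f_update(0.01)  # the lr input is accepted but unused here: the 1e-4 step is hard-coded in updir_new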
Example #2
def adadelta(lr, tparams, grads, x, mask, y, cost):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams: Theano SharedVariable
        Model parameters
    grads: Theano variable
        Gradients of cost w.r.t. parameters
    x: Theano variable
        Model inputs
    mask: Theano variable
        Sequence mask
    y: Theano variable
        Targets
    cost: Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    zipped_grads = [theano.shared(p.get_value() * utils.numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * utils.numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * utils.numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x, mask, y], cost, updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared')

    updir = [-T.sqrt(ru2 + 1e-6) / T.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
Example #3
def encoder(tparams,
            state_below,
            mask,
            seq_output=False,
            prefix='lstm_encoder'):
    """ state_below: size of  n_steps * n_samples * n_x
    """

    n_steps = state_below.shape[0]
    n_samples = state_below.shape[1]

    n_h = tparams[_p(prefix, 'U')].shape[0]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
                    tparams[_p(prefix, 'b')]

    def _step(m_, x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    seqs = [mask, state_below_]

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 n_h),
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 n_h)
                                ],
                                non_sequences=[tparams[_p(prefix, 'U')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps,
                                strict=True)

    h_rval = rval[0]
    if seq_output:
        return h_rval
    else:
        # size of n_samples * n_h
        return h_rval[-1]
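For reference, a minimal sketch of the shared parameters this encoder expects. The shapes follow from the four-gate slicing in _step (each gate is n_h wide); it is assumed that _p(prefix, name) joins its arguments with an underscore, and the initializer is illustrative, not the original one.

import numpy as np
import theano

def init_lstm_encoder_params(n_x, n_h, prefix='lstm_encoder'):
    # W maps inputs to the four stacked gates, U maps the previous hidden
    # state, b is the stacked gate bias; all sizes follow the 4 * n_h slicing.
    floatX = theano.config.floatX
    return {
        prefix + '_W': theano.shared(0.01 * np.random.randn(n_x, 4 * n_h).astype(floatX)),
        prefix + '_U': theano.shared(0.01 * np.random.randn(n_h, 4 * n_h).astype(floatX)),
        prefix + '_b': theano.shared(np.zeros(4 * n_h, dtype=floatX)),
    }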
Example #4
def encoder(tparams, state_below, mask, seq_output=False, prefix='lstm_encoder'):
    
    """ state_below: size of  n_steps * n_samples * n_x
    """

    n_steps = state_below.shape[0]
    n_samples = state_below.shape[1]

    n_h = tparams[_p(prefix,'U')].shape[0]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
                    tparams[_p(prefix, 'b')]

    def _step(m_, x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))
        
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    seqs = [mask, state_below_]

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                    n_samples,n_h),
                                              tensor.alloc(numpy_floatX(0.),
                                                    n_samples,n_h)],
                                non_sequences = [tparams[_p(prefix, 'U')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps,
                                strict=True)
    
    h_rval = rval[0] 
    if seq_output:
        return h_rval
    else:
        # size of n_samples * n_h
        return h_rval[-1]  
Example #5
def decoder_layer(tparams, state_below, prefix='decoder_lstm'):
    """ state_below: size of n_steps * n_samples * n_x 
    """

    nsteps = state_below.shape[0]
    n_h = tparams[_p(prefix, 'U')].shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))

        c = f * c_ + i * c

        h = o * tensor.tanh(c)

        return h, c

    state_below_ = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    seqs = [state_below_]

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 n_h),
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 n_h)
                                ],
                                non_sequences=[tparams[_p(prefix, 'U')]],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps,
                                strict=True)

    h_rval = rval[0]

    return h_rval
Example #6
def rmsprop(lr, tparams, grads, iin, out, updates):
    """
    A variant of SGD that scales the step size by a running average of the
    recent step norms.

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """

    zipped_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_grad' % k) for k, p in tparams.items()
    ]
    running_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_rgrad' % k) for k, p in tparams.items()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_rgrad2' % k) for k, p in tparams.items()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(iin,
                                    out,
                                    updates=zgup + rgup + rg2up + updates,
                                    name='rmsprop_f_grad_shared')

    updir = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_updir' % k) for k, p in tparams.items()
    ]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg**2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads, running_grads,
                                            running_grads2)]
    param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
    f_update = theano.function([lr], [],
                               updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
Example #7
    def perform(self, x):
        x1, x2 = x[0], x[1]
        nsteps = x1.shape[0]
        n_samples = x1.shape[1]
        # if x1.ndim == 3:
        #     n_samples = x1.shape[1]
        # else:
        #     n_samples = 1
        #
        def _slice(x_t, idx, ndim):
            if x_t.ndim == 3:
                return x_t[:, :, idx * ndim: (idx + 1) * ndim]
            return x_t[:, idx * ndim:(idx + 1) * ndim]

        def _step(x_t, h_tm1, c_tm1, W, U, b):
            # z = sigmoid( W * x(t) + U * h(t-1) + b)
            # zi =  W * x(t) + U * h(t-1) + b
            zi = T.dot(x_t, W) + T.dot(h_tm1, U) + b
            # zi = T.dot(h_tm1, self.Uh)
            # zi += x_t
            # W = [Wi, Wf, Wo, Wc], U = [Ui, Uf, Uo, Uc],  b = [bi, bf, bo, bc]
            i = T.nnet.sigmoid(_slice(zi, 0, self.n_output))
            f = T.nnet.sigmoid(_slice(zi, 1, self.n_output))
            o = T.nnet.sigmoid(_slice(zi, 2, self.n_output))
            c = T.tanh(_slice(zi, 3, self.n_output))

            c = f * c_tm1 + i * c

            h = o * T.tanh(c)
            # output at each time
            # s = softmax(w * h_t + b)
            return h, c

        # h0 and c0 are initialized to zeros
        h0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output)
        c0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output)
        h0 = theano.tensor.unbroadcast(h0, 1)
        c0 = theano.tensor.unbroadcast(c0, 1)
        [h, c], _ = theano.scan(_step, sequences=[x1],
                                outputs_info=[h0, c0],
                                non_sequences=[self.Wh, self.Uh, self.bh],
                                name='blstm_layers', n_steps=nsteps)

        [h_reverse, c_reverse], _ = theano.scan(_step, sequences=[x2],
                                                outputs_info=[h0, c0],
                                                non_sequences=[self.Wh_reverse, self.Uh_reverse, self.bh_reverse],
                                                name='blstm_layers_reverse', n_steps=nsteps,
                                                go_backwards=True)
        self.input = x
        self.output = [h, h_reverse]
Example #8
def adadelta(lr, tparams, grads, iin, out, updates):
    """
    An adaptive learning rate optimizer

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """

    zipped_grads = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_grad' % k) for k, p in tparams.items()
    ]
    running_up2 = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_rup2' % k) for k, p in tparams.items()
    ]
    running_grads2 = [
        theano.shared(p.get_value() * utils.numpy_floatX(0.),
                      name='%s_rgrad2' % k) for k, p in tparams.items()
    ]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g**2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function(iin,
                                    out,
                                    updates=zgup + rg2up + updates,
                                    name='adadelta_f_grad_shared')

    updir = [
        -tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
        for zg, ru2, rg2 in zip(zipped_grads, running_up2, running_grads2)
    ]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud**2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [],
                               updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
Example #9
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 dim_proj),
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 dim_proj)
                                ],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)
    # outputs_info include h_ and c_
    # return only hidden states, so return rval[0]
    return rval[0]
Example #10
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8):
    """ default: lr=0.0002 """

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g ** 2) for g in grads]))
    if tensor.ge(norm, 5):
        grads = [g * 5 / norm for g in grads]

    gshared = [theano.shared(p.get_value() * 0.0, name="%s_grad" % k) for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.0))
    i_t = i + 1.0
    fix1 = 1.0 - b1 ** (i_t)
    fix2 = 1.0 - b2 ** (i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.0)
        v = theano.shared(p.get_value() * 0.0)
        m_t = (b1 * g) + ((1.0 - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1.0 - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
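Note that b1 and b2 here weight the incoming gradient, so the effective exponential decay rates on m and v are 1 - b1 = 0.9 and 1 - b2 = 0.999. A plain-NumPy restatement of one per-parameter step, mirroring the symbolic updates above (illustrative only):

import numpy as np

def adam_step_numpy(p, g, m, v, t, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    # One step of the update above for a single parameter array.
    t = t + 1.0
    lr_t = lr * np.sqrt(1.0 - b2 ** t) / (1.0 - b1 ** t)
    m = b1 * g + (1.0 - b1) * m
    v = b2 * g ** 2 + (1.0 - b2) * v
    p = p - lr_t * m / (np.sqrt(v) + e)
    return p, m, v, t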
Example #11
def build_model(tparams, options):

    trng = RandomStreams(options['SEED'])

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # size of n_samples * n_z
    z = tensor.matrix('z', dtype=config.floatX)
    # size of n_samples * n_y
    y = tensor.matrix('y', dtype=config.floatX)

    z = dropout(z, trng, use_noise)

    h = tensor.tanh(tensor.dot(z, tparams['Wy1']) + tparams['by1'])
    h = dropout(h, trng, use_noise)

    # size of n_samples * n_y
    pred = tensor.nnet.sigmoid(tensor.dot(h, tparams['Wy2']) + tparams['by2'])

    f_pred = theano.function([z], pred, name='f_pred')

    cost = (-y * tensor.log(pred + 1e-6) -
            (1. - y) * tensor.log(1. - pred + 1e-6)).sum() / z.shape[0]

    return use_noise, z, y, cost, f_pred
Example #12
def build_model(tparams, options):
    
    trng = RandomStreams(options['SEED'])
    
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentences, size of n_steps * n_samples
    x = tensor.matrix('x', dtype='int64')
    # the corresponding masks padding zeros
    mask = tensor.matrix('mask', dtype=config.floatX)
    # size of n_samples * n_z
    z = tensor.matrix('z', dtype=config.floatX)
    y = tensor.matrix('y', dtype=config.floatX)
    z = dropout(z, trng, use_noise)
    y = dropout(y, trng, use_noise)

    n_steps = x.shape[0] # the sentence length in this mini-batch
    n_samples = x.shape[1] # the number of sentences in this mini-batch
    
    n_x = tparams['Wemb'].shape[1] # the dimension of the word embedding
    
    # size of n_steps,n_samples,n_x
    emb = tparams['Wemb'][x.flatten()].reshape([n_steps,n_samples,n_x])
    emb = dropout(emb, trng, use_noise)
    
    # 1 * n_samples * n_x
    z0 = tensor.dot(z, tparams['C0']).dimshuffle('x', 0, 1)
    # n_steps * n_samples * n_x
    emb_input = tensor.concatenate((z0,emb[:n_steps-1]))
    # n_steps * n_samples
    mask0 = mask[0].dimshuffle('x', 0)
    mask_input = tensor.concatenate((mask0,mask[:n_steps-1]))

    # decoding the sentence vector z back into the original sentence
    h_decoder = encoder_layer(tparams, emb_input, mask_input,y, seq_output=True)
    h_decoder = dropout(h_decoder, trng, use_noise)
                                         
    shape = h_decoder.shape
    h_decoder = h_decoder.reshape((shape[0]*shape[1], shape[2]))
    
    Vhid = tensor.dot(tparams['Vhid'],tparams['Wemb'].T)
    pred_x = tensor.dot(h_decoder, Vhid) + tparams['bhid']
    pred = tensor.nnet.softmax(pred_x)
    
    x_vec = x.reshape((shape[0]*shape[1],))
    
    index = tensor.arange(shape[0]*shape[1])
    
    pred_word = pred[index, x_vec]
    mask_word = mask.reshape((shape[0]*shape[1],))
    
    index_list = theano.tensor.eq(mask_word, 1.).nonzero()[0]
    
    pred_word = pred_word[index_list]
    
    # the cross-entropy loss                 
    cost = -tensor.log(pred_word + 1e-6).sum() / n_samples  
    
    return use_noise, x, mask, y, z, cost
Example #13
def decoder_layer(tparams, state_below, prefix='decoder_vanilla'):
    """ state_below: size of n_steps * n_samples * n_x 
    """

    n_steps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    n_h = tparams[_p(prefix, 'U')].shape[0]

    def _step_slice(x_, h_, U):
        preact = tensor.dot(h_, U)
        preact += x_
        h = tensor.tanh(preact)

        return h

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
                    tparams[_p(prefix, 'b')]

    rval, updates = theano.scan(
        _step_slice,
        sequences=[state_below_],
        outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, n_h)],
        non_sequences=[tparams[_p(prefix, 'U')]],
        name=_p(prefix, '_layers'),
        n_steps=n_steps)

    return rval
Example #14
    def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
        clip = params["grad_clip"]
        decay_rate = tensor.constant(params["decay_rate"], dtype=theano.config.floatX)
        smooth_eps = tensor.constant(params["smooth_eps"], dtype=theano.config.floatX)
        zipped_grads = [theano.shared(np.zeros_like(p.get_value()), name="%s_grad" % k) for k, p in tparams.iteritems()]
        running_grads2 = [
            theano.shared(np.zeros_like(p.get_value()), name="%s_rgrad2" % k) for k, p in tparams.iteritems()
        ]
        zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
        if clip > 0.0:
            rg2up = [
                (
                    rg2,
                    tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (tensor.clip(g, -clip, clip) ** 2), 0.0, np.inf),
                )
                for rg2, g in zip(running_grads2, grads)
            ]
        else:
            rg2up = [
                (rg2, tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g ** 2), 0.0, np.inf))
                for rg2, g in zip(running_grads2, grads)
            ]

        f_grad_shared = theano.function(inp_list, cost, updates=zgup + rg2up, name="rmsprop_f_grad_shared")

        updir = [theano.shared(p.get_value() * numpy_floatX(0.0), name="%s_updir" % k) for k, p in tparams.iteritems()]
        updir_new = [
            (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps)) for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)
        ]
        param_up = [(p, p + udn[1]) for p, udn in zip(tparams.values(), updir_new)]
        f_update = theano.function(
            [lr], [], updates=updir_new + param_up, on_unused_input="ignore", name="rmsprop_f_update"
        )

        return f_grad_shared, f_update, zipped_grads, running_grads2, updir
Example #15
def pred_error(f_pred, prepare_data, data, iterator, fname='', verbose=False):
    """
    Just compute the error
    f_pred: Theano fct computing the prediction
    prepare_data: usual prepare_data for that dataset.
    """
    valid_err = 0
    if verbose:
        f = open('../data/trec/TREC_10.label')
        lines = f.readlines()
        f.close()
        f_out = open(fname + 'trec_out_dscnn.txt', 'w')
        cat_ind = ['ABBR', 'ENTY', 'DESC', 'HUM', 'LOC', 'NUM']
        cnt = 0
    for b, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index])
        preds = f_pred(x, mask)
        targets = numpy.array(data[1])[valid_index]

        if verbose:
            for i in range(len(preds)):
                p = preds[i]
                if p != targets[i]:
                    f_out.write('*')
                f_out.write(cat_ind[p] + ' ')
                f_out.write(lines[cnt])
                cnt += 1

        valid_err += (preds == targets).sum()
    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])

    return valid_err * 100
Example #16
def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    assert mask is not None

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n+1) * dim]
        return _x[:, n * dim:(n+1) * dim]

    def _step(m_, x_, h_, c_):
        preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
        f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
        o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
        c = tensor.tanh(_slice(preact, 3, options['dim_proj']))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')]) +
                   tparams[_p(prefix, 'b')])

    dim_proj = options['dim_proj']
    rval, updates = theano.scan(_step,
                                sequences=[mask, state_below],
                                outputs_info=[tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj),
                                              tensor.alloc(numpy_floatX(0.),
                                                           n_samples,
                                                           dim_proj)],
                                name=_p(prefix, '_layers'),
                                n_steps=nsteps)

    return rval[0][-1]
Example #17
def decoder_layer(tparams, state_below, z, mask, prefix='decoder_lstm'):
    """ state_below: size of n_steps * n_samples * n_x 
        z: size of n_samples * n_z
    """

    nsteps = state_below.shape[0]
    n_h = tparams[_p(prefix, 'U')].shape[0]

    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    state_belowx0 = tensor.dot(z, tparams[_p(prefix, 'C0')]) + \
            tparams[_p(prefix, 'b0')]
    h0 = tensor.tanh(state_belowx0)

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    state_below_ = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]

    # tensor.dot(z, tparams[_p(prefix, 'C')])

    def _step(m_, x_, h_, c_, U):
        preact = tensor.dot(h_, U)
        preact += x_

        i = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        f = tensor.nnet.sigmoid(_slice(preact, 1, n_h))
        o = tensor.nnet.sigmoid(_slice(preact, 2, n_h))
        c = tensor.tanh(_slice(preact, 3, n_h))

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c

    seqs = [mask[:nsteps - 1], state_below_[:nsteps - 1]]

    rval, updates = theano.scan(
        _step,
        sequences=seqs,
        outputs_info=[h0, tensor.alloc(numpy_floatX(0.), n_samples, n_h)],
        non_sequences=[tparams[_p(prefix, 'U')]],
        name=_p(prefix, '_layers'),
        n_steps=nsteps - 1,
        strict=True)

    h0x = h0.dimshuffle('x', 0, 1)
    h_rval = rval[0]

    return tensor.concatenate((h0x, h_rval))
Example #18
def lstm_layer(tparams, x, mask, prefix):

    n_steps = x.shape[0]
    n_samples = x.shape[1]

    n_h = tparams[_p(prefix, 'U_i')].shape[0]

    x_i = tensor.dot(x, tparams[_p(prefix, 'W_i')]) + tparams[_p(
        prefix, 'b_i')]
    x_f = tensor.dot(x, tparams[_p(prefix, 'W_f')]) + tparams[_p(
        prefix, 'b_f')]
    x_o = tensor.dot(x, tparams[_p(prefix, 'W_o')]) + tparams[_p(
        prefix, 'b_o')]
    x_c = tensor.dot(x, tparams[_p(prefix, 'W_c')]) + tparams[_p(
        prefix, 'b_c')]

    def _step(m_, xt_i, xt_f, xt_o, xt_c, h_, c_, U_i, U_f, U_o, U_c):
        i = tensor.nnet.sigmoid(tensor.dot(h_, U_i) + xt_i)
        f = tensor.nnet.sigmoid(tensor.dot(h_, U_f) + xt_f)
        o = tensor.nnet.sigmoid(tensor.dot(h_, U_o) + xt_o)
        c = tensor.tanh(tensor.dot(h_, U_c) + xt_c)
        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_
        h = o * tensor.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_
        return h, c

    seqs = [mask, x_i, x_f, x_o, x_c]
    non_seqs = [
        tparams[_p(prefix, 'U_i')], tparams[_p(prefix, 'U_f')],
        tparams[_p(prefix, 'U_o')], tparams[_p(prefix, 'U_c')]
    ]
    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info=[
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 n_h),
                                    tensor.alloc(numpy_floatX(0.), n_samples,
                                                 n_h)
                                ],
                                non_sequences=non_seqs,
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps,
                                strict=True)
    # hseq, cseq
    return rval
Example #19
    def perform(self, x):
        nsteps = x.shape[0]
        # if x.ndim == 3:
        #     n_samples = x.shape[1]
        # else:
        #     n_samples = 1
        #
        n_samples = x.shape[1]

        def _slice(x_t, idx, ndim):
            if x_t.ndim == 3:
                return x_t[:, :, idx * ndim: (idx + 1) * ndim]
            return x_t[:, idx * ndim:(idx + 1) * ndim]

        def _step(x_t, h_tm1, c_tm1):
            # z = sigmoid( W * x(t) + U * h(t-1) + b)
            # zi =  W * x(t) + U * h(t-1) + b
            zi = T.dot(x_t, self.Wh) + T.dot(h_tm1, self.Uh) + self.bh
            # zi = T.dot(h_tm1, self.Uh)
            # zi += x_t
            # W = [Wi, Wf, Wo, Wc], U = [Ui, Uf, Uo, Uc],  b = [bi, bf, bo, bc]
            i = T.nnet.sigmoid(_slice(zi, 0, self.n_output))
            f = T.nnet.sigmoid(_slice(zi, 1, self.n_output))
            o = T.nnet.sigmoid(_slice(zi, 2, self.n_output))
            c = T.tanh(_slice(zi, 3, self.n_output))

            c = f * c_tm1 + i * c

            h = o * T.tanh(c)
            # output at each time
            # s = softmax(w * h_t + b)
            return [h, c]

        # h0 and c0 are initialized to zeros
        h0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output)
        c0 = T.alloc(numpy_floatX(0.), n_samples, self.n_output)
        h0 = theano.tensor.unbroadcast(h0, 1)
        c0 = theano.tensor.unbroadcast(c0, 1)
        [h, c], _ = theano.scan(fn=_step, sequences=x,
                                outputs_info=[h0, c0],
                                n_steps=nsteps)
        self.input = x
        self.output = h
Example #20
def Santa(tparams, cost, inps, lr, eidx, nframes, max_epoch, rho=0.95, anne_rate=0.5, e=1e-8, clip_norm=5):
    """ The implementation of Santa algorithm.
        tparams: theano shared variables, params that we need to optimize
        cost: cost function, the cross-entropy loss in our case
        inps: input theano variables
        lr: learning rate, in our case, we choose it to be 1.*1e-3, or 2.*1e-4
        eidx: the current epoch we are running, used to decide when to change
            from exploration to refinement
        nframes: how many time-steps we have in the training dataset.
        max_epoch: the maximum number of epochs we run
        rho, anne_rate, e, clip_norm: hyper-parameters we used in all the algorithms.
    """
    
    trng = RandomStreams(123)
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g*clip_norm/norm for g in grads]
    
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    
    updates = []
    
    i = theano.shared(numpy_floatX(0.))    
    i_t = i + 1.

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        alpha = theano.shared(np.ones(p.get_value().shape)*.5)
        
        alpha_t = alpha + (m**2 - lr/(i_t ** anne_rate)) * tensor.lt(eidx, 0.15*max_epoch) 
        v_t = rho * v + (1.-rho) * (g ** 2) 
        pcder = tensor.sqrt(tensor.sqrt(v_t)+e) 
            
        eps = trng.normal(p.get_value().shape, avg = 0.0, std = 1.0, 
                          dtype=theano.config.floatX)
            
        m_t = -lr*g/pcder + (1. - alpha_t) * m + (tensor.sqrt(2*lr*v_t/(i_t ** anne_rate)/nframes) *eps) * tensor.lt(eidx, 0.15*max_epoch)
        p_t = p + (m_t/ pcder)
        
        updates.append((alpha, alpha_t))
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    
    f_update = theano.function([lr,eidx,nframes,max_epoch], [], updates=updates)
    
    return f_grad_shared, f_update
Example #21
def encoder(tparams, state_below, mask, seq_output=False, prefix='gru_encoder'):
    
    """ state_below: size of n_steps * n_samples * n_x 
    """

    n_steps = state_below.shape[0]
    n_samples = state_below.shape[1]

    n_h = tparams[_p(prefix,'Ux')].shape[1]

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n*dim:(n+1)*dim]
        return _x[:, n*dim:(n+1)*dim]

    state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + \
                    tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + \
                    tparams[_p(prefix, 'bx')]

    def _step(m_, x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        u = tensor.nnet.sigmoid(_slice(preact, 1, n_h))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h
        h = m_[:,None] * h + (1. - m_)[:,None] * h_

        return h

    seqs = [mask, state_below_, state_belowx]

    rval, updates = theano.scan(_step,
                                sequences=seqs,
                                outputs_info = [tensor.alloc(numpy_floatX(0.),
                                                             n_samples, n_h)],
                                non_sequences = [tparams[_p(prefix, 'U')],
                                                 tparams[_p(prefix, 'Ux')]],
                                name=_p(prefix, '_layers'),
                                n_steps=n_steps,
                                strict=True)
    if seq_output:
        return rval
    else:
        # size of n_samples * n_h
        return rval[-1]  
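For reference, a minimal sketch of the shared parameters this GRU encoder expects. The shapes follow from the two-gate slicing in _step and from n_h = Ux.shape[1]; _p(prefix, name) is assumed to join its arguments with an underscore, and the initializer is illustrative only.

import numpy as np
import theano

def init_gru_encoder_params(n_x, n_h, prefix='gru_encoder'):
    # W/U/b feed the reset and update gates (2 * n_h wide),
    # Wx/Ux/bx feed the candidate hidden state (n_h wide).
    floatX = theano.config.floatX
    return {
        prefix + '_W':  theano.shared(0.01 * np.random.randn(n_x, 2 * n_h).astype(floatX)),
        prefix + '_U':  theano.shared(0.01 * np.random.randn(n_h, 2 * n_h).astype(floatX)),
        prefix + '_b':  theano.shared(np.zeros(2 * n_h, dtype=floatX)),
        prefix + '_Wx': theano.shared(0.01 * np.random.randn(n_x, n_h).astype(floatX)),
        prefix + '_Ux': theano.shared(0.01 * np.random.randn(n_h, n_h).astype(floatX)),
        prefix + '_bx': theano.shared(np.zeros(n_h, dtype=floatX)),
    }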
Example #22
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8, clip_norm=5):
    """ default: lr=0.0002 
        This is the implementation of the Adam algorithm
        Reference: http://arxiv.org/pdf/1412.6980v8.pdf
    """

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g * clip_norm / (norm + e) for g in grads]
    zero = numpy.float32(0)
    gshared = [
        theano.shared(p.get_value() * zero, name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]

    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    updates = []

    i = theano.shared(numpy_floatX(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)
    _s = tensor.scalar('s', dtype='float32')
    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * zero)
        v = theano.shared(p.get_value() * zero)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        if tensor.eq(_s, 0.) and (p.name == 'gp_beta' or p.name == 'gp_alpha'
                                  or p.name == 'r'):
            p_t = p - (_s * lr_t * g_t)
        #elif tensor.eq(_s,1.) and (p.name is not 'gp_beta' and p.name is not 'gp_alpha' and  p.name is not 'r'):
        #    p_t = p - ((1-_s) * lr_t * g_t)
        if p.name == 'e_beta' or p.name == 'd_beta':
            p_t = p_t * (p_t > 0)
        elif p.name == 'gp_beta' or p.name == 'gp_alpha':
            m_t = m_t.astype('float32')
            v_t = v_t.astype('float32')
            p_t = p_t.astype('float32')
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr, _s], [], updates=updates)

    return f_grad_shared, f_update
Example #23
def fully_layer(params, input, results, nCategories=101, nout=512, weights_path=None):
    trng = RandomStreams(SEED)
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))
    ninput = tensor.prod(input.shape[1:])
    denselayer1 = tensor.dot(input, params['fc1_w']) + params['fc1_b']
    denselayer1 = relu(denselayer1)
    denselayer2 = tensor.dot(denselayer1, params['fc2_w']) + params['fc2_b']
    denselayer2 = relu(denselayer2)
    results['fc1'] = denselayer1
    results['fc2'] = denselayer2

    return params, results
Example #24
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentences, size of n_steps * n_samples
    x = tensor.matrix('x', dtype='int64')
    # the corresponding masks padding zeros
    mask = tensor.matrix('mask', dtype=config.floatX)
    # size of n_z * n_samples
    z = tensor.matrix('z', dtype=config.floatX)
    z = dropout(z, trng, use_noise)

    n_steps = x.shape[0]  # the sentence length in this mini-batch
    n_samples = x.shape[1]  # the number of sentences in this mini-batch

    n_x = tparams['Wemb'].shape[1]  # the dimension of the word embedding

    emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x])
    emb = dropout(emb, trng, use_noise)

    # decoding the sentence vector z back into the original sentence
    h_decoder = decoder_layer(tparams, emb, z, mask=mask)
    h_decoder = dropout(h_decoder, trng, use_noise)

    shape = h_decoder.shape
    h_decoder = h_decoder.reshape((shape[0] * shape[1], shape[2]))

    Vhid = tensor.dot(tparams['Vhid'], tparams['Wemb'].T)
    pred_x = tensor.dot(h_decoder, Vhid) + tparams['bhid']
    pred = tensor.nnet.softmax(pred_x)

    x_vec = x.reshape((shape[0] * shape[1], ))

    index = tensor.arange(shape[0] * shape[1])

    pred_word = pred[index, x_vec]
    mask_word = mask.reshape((shape[0] * shape[1], ))

    index_list = theano.tensor.eq(mask_word, 1.).nonzero()[0]

    pred_word = pred_word[index_list]

    # the cross-entropy loss
    cost = -tensor.log(pred_word + 1e-6).sum() / n_samples

    f_pred_prob = theano.function([x, mask, z], pred_word, name='f_pred_prob')

    return use_noise, x, mask, z, f_pred_prob, cost
Example #25
def decoder_layer(tparams, state_below, prefix='decoder_gru'):
    """ state_below: size of n_steps * n_samples * n_x 
    """

    nsteps = state_below.shape[0]
    n_h = tparams[_p(prefix, 'Ux')].shape[1]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1

    def _slice(_x, n, dim):
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]

    state_below_ = tensor.dot(state_below, tparams[_p(
        prefix, 'W')]) + tparams[_p(prefix, 'b')]
    state_belowx = tensor.dot(state_below, tparams[_p(
        prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]

    def _step_slice(x_, xx_, h_, U, Ux):
        preact = tensor.dot(h_, U)
        preact += x_

        r = tensor.nnet.sigmoid(_slice(preact, 0, n_h))
        u = tensor.nnet.sigmoid(_slice(preact, 1, n_h))

        preactx = tensor.dot(h_, Ux)
        preactx = preactx * r
        preactx = preactx + xx_

        h = tensor.tanh(preactx)

        h = u * h_ + (1. - u) * h

        return h

    seqs = [state_below_, state_belowx]
    _step = _step_slice

    rval, updates = theano.scan(
        _step,
        sequences=seqs,
        outputs_info=[tensor.alloc(numpy_floatX(0.), n_samples, n_h)],
        non_sequences=[tparams[_p(prefix, 'U')], tparams[_p(prefix, 'Ux')]],
        name=_p(prefix, '_layers'),
        n_steps=nsteps,
        strict=True)

    return rval
Example #26
def pSGLD_test(tparams,
               cost,
               inps,
               lr,
               rho=0.99,
               epsilon=1e-6,
               eta=0.01,
               anne_rate=0.55,
               clip_norm=5):
    """ default: lr=0.001 """

    trng = RandomStreams(123)

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g * clip_norm / norm for g in grads]

    gshared = [
        theano.shared(p.get_value() * 0., name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.))
    i_t = i + 1.

    for p, g in zip(tparams.values(), gshared):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g**2
        updates.append((acc, acc_new))

        G = tensor.sqrt(acc_new + epsilon)

        eps = trng.normal(p.get_value().shape,
                          avg=0.0,
                          std=1.0,
                          dtype=theano.config.floatX)

        updated_p = p - lr * g / G + tensor.sqrt(
            lr / G) * eta / (1 + i_t)**anne_rate * eps
        updates.append((p, updated_p))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
Example #27
def pred_error(f_pred, prepare_data, data, iterator, verbose=False):
    """ compute the prediction error. 
    """
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  np.array(data[1])[valid_index],
                                  maxlen=None)
        preds = f_pred(x, mask)
        targets = np.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])

    return valid_err
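pred_error consumes an iterator of (batch id, sample-index list) pairs. A minimal sketch of building one, modelled on the usual get_minibatches_idx helper (the name and the shuffling option are assumptions):

import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    # Returns a list of (batch_number, array_of_sample_indices) pairs,
    # the shape of iterator that pred_error expects.
    idx_list = np.arange(n, dtype='int64')
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = [idx_list[i:i + minibatch_size]
                   for i in range(0, n, minibatch_size)]
    return list(enumerate(minibatches))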
Example #28
def pred_error(f_pred, prepare_data, data, iterator, fname='', verbose=False):
    """
    Just compute the error
    f_pred: Theano fct computing the prediction
    prepare_data: usual prepare_data for that dataset.
    """
    valid_err = 0
    preds_all = []
    targets_all = []
    if verbose:
        true_labels = []
        f_label = open("paper2_labels_without_175_repeat_with_3200.txt", "r")
        for line_label in f_label:
            true_labels.append(int(line_label.strip()))
        f_label.close()
        f_out = open(fname + 'hdf_out_dscnn.txt', 'w')
        cat_ind = ['1', '2']
        cnt = 0
    for b, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  numpy.array(data[1])[valid_index])
        preds = f_pred(x, mask)
        for pred_item in preds:
            preds_all.append(pred_item)
        targets = numpy.array(data[1])[valid_index]
        for target_item in targets:
            targets_all.append(target_item)

        if verbose:
            for i in range(len(preds)):
                p = preds[i]
                if p != targets[i]:
                    f_out.write('*')
                else:
                    f_out.write(' ')
                f_out.write(str(preds[i]) + ' ')
                f_out.write(str(targets[i]) + ' ')
                f_out.write(cat_ind[p] + ' ')
                f_out.write(str(true_labels[cnt]) + '\n')
                cnt += 1
        equals = 0
        for i in range(len(preds)):
            if preds[i] == targets[i]:
                equals += 1
        valid_err += equals
    valid_err = 1. - numpy_floatX(valid_err) / len(data[0])
    print 'len(preds_all):', len(preds_all)
    print 'len(targets_all):', len(targets_all)
    return valid_err * 100, preds_all, targets_all
Example #29
    def rmsprop(self, lr, tparams, grads, inp_list, cost, params):
        clip = params['grad_clip']
        decay_rate = tensor.constant(params['decay_rate'],
                                     dtype=theano.config.floatX)
        smooth_eps = tensor.constant(params['smooth_eps'],
                                     dtype=theano.config.floatX)
        zipped_grads = [
            theano.shared(np.zeros_like(p.get_value()), name='%s_grad' % k)
            for k, p in tparams.iteritems()
        ]
        running_grads2 = [
            theano.shared(np.zeros_like(p.get_value()), name='%s_rgrad2' % k)
            for k, p in tparams.iteritems()
        ]
        zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
        if clip > 0.0:
            rg2up = [(rg2,
                      tensor.clip(
                          decay_rate * rg2 + (1 - decay_rate) *
                          (tensor.clip(g, -clip, clip)**2), 0.0, np.inf))
                     for rg2, g in zip(running_grads2, grads)]
        else:
            rg2up = [(rg2,
                      tensor.clip(decay_rate * rg2 + (1 - decay_rate) * (g**2),
                                  0.0, np.inf))
                     for rg2, g in zip(running_grads2, grads)]

        f_grad_shared = theano.function(inp_list,
                                        cost,
                                        updates=zgup + rg2up,
                                        name='rmsprop_f_grad_shared')

        updir = [
            theano.shared(p.get_value() * numpy_floatX(0.),
                          name='%s_updir' % k) for k, p in tparams.iteritems()
        ]
        updir_new = [
            (ud, -lr * zg / (tensor.sqrt(rg2) + smooth_eps))
            for ud, zg, rg2 in zip(updir, zipped_grads, running_grads2)
        ]
        param_up = [(p, p + udn[1])
                    for p, udn in zip(tparams.values(), updir_new)]
        f_update = theano.function([lr], [],
                                   updates=updir_new + param_up,
                                   on_unused_input='ignore',
                                   name='rmsprop_f_update')

        return f_grad_shared, f_update, zipped_grads, running_grads2, updir
Example #30
 def build_model(self, tparams):
   # sents -> word_indices * #batch_size
   sents = tensor.matrix('sents', dtype="int64")
   # mask -> n_word_indices * #batch_size
   mask = tensor.matrix('mask', dtype=config.floatX)
   # imgs -> #4098 * #batch_size
   imgs = tensor.matrix('imgs', dtype=config.floatX)
   # gt_sents -> word_indices * #batch_size
   gt_sents = tensor.matrix('gt_sents', dtype="int64")
   # Used for dropout.
   use_noise = theano.shared(numpy_floatX(1.))
       
   with open("testTagData.pkl", "rb") as f:
     sents_tag, mask_tag, imgs_tag, gt_sents_tag = pickle.load(f)
   sents.tag.test_value = sents_tag
   mask.tag.test_value = mask_tag
   imgs.tag.test_value = imgs_tag
   gt_sents.tag.test_value = gt_sents_tag
   
   n_timesteps = sents.shape[0]
   n_samples = sents.shape[1]
   
   # Image encoding
   # Xe -> #batch_size * #image_encoding_size
   x_e = (tensor.dot(imgs.T, tparams['We']) + tparams['be'])
   # sentences (i.e. captions) encoding
   # Xs -> #no_of_words * #batch_size * #word_encoding_size
   x_s = tparams['Ws'][sents.flatten()].reshape([n_timesteps,
                                               n_samples,
                                               self.word_img_embed_hidden_dim])
   
   # Xes has the image vector as the first timestep
   # Xes -> #no_timesteps (no_of_words + 1 (for image)) * #batch_size * #word_image_encoding_size
   x_es = tensor.zeros([n_timesteps + 1, n_samples, self.word_img_embed_hidden_dim], dtype=config.floatX)
   x_es = tensor.set_subtensor(x_es[1:], x_s)
   x_es = tensor.set_subtensor(x_es[0], x_e)
   
   mask_es = tensor.ones([mask.shape[0] + 1, mask.shape[1]], dtype=config.floatX)
   mask_es = tensor.set_subtensor(mask_es[1:], mask)
   
   # pred_softmax -> #batch_size * #no_of_words * #vocab_size
   pred_softmax = self._lstm_build_model(tparams, x_es, mask_es, use_noise)
   cost = negative_log_likelihood(pred_softmax, gt_sents)
   # pred_prob = lstm_output.max(axis=2)
   # pred = lstm_output.argmax(axis=2)
   
   return sents, mask, imgs, gt_sents, use_noise, cost
Example #31
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # input sentence: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    # label: (n_samples,)
    y = tensor.vector('y', dtype='int32')

    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(),
                                               dtype='int32')].reshape(
                                                   (x.shape[0], 1, x.shape[1],
                                                    tparams['Wemb'].shape[1]))
    layer0_input = dropout(layer0_input, trng, use_noise)

    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams,
                             layer0_input,
                             filter_shape=filter_shape,
                             pool_size=pool_size,
                             prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input = tensor.concatenate(layer1_inputs, 1)
    layer1_input = dropout(layer1_input, trng, use_noise)

    # the predicted label probabilities
    pred = tensor.nnet.softmax(
        tensor.dot(layer1_input, tparams['Wy']) + tparams['by'])

    f_pred_prob = theano.function([x], pred, name='f_pred_prob')
    f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred')

    # get the expression of how we calculate the cost function
    # i.e. cross-entropy loss
    index = tensor.arange(x.shape[0])
    cost = -tensor.log(pred[index, y] + 1e-6).mean()

    return use_noise, x, y, f_pred_prob, f_pred, cost
Example #32
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # n_samples * n_chars
    x = tensor.matrix('x', dtype='int32')
    y = tensor.matrix('y', dtype='int32')
    # (ncons*n_samples) * n_chars
    cy = tensor.matrix('cy', dtype='int32')

    # n_samples * n_h
    tmp_x = tensor.tanh(tensor.dot(x, tparams['W1']) + tparams['b1'])
    tmp_y = tensor.tanh(tensor.dot(y, tparams['W1']) + tparams['b1'])
    # (ncons*n_samples) * n_h
    tmp_cy = tensor.tanh(tensor.dot(cy, tparams['W1']) + tparams['b1'])

    # n_samples * n_h
    feats_x = tensor.tanh(tensor.dot(tmp_x, tparams['W2']) + tparams['b2'])
    feats_y = tensor.tanh(tensor.dot(tmp_y, tparams['W2']) + tparams['b2'])
    # (ncons*n_samples) * n_h
    feats_cy = tensor.tanh(tensor.dot(tmp_cy, tparams['W2']) + tparams['b2'])

    feats_x = dropout(feats_x, trng, use_noise)
    feats_y = dropout(feats_y, trng, use_noise)
    feats_cy = dropout(feats_cy, trng, use_noise)

    feats_x = l2norm(feats_x)
    feats_y = l2norm(feats_y)
    feats_cy = l2norm(feats_cy)

    # Tile by number of contrast terms
    # (ncon*n_samples) * n_h
    feats_x = tensor.tile(feats_x, (options['ncon'], 1))
    feats_y = tensor.tile(feats_y, (options['ncon'], 1))

    cost = tensor.log(1 + tensor.sum(
        tensor.exp(-options['gamma'] * ((feats_x * feats_y).sum(axis=1) -
                                        (feats_x * feats_cy).sum(axis=1)))))

    return use_noise, [x, y, cy], cost
Example #33
def build_model(tparams, options):
    """first model - blind single answer qa without masking the response at ?

    :tparams: TODO
    :options: TODO
    :returns: TODO

    """
    trng = RandomStreams(SEED)

    # Used for dropout
    use_noise = theano.shared(utils.numpy_floatX(0.))

    x = tensor.matrix('x', dtype='int64')
    mask = tensor.matrix('mask', dtype=config.floatX)
    y = tensor.vector('y', dtype='int64')

    n_timesteps = x.shape[0]
    n_samples = x.shape[1]

    emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                                n_samples,
                                                options['dim_proj']])
    proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                            prefix=options['encoder'],
                                            mask=mask)

    if options['use_dropout']:
        proj = utils.dropout_layer(proj, use_noise, trng)

    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
Example #34
def build_model(tparams,options):
    
    trng = RandomStreams(SEED)
    
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))
    
    # input sentence: n_steps * n_samples
    x = tensor.matrix('x', dtype='int32')
    mask = tensor.matrix('mask', dtype=config.floatX)
    
    # label: (n_samples,)
    y = tensor.vector('y',dtype='int32')

    n_steps = x.shape[0] # the length of the longest sentence in this minibatch
    n_samples = x.shape[1] # how many samples we have in this minibatch
    n_x = tparams['Wemb'].shape[1] # the dimension of the word-embedding
    
    emb = tparams['Wemb'][x.flatten()].reshape([n_steps,n_samples,n_x])  
    emb = dropout(emb, trng, use_noise)
                        
    # encoding of the sentence, size of n_samples * n_h                                                               
    h_encoder = encoder(tparams, emb, mask=mask, prefix='lstm_encoder')
    h_encoder_rev = encoder(tparams, emb[::-1], mask=mask[::-1], prefix='lstm_encoder_rev')
    
    # size of n_samples * (2*n_h) 
    z = tensor.concatenate((h_encoder,h_encoder_rev),axis=1) 
    z = dropout(z, trng, use_noise)  
    
    # the predicted label probabilities
    # size of n_samples * n_y
    pred = tensor.nnet.softmax(tensor.dot(z, tparams['Wy'])+tparams['by'])
    
    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    # cost expression: cross-entropy loss
    index = tensor.arange(n_samples)
    cost = -tensor.log(pred[index, y] + 1e-6).mean()                          

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost
Ejemplo n.º 35
0
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8):
    """ default: lr=0.0002 
        This is the implementation of the Adam algorithm
        Reference: http://arxiv.org/pdf/1412.6980v8.pdf
    """

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # Clip by global norm. The comparison is symbolic, so it has to stay in
    # the graph via tensor.switch; a Python `if` on a Theano expression
    # cannot branch per mini-batch.
    grads = [tensor.switch(tensor.ge(norm, 5), g * 5 / norm, g) for g in grads]

    gshared = [
        theano.shared(p.get_value() * 0., name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
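
For reference, a pure-numpy transcription of a single parameter's update exactly as written above, using toy values; b1 and b2 multiply the new gradient, i.e. they act as 1 - beta1 and 1 - beta2 in the paper's notation:

import numpy as np

# One Adam step in numpy, mirroring the symbolic updates above.
lr, b1, b2, e = 0.0002, 0.1, 0.001, 1e-8
p = np.array([0.5, -0.3])   # toy parameter vector
g = np.array([0.1, 0.02])   # its (already clipped) gradient
m = np.zeros_like(p)        # first-moment running average
v = np.zeros_like(p)        # second-moment running average
i_t = 1.0                   # incremented step counter

fix1 = 1. - b1 ** i_t
fix2 = 1. - b2 ** i_t
lr_t = lr * (np.sqrt(fix2) / fix1)
m_t = b1 * g + (1. - b1) * m
v_t = b2 * g ** 2 + (1. - b2) * v
p_t = p - lr_t * (m_t / (np.sqrt(v_t) + e))
print(p_t)
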
Ejemplo n.º 36
0
def build_model(tparams, options):

    trng = RandomStreams(SEED)

    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))

    # x: n_steps * n_samples
    x = tensor.matrix('x', dtype='int64')
    y = tensor.matrix('y', dtype='int64')

    n_steps = x.shape[0]
    n_samples = x.shape[1]

    n_x = tparams['Wemb'].shape[1]

    emb = tparams['Wemb'][x.flatten()].reshape([n_steps, n_samples, n_x])
    emb = dropout(emb, trng, use_noise)

    h_decoder = decoder_layer(tparams, emb, prefix='decoder_h1')
    h_decoder = dropout(h_decoder, trng, use_noise)

    h_decoder = decoder_layer(tparams, h_decoder, prefix='decoder_h2')
    h_decoder = dropout(h_decoder, trng, use_noise)

    # n_steps * n_samples * n_h
    shape = h_decoder.shape
    h_decoder = h_decoder.reshape((shape[0] * shape[1], shape[2]))

    pred = tensor.dot(h_decoder, tparams['Vhid']) + tparams['bhid']
    pred = tensor.nnet.softmax(pred)

    y_vec = y.reshape((shape[0] * shape[1], ))
    index = tensor.arange(shape[0] * shape[1])
    y_pred = pred[index, y_vec]

    f_pred_prob = theano.function([x, y], y_pred, name='f_pred_prob')
    cost = -tensor.log(y_pred + 1e-6).sum() / n_steps / n_samples

    return use_noise, x, y, f_pred_prob, cost
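
Because the cost is the mean negative log-probability per token, exp(cost) is the model's per-token perplexity. A small numpy sketch of the same flatten-and-index bookkeeping, with made-up uniform predictions:

import numpy as np

# Flatten (n_steps, n_samples) targets, pick each target's probability,
# and average the negative log over all tokens; exp of that is perplexity.
n_steps, n_samples, vocab = 2, 2, 3
pred = np.full((n_steps * n_samples, vocab), 1.0 / vocab)  # uniform toy predictions
y = np.array([[0, 1],
              [2, 0]])                                     # n_steps x n_samples targets
y_vec = y.reshape(n_steps * n_samples)
y_pred = pred[np.arange(n_steps * n_samples), y_vec]
cost = -np.log(y_pred + 1e-6).sum() / n_steps / n_samples
print(cost, np.exp(cost))  # approximately log(3) and 3 for a uniform model
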
Ejemplo n.º 37
0
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8, clip_norm=5):
    """ default: lr=0.0002 
        This is the implementation of the Adam algorithm
        Reference: http://arxiv.org/pdf/1412.6980v8.pdf
    """
    
    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # Clip by global norm inside the graph with tensor.switch (a Python `if`
    # cannot branch on a symbolic expression).
    grads = [tensor.switch(tensor.ge(norm, clip_norm),
                           g * clip_norm / norm, g) for g in grads]
    
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad'%k) 
                for k, p in tparams.iteritems()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)
    
    updates = []

    i = theano.shared(numpy_floatX(0.))    
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))
    
    f_update = theano.function([lr], [], updates=updates)
    
    return f_grad_shared, f_update   
Ejemplo n.º 38
0
def build_model(tparams,options):
    
    trng = RandomStreams(SEED)
    
    # Used for dropout.
    use_noise = theano.shared(numpy_floatX(0.))
    
    # input sentence: n_samples * n_steps (samples along the first axis)
    x = tensor.matrix('x', dtype='int32')
    # label: (n_samples,)
    y = tensor.vector('y',dtype='int32')
    
    layer0_input = tparams['Wemb'][tensor.cast(x.flatten(), dtype='int32')].reshape(
        (x.shape[0], 1, x.shape[1], tparams['Wemb'].shape[1]))
    layer0_input = dropout(layer0_input, trng, use_noise)
 
    layer1_inputs = []
    for i in xrange(len(options['filter_hs'])):
        filter_shape = options['filter_shapes'][i]
        pool_size = options['pool_sizes'][i]
        conv_layer = encoder(tparams, layer0_input, filter_shape=filter_shape,
                             pool_size=pool_size, prefix=_p('cnn_encoder', i))
        layer1_input = conv_layer
        layer1_inputs.append(layer1_input)
    layer1_input = tensor.concatenate(layer1_inputs, 1)
    layer1_input = dropout(layer1_input, trng, use_noise) 
    
    # predicted label distribution, size n_samples * n_y
    pred = tensor.nnet.softmax(tensor.dot(layer1_input, tparams['Wy']) + tparams['by'])
    
    f_pred_prob = theano.function([x], pred, name='f_pred_prob')
    f_pred = theano.function([x], pred.argmax(axis=1), name='f_pred')

    # cost expression: cross-entropy loss
    index = tensor.arange(x.shape[0])
    cost = -tensor.log(pred[index, y] + 1e-6).mean()                          

    return use_noise, x, y, f_pred_prob, f_pred, cost
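
The options consumed above ('filter_shapes', 'pool_sizes') are built elsewhere in this project; a purely illustrative construction from the filter heights might look like the following (img_h, img_w and feature_maps are assumed values, not taken from this code):

# Hypothetical construction of the CNN encoder options used above.
img_h = 60          # assumed maximum sentence length in tokens
img_w = 300         # assumed word-embedding dimension
feature_maps = 100  # assumed number of feature maps per filter size
filter_hs = [3, 4, 5]

filter_shapes = []
pool_sizes = []
for filter_h in filter_hs:
    # one convolution per n-gram size, max-pooled over the whole sentence
    filter_shapes.append((feature_maps, 1, filter_h, img_w))
    pool_sizes.append((img_h - filter_h + 1, 1))

options = {'filter_hs': filter_hs,
           'filter_shapes': filter_shapes,
           'pool_sizes': pool_sizes}
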
Ejemplo n.º 39
0
def Adam(tparams, cost, inps, lr, b1=0.1, b2=0.001, e=1e-8, clip_norm=5):

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    # Clip by global norm inside the graph with tensor.switch (a Python `if`
    # cannot branch on a symbolic expression).
    grads = [tensor.switch(tensor.ge(norm, clip_norm),
                           g * clip_norm / norm, g) for g in grads]

    gshared = [
        theano.shared(p.get_value() * 0., name='%s_grad' % k)
        for k, p in tparams.iteritems()
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    i = theano.shared(numpy_floatX(0.))
    i_t = i + 1.
    fix1 = 1. - b1**(i_t)
    fix2 = 1. - b2**(i_t)
    lr_t = lr * (tensor.sqrt(fix2) / fix1)

    for p, g in zip(tparams.values(), gshared):
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * g) + ((1. - b1) * m)
        v_t = (b2 * tensor.sqr(g)) + ((1. - b2) * v)
        g_t = m_t / (tensor.sqrt(v_t) + e)
        p_t = p - (lr_t * g_t)
        updates.append((m, m_t))
        updates.append((v, v_t))
        updates.append((p, p_t))
    updates.append((i, i_t))

    f_update = theano.function([lr], [], updates=updates)

    return f_grad_shared, f_update
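
All three Adam variants clip gradients by their global norm. Because norm is a symbolic expression, the comparison has to live inside the compiled graph; a standalone sketch of that tensor.switch idiom on toy vectors, independent of any model here:

import numpy as np
import theano
import theano.tensor as tensor

# Symbolic global-norm clipping: the rescaling is part of the compiled
# function and is applied only when the norm exceeds the threshold.
clip_norm = 5.0
g1 = tensor.vector('g1')
g2 = tensor.vector('g2')
norm = tensor.sqrt(tensor.sum(g1 ** 2) + tensor.sum(g2 ** 2))
clipped = [tensor.switch(tensor.ge(norm, clip_norm), g * clip_norm / norm, g)
           for g in (g1, g2)]
f_clip = theano.function([g1, g2], clipped)

a = np.asarray([3.0, 4.0], dtype=theano.config.floatX)
b = np.asarray([0.0, 12.0], dtype=theano.config.floatX)
print(f_clip(a, b))  # global norm is 13 > 5, so both vectors are rescaled
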
Ejemplo n.º 40
0
def SGMGHMC(tparams,
            cost,
            inps,
            ntrain,
            lr,
            iterations,
            rho=0.9,
            epsilon=1e-6,
            clip_norm=1):
    """ Additional parameters """
    mom_tparams = OrderedDict()
    xi_tparams = OrderedDict()
    for k, p0 in tparams.iteritems():
        mom_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10,
                                       name='%s_mom' % k)
        xi_tparams[k] = theano.shared(p0.get_value() * 0. + 1e-10,
                                      name='%s_xi' % k)

    #a = theano.shared(numpy_floatX(1.))
    m = theano.shared(numpy_floatX(1.))
    c = theano.shared(numpy_floatX(5.))
    sigma_p = theano.shared(numpy_floatX(1.))
    sigma_xi = theano.shared(numpy_floatX(1.))
    gamma_xi = theano.shared(numpy_floatX(0.001))
    logger = logging.getLogger('eval_ptb_sgmgnht')
    logger.setLevel(logging.INFO)
    fh = logging.FileHandler('eval_ptb_sgmgnht.log')
    logger.addHandler(fh)  # attach the handler, otherwise nothing reaches the log file
    logger.info('a = 1, m {} c {} s_p{} s_xi{} g_xi{}'.format(
        m.get_value(), c.get_value(), sigma_p.get_value(),
        sigma_xi.get_value(), gamma_xi.get_value()))

    # Note: this symbolic vector is immediately shadowed by the shared
    # parameters iterated over in the update loop below.
    p = tensor.vector('p', dtype=theano.config.floatX)
    # default: lr=0.001

    trng = RandomStreams(123)

    grads = tensor.grad(cost, tparams.values())
    norm = tensor.sqrt(sum([tensor.sum(g**2) for g in grads]))
    if tensor.ge(norm, clip_norm):
        grads = [g * clip_norm / norm for g in grads]

    gshared = [
        theano.shared(p0.get_value() * 0., name='%s_grad' % k)
        for k, p0 in tparams.iteritems()
    ]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]
    f_grad_shared = theano.function(inps, cost, updates=gsup)

    updates = []

    for p, mom, xi, g in zip(tparams.values(), mom_tparams.values(),
                             xi_tparams.values(), gshared):

        g_f = mom / m
        K_f = -g_f + 2 / c * (c * g_f + tensor.log(1 + tensor.exp(-c * g_f)))

        psi_f_1 = (1 - tensor.exp(-c * g_f)) / (1 + tensor.exp(-c * g_f))
        f1_f_1 = 1 / m * psi_f_1
        psi_grad_f_1 = 2 * c * tensor.exp(
            -c * g_f) / (1 + tensor.exp(-c * g_f))**2
        f3_f_1 = 1 / m**2 * (psi_f_1**2 - psi_grad_f_1)

        psi_f = (tensor.exp(c * g_f) - 1) / (tensor.exp(c * g_f) + 1)
        f1_f = 1 / m * psi_f
        psi_grad_f = 2 * c * tensor.exp(c * g_f) / (tensor.exp(c * g_f) + 1)**2
        f3_f = 1 / m**2 * (psi_f**2 - psi_grad_f)

        temp_f1 = tensor.switch(tensor.ge(g_f, 0), f1_f_1, f1_f)
        temp_f3 = tensor.switch(tensor.ge(g_f, 0), f3_f_1, f3_f)

        noise_p = trng.normal(p.get_value().shape,
                              avg=0.0,
                              std=1.,
                              dtype=theano.config.floatX)
        noise_xi = trng.normal(p.get_value().shape,
                               avg=0.0,
                               std=1.,
                               dtype=theano.config.floatX)
        # generate gamma(a,2) noise: N(0,1)^2 ~ gamma(1/2,2), so the sum of
        # two squared normals below is gamma(1,2)
        noise_temp = tensor.zeros(p.get_value().shape)
        for aa in xrange(2):
            this_noise = trng.normal(p.get_value().shape,
                                     avg=0.0,
                                     std=1.,
                                     dtype=theano.config.floatX)
            noise_temp = tensor.inc_subtensor(noise_temp[:], this_noise**2)
        randmg = (noise_temp * m / 2) * tensor.sgn(
            trng.normal(p.get_value().shape,
                        avg=0.0,
                        std=1.,
                        dtype=theano.config.floatX))

        updated_p = p + temp_f1 * lr
        updated_mom = (mom - temp_f1 * xi * lr - g * lr * ntrain +
                       tensor.sqrt(2 * sigma_p * lr) * noise_p) * (
                           1 - tensor.eq(tensor.mod(iterations, 50), 0)
                       ) + randmg * tensor.eq(tensor.mod(iterations, 50), 0)
        #updated_mom = mom - temp_f1* xi *lr  - g * lr * ntrain + tensor.sqrt(2*sigma_p*lr) * noise_p
        temp_xi = trng.normal(p.get_value().shape,
                              avg=sigma_p,
                              std=tensor.sqrt(sigma_xi / 2),
                              dtype=theano.config.floatX)
        updated_xi = (xi + temp_f3 * sigma_xi * lr -
                      (xi - sigma_p) * gamma_xi * lr +
                      tensor.sqrt(2 * sigma_xi * gamma_xi * lr) * noise_xi) * (
                          1 - tensor.eq(tensor.mod(iterations, 100), 50)
                      ) + temp_xi * tensor.eq(tensor.mod(iterations, 100), 50)

        updates.append((p, updated_p))
        updates.append((mom, updated_mom))
        updates.append((xi, updated_xi))

    f_update = theano.function([lr, ntrain, iterations], [p, mom, xi],
                               updates=updates)
    #f_params = theano.function([], [a, m, c, mom.shape])
    return f_grad_shared, f_update
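
The branching between psi_f_1 and psi_f above is purely for numerical stability: both are the same function tanh(c * g_f / 2), written with exp(-c * g_f) for non-negative g_f and with exp(c * g_f) otherwise so the exponentials never overflow. A quick numpy check of that equivalence with toy values for c and g_f:

import numpy as np

# Verify the two algebraically equivalent forms of psi used in SGMGHMC.
c = 5.0
g_f = np.linspace(-3, 3, 7)
psi_neg_exp = (1 - np.exp(-c * g_f)) / (1 + np.exp(-c * g_f))  # form used when g_f >= 0
psi_pos_exp = (np.exp(c * g_f) - 1) / (np.exp(c * g_f) + 1)    # form used when g_f < 0
print(np.allclose(psi_neg_exp, np.tanh(c * g_f / 2)))  # True
print(np.allclose(psi_pos_exp, np.tanh(c * g_f / 2)))  # True
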