def model(x, embedding_size, n_hidden):

    # input (embedding lookup) and recurrent weights
    U = shared_glorot_uniform((embedding_size, n_hidden), name="U")
    W = shared_glorot_uniform((n_hidden, n_hidden), name="W")
    bh = shared_zeros((n_hidden, ), name="bh")

    # output weights
    V = shared_glorot_uniform((n_hidden, embedding_size), name="V")
    by = shared_zeros((embedding_size, ), name="by")

    params = [U, V, W, by, bh]

    def step(x_t, h_tm1):
        h_t = T.tanh(U[x_t] + T.dot(h_tm1, W) + bh)
        y_t = T.dot(h_t, V) + by
        return h_t, y_t

    h0 = shared_zeros((n_hidden, ), name='h0')
    [h, y_pred], _ = theano.scan(step,
                                 sequences=x,
                                 outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
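
All of these snippets lean on a couple of shared-variable helpers (`shared_glorot_uniform`, `shared_zeros`) defined elsewhere in the project. A minimal sketch of plausible implementations, assuming Glorot-uniform initialization and Theano's configured float type (the originals may differ in detail):

import numpy as np
import theano
import theano.tensor as T

def shared_zeros(shape, name=None):
    # Zero-initialized shared variable in Theano's configured float dtype.
    return theano.shared(np.zeros(shape, dtype=theano.config.floatX), name=name)

def shared_glorot_uniform(shape, name=None):
    # Glorot/Xavier uniform init: U(-r, r) with r = sqrt(6 / (fan_in + fan_out)).
    r = np.sqrt(6.0 / (shape[0] + shape[-1]))
    value = np.random.uniform(-r, r, size=shape).astype(theano.config.floatX)
    return theano.shared(value, name=name)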
Example #2
def model(inputs, _is_training, params, batch_size, hidden_size, drop_i, drop_s, init_scale, init_H_bias, tied_noise, _theano_rng):
    noise_i_for_i = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng)
    noise_i_for_f = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng) if not tied_noise else noise_i_for_i
    noise_i_for_c = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng) if not tied_noise else noise_i_for_i
    noise_i_for_o = get_dropout_noise((batch_size, hidden_size), drop_i, _theano_rng) if not tied_noise else noise_i_for_i

    i_for_i = ifelse(_is_training, inputs * noise_i_for_i, inputs)
    i_for_f = ifelse(_is_training, inputs * noise_i_for_f, inputs)
    i_for_c = ifelse(_is_training, inputs * noise_i_for_c, inputs)
    i_for_o = ifelse(_is_training, inputs * noise_i_for_o, inputs)

    i_for_i = linear.model(i_for_i, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)
    i_for_f = linear.model(i_for_f, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)
    i_for_c = linear.model(i_for_c, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)
    i_for_o = linear.model(i_for_o, params, hidden_size, hidden_size, init_scale, bias_init=init_H_bias)

    # Dropout noise for recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)
    if not tied_noise:
        noise_s = T.stack(noise_s,
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng),
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng),
                          get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng))


    def step(i_for_i_t, i_for_f_t, i_for_c_t, i_for_o_t, y_tm1, c_tm1, noise_s):
        noise_s_for_i = noise_s if tied_noise else noise_s[0]
        noise_s_for_f = noise_s if tied_noise else noise_s[1]
        noise_s_for_c = noise_s if tied_noise else noise_s[2]
        noise_s_for_o = noise_s if tied_noise else noise_s[3]

        s_lm1_for_i = ifelse(_is_training, y_tm1 * noise_s_for_i, y_tm1)
        s_lm1_for_f = ifelse(_is_training, y_tm1 * noise_s_for_f, y_tm1)
        s_lm1_for_c = ifelse(_is_training, y_tm1 * noise_s_for_c, y_tm1)
        s_lm1_for_o = ifelse(_is_training, y_tm1 * noise_s_for_o, y_tm1)

        i_t = T.nnet.sigmoid(i_for_i_t + linear.model(s_lm1_for_i, params, hidden_size, hidden_size, init_scale))  # input gate
        f_t = T.nnet.sigmoid(i_for_f_t + linear.model(s_lm1_for_f, params, hidden_size, hidden_size, init_scale))  # forget gate
        c_t = f_t * c_tm1 + i_t * T.tanh(i_for_c_t + linear.model(s_lm1_for_c, params, hidden_size, hidden_size, init_scale))  # cell state
        o_t = T.nnet.sigmoid(i_for_o_t + linear.model(s_lm1_for_o, params, hidden_size, hidden_size, init_scale))  # output gate
        return o_t * T.tanh(c_t), c_t

    y_0 = shared_zeros((batch_size, hidden_size), name='y0')
    c_0 = shared_zeros((batch_size, hidden_size), name='c0')
    [y, c], _ = theano.scan(step,
        sequences=[i_for_i, i_for_f, i_for_c, i_for_o],
        outputs_info=[y_0, c_0],
        non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
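
`get_dropout_noise` and `linear.model` are likewise project helpers that this snippet imports from elsewhere. A plausible sketch of the noise helper, assuming inverted-dropout scaling and a `RandomStreams` instance behind `_theano_rng`:

from theano.tensor.shared_randomstreams import RandomStreams

def get_dropout_noise(shape, dropout_p, theano_rng):
    # Binary keep-mask scaled by 1/keep_p so activations keep their
    # expected value at train time (inverted dropout).
    keep_p = 1.0 - dropout_p
    mask = theano_rng.binomial(size=shape, p=keep_p, n=1,
                               dtype=theano.config.floatX)
    return mask / keep_p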
Example #3
def get_rnn_params(number, n_visible, n_hidden_recurrent):
    w_in_update = shared_normal('w_in_update_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
    w_hidden_update = shared_normal('w_hidden_update_%d' % number, n_hidden_recurrent, n_hidden_recurrent,
                                    scale=0.0001)
    b_update = shared_zeros('b_update_%d' % number, n_hidden_recurrent)
    w_in_reset = shared_normal('w_in_reset_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
    w_hidden_reset = shared_normal('w_hidden_reset_%d' % number, n_hidden_recurrent, n_hidden_recurrent,
                                   scale=0.0001)
    b_reset = shared_zeros('b_reset_%d' % number, n_hidden_recurrent)
    w_in_hidden = shared_normal('w_in_hidden_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
    w_reset_hidden = shared_normal('w_reset_hidden_%d' % number, n_hidden_recurrent, n_hidden_recurrent,
                                   scale=0.0001)
    b_hidden = shared_zeros('b_hidden_%d' % number, n_hidden_recurrent)
    return [w_in_update, w_hidden_update, b_update,
            w_in_reset, w_hidden_reset, b_reset,
            w_in_hidden, w_reset_hidden, b_hidden]
Example #4
    def __init__(self, n_visible, n_hidden=150, n_hidden_recurrent=100, lr=0.001, l2_norm=None, l1_norm=None):
        (v, v_sample, cost, monitor, params, updates_train,
         v_t, updates_generate, n_steps) = build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent, lr, l2_norm=l2_norm,
                                                        l1_norm=l1_norm)

        for param in params:
            gradient = T.grad(cost, param, consider_constant=[v_sample])

            # remove nan and inf values
            not_finite = T.or_(T.isnan(gradient), T.isinf(gradient))
            gradient = T.switch(not_finite, 0.1 * param, gradient)
            # max_grad = param * 1e-3
            # gradient = T.switch(T.gt(gradient, max_grad), max_grad, gradient)

            # momentum
            # velocity = shared_zeros('velocity_' + str(param.name), param.get_value(borrow=True).shape)
            # update = param - T.cast(lr, dtype=dtype) * gradient
            # x = momentum * velocity + update - param
            # updates_train[velocity] = x
            # updates_train[param] = momentum * x + update

            # rmsprop
            accu = shared_zeros('accu_' + str(param.name), param.get_value(borrow=True).shape)
            accu_new = 0.9 * accu + 0.1 * gradient ** 2
            updates_train[accu] = accu_new
            updates_train[param] = param - (lr * gradient / T.sqrt(accu_new + 1e-6))
        self.params = params
        self.train_function = theano.function([v], monitor, updates=updates_train)
        self.generate_function = theano.function([n_steps], v_t, updates=updates_generate)
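
A usage sketch for this trainer; the class name `RnnRbm`, the dataset loop, and the sizes are placeholders, and `build_rnnrbm` is the function shown in Example #7 below:

# Hypothetical training loop; `dataset` yields float32 arrays of shape (T, n_visible).
model = RnnRbm(n_visible=88)                 # e.g. 88 piano-roll pitches
for epoch in range(10):
    for sequence in dataset:
        monitor = model.train_function(sequence)
samples = model.generate_function(200)       # sample 200 new timesteps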
Example #5
def model(inputs, _is_training, params, batch_size, hidden_size, drop_i,
          drop_s, init_scale, init_H_bias, _theano_rng):
    noise_i_for_H = get_dropout_noise((batch_size, hidden_size), drop_i,
                                      _theano_rng)
    i_for_H = ifelse(_is_training, inputs * noise_i_for_H, inputs)
    i_for_H = linear.model(i_for_H,
                           params,
                           hidden_size,
                           hidden_size,
                           init_scale,
                           bias_init=init_H_bias)

    # Dropout noise for recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)

    def step(i_for_H_t, y_tm1, noise_s):
        s_lm1_for_H = ifelse(_is_training, y_tm1 * noise_s, y_tm1)
        return T.tanh(i_for_H_t + linear.model(
            s_lm1_for_H, params, hidden_size, hidden_size, init_scale))

    y_0 = shared_zeros((batch_size, hidden_size), name='y0')
    y, _ = theano.scan(step,
                       sequences=i_for_H,
                       outputs_info=[y_0],
                       non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
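
The returned `sticky_state_updates` are meant to be merged into the update list of the compiled training function, so the last hidden state of one batch seeds the next (stateful truncated BPTT). A wiring sketch, where `targets`, `cost`, and `grad_updates` stand in for a task-specific head:

# Hypothetical wiring; `targets`, `cost`, and `grad_updates` are placeholders.
inputs = T.tensor3('inputs')            # (time, batch, hidden_size), already embedded
is_training = T.iscalar('is_training')  # 1 during training, 0 at test time
params = []
y, y_0, sticky_state_updates = model(inputs, is_training, params, batch_size=32,
                                     hidden_size=256, drop_i=0.25, drop_s=0.25,
                                     init_scale=0.04, init_H_bias=0.0,
                                     _theano_rng=RandomStreams(seed=1))
train_fn = theano.function([inputs, targets], cost,
                           updates=grad_updates + sticky_state_updates)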
Example #6
def model(x, embedding_size, n_hidden):

    # Update gate weights
    W_xz = shared_glorot_uniform((embedding_size, n_hidden))
    W_hz = shared_glorot_uniform((n_hidden, n_hidden))
    b_z = shared_zeros((n_hidden, ))

    # Reset gate weights
    W_xr = shared_glorot_uniform((embedding_size, n_hidden))
    W_hr = shared_glorot_uniform((n_hidden, n_hidden))
    b_r = shared_zeros((n_hidden, ))

    # Hidden layer
    W_xh = shared_glorot_uniform((embedding_size, n_hidden))
    W_hh = shared_glorot_uniform((n_hidden, n_hidden))
    b_h = shared_zeros((n_hidden, ))

    # Output weights
    W_y = shared_glorot_uniform((n_hidden, embedding_size), name="W_y")
    b_y = shared_zeros((embedding_size, ), name="b_y")

    params = [W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_y, b_y]

    def step(x_t, h_tm1):
        z_t = T.nnet.sigmoid(W_xz[x_t] + T.dot(W_hz, h_tm1) + b_z)
        r_t = T.nnet.sigmoid(W_xr[x_t] + T.dot(W_hr, h_tm1) + b_r)
        can_h_t = T.tanh(W_xh[x_t] + r_t * T.dot(W_hh, h_tm1) + b_h)
        h_t = (1 - z_t) * h_tm1 + z_t * can_h_t
        y_t = T.dot(h_t, W_y) + b_y
        return h_t, y_t

    h0 = shared_zeros((n_hidden, ), name='h0')
    [h, y_pred], _ = theano.scan(step,
                                 sequences=x,
                                 outputs_info=[h0, None],
                                 truncate_gradient=10)

    model = T.nnet.softmax(y_pred)
    return model, params
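
Since this GRU `model` returns per-timestep softmax probabilities over the vocabulary, a next-token cross-entropy loss with plain SGD can be attached as follows (a sketch; sizes and the learning rate are illustrative):

x = T.ivector('x')            # input token ids
y_true = T.ivector('y_true')  # target ids (inputs shifted by one step)
probs, params = model(x, embedding_size=1000, n_hidden=500)  # embedding_size doubles as vocab size

# Negative log-likelihood of the correct token at each timestep.
cost = -T.mean(T.log(probs)[T.arange(y_true.shape[0]), y_true])
grads = T.grad(cost, params)
updates = [(p, p - 0.01 * g) for p, g in zip(params, grads)]
train_fn = theano.function([x, y_true], cost, updates=updates)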
Example #7
def build_rnnrbm(n_visible, n_hidden, n_hidden_recurrent, lr, l2_norm=None, l1_norm=None):
    # rbm params
    W = shared_normal('W', n_visible, n_hidden, scale=0.01)
    bv = shared_zeros('bv', n_visible)
    bh = shared_zeros('bh', n_hidden)

    # rnn -> rbm connections
    Wuh = shared_normal('Wuh', n_hidden_recurrent, n_hidden, scale=0.0001)
    Wuv = shared_normal('Wuv', n_hidden_recurrent, n_visible, scale=0.0001)

    params = [W, bv, bh, Wuh, Wuv]

    def get_rnn_params(number, n_visible, n_hidden_recurrent):
        w_in_update = shared_normal('w_in_update_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
        w_hidden_update = shared_normal('w_hidden_update_%d' % number, n_hidden_recurrent, n_hidden_recurrent,
                                        scale=0.0001)
        b_update = shared_zeros('b_update_%d' % number, n_hidden_recurrent)
        w_in_reset = shared_normal('w_in_reset_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
        w_hidden_reset = shared_normal('w_hidden_reset_%d' % number, n_hidden_recurrent, n_hidden_recurrent,
                                       scale=0.0001)
        b_reset = shared_zeros('b_reset_%d' % number, n_hidden_recurrent)
        w_in_hidden = shared_normal('w_in_hidden_%d' % number, n_visible, n_hidden_recurrent, scale=0.0001)
        w_reset_hidden = shared_normal('w_reset_hidden_%d' % number, n_hidden_recurrent, n_hidden_recurrent,
                                       scale=0.0001)
        b_hidden = shared_zeros('b_hidden_%d' % number, n_hidden_recurrent)
        return [w_in_update, w_hidden_update, b_update,
                w_in_reset, w_hidden_reset, b_reset,
                w_in_hidden, w_reset_hidden, b_hidden]

    def build_rnn(params, v_t, u_tm1):
        w_in_update, w_hidden_update, b_update, \
        w_in_reset, w_hidden_reset, b_reset, \
        w_in_hidden, w_reset_hidden, b_hidden = params

        update_gate = T.tanh(T.dot(v_t, w_in_update) + T.dot(u_tm1, w_hidden_update) + b_update)
        reset_gate = T.tanh(T.dot(v_t, w_in_reset) + T.dot(u_tm1, w_hidden_reset) + b_reset)
        u_t_temp = T.tanh(T.dot(v_t, w_in_hidden) + T.dot(u_tm1 * reset_gate, w_reset_hidden) + b_hidden)
        u_t = (1 - update_gate) * u_t_temp + update_gate * u_tm1

        return u_t

    # parameters for the stacked recurrent layers
    rnn_params_1 = get_rnn_params(1, n_visible, n_hidden_recurrent)
    rnn_params_2 = get_rnn_params(2, n_hidden_recurrent, n_hidden_recurrent)
    rnn_params_3 = get_rnn_params(3, n_hidden_recurrent, n_hidden_recurrent)
    params += rnn_params_1 + rnn_params_2 + rnn_params_3

    def build_rbm(v, W, bv, bh, k):
        def gibbs_step(v, binomial=False):
            mean_h = sigm(T.dot(v, W) + bh)
            h = rng.binomial(size=mean_h.shape, n=1, p=mean_h, dtype=dtype)
            mean_v = sigm(T.dot(h, W.T) + bv)
            v = rng.binomial(size=mean_v.shape, n=1, p=mean_v, dtype=theano.config.floatX) if binomial else mean_v
            return mean_v, v

        chain, updates = theano.scan(lambda v: gibbs_step(v)[1], outputs_info=[v], n_steps=k)
        v_sample = chain[-1]
        mean_v = gibbs_step(v_sample)[0]
        monitor = T.xlogx.xlogy0(v, mean_v) + T.xlogx.xlogy0(1 - v, 1 - mean_v)
        monitor = monitor.sum() / v.shape[0]

        def free_energy(v):
            return -(v * bv).sum() - T.log(1 + T.exp(T.dot(v, W) + bh)).sum()

        cost = (free_energy(v) - free_energy(v_sample)) / v.shape[0]
        return v_sample, cost, monitor, updates

    def recurrence(v_t, u1_tm1, u2_tm1):
        bv_t = bv + T.dot(u2_tm1, Wuv)
        bh_t = bh + T.dot(u2_tm1, Wuh)
        generate = v_t is None

        # generate a probability distribution for the visible units, with certain biases
        if generate:
            v_t, _, _, updates = build_rbm(T.zeros((n_visible,)), W, bv_t, bh_t, k=15)

        u1_t = build_rnn(rnn_params_1, v_t, u1_tm1)
        u2_t = build_rnn(rnn_params_2, u1_t, u2_tm1)

        return ([v_t, u1_t, u2_t], updates) if generate else [u1_t, u2_t, bv_t, bh_t]

    v = T.matrix()

    # rnn initial values
    u1_0 = T.zeros((n_hidden_recurrent,))
    u2_0 = T.zeros((n_hidden_recurrent,))

    (_, _, bv_t, bh_t), updates_train = theano.scan(
        lambda v_t, u1_tm1, u2_tm1, *_: recurrence(v_t, u1_tm1, u2_tm1), sequences=v,
        outputs_info=[u1_0, u2_0, None, None])

    v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t, bh_t, k=20)
    updates_train.update(updates_rbm)

    n_steps = T.scalar(dtype='int32')

    (v_t, _, _), updates_generate = theano.scan(
        lambda u1_tm1, u2_tm1, *_: recurrence(None, u1_tm1, u2_tm1),
        outputs_info=[None, u1_0, u2_0], n_steps=n_steps)

    # l1 and l2 regularizers
    for param in rnn_params_1 + rnn_params_2 + rnn_params_3:
        if l2_norm is not None:
            cost += T.sum(param ** 2) * l2_norm * lr
        if l1_norm is not None:
            cost += T.sum(abs(param)) * l1_norm * lr

    return (v, v_sample, cost, monitor, params, updates_train, v_t, updates_generate, n_steps)
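
The RBM code above also relies on module-level names not shown here: `sigm`, `rng`, `dtype`, and `shared_normal`; note that this example's `shared_zeros` takes `(name, *shape)` rather than the `(shape, name=...)` signature used in the earlier snippets. A sketch of plausible definitions:

from theano.tensor.shared_randomstreams import RandomStreams

dtype = theano.config.floatX
sigm = T.nnet.sigmoid
rng = RandomStreams(seed=42)   # drives the Gibbs sampling in build_rbm

def shared_normal(name, *shape, scale=0.01):
    # Gaussian-initialized shared weight matrix.
    return theano.shared((scale * np.random.randn(*shape)).astype(dtype), name=name)

def shared_zeros(name, *shape):
    return theano.shared(np.zeros(shape, dtype=dtype), name=name)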
Example #8
def model(inputs, _is_training, params, depth, batch_size, hidden_size, drop_i,
          drop_s, init_scale, init_T_bias, init_H_bias, tied_noise,
          _theano_rng):
    noise_i_for_H = get_dropout_noise((batch_size, hidden_size), drop_i,
                                      _theano_rng)
    noise_i_for_T = get_dropout_noise(
        (batch_size, hidden_size), drop_i,
        _theano_rng) if not tied_noise else noise_i_for_H

    i_for_H = ifelse(_is_training, noise_i_for_H * inputs, inputs)
    i_for_T = ifelse(_is_training, noise_i_for_T * inputs, inputs)

    i_for_H = linear(i_for_H,
                     params,
                     in_size=hidden_size,
                     out_size=hidden_size,
                     init_scale=init_scale,
                     bias_init=init_H_bias)
    i_for_T = linear(i_for_T,
                     params,
                     in_size=hidden_size,
                     out_size=hidden_size,
                     init_scale=init_scale,
                     bias_init=init_T_bias)

    # Dropout noise for recurrent hidden state.
    noise_s = get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng)
    if not tied_noise:
        noise_s = T.stack(
            noise_s,
            get_dropout_noise((batch_size, hidden_size), drop_s, _theano_rng))

    def deep_step_fn(i_for_H_t, i_for_T_t, y_tm1, noise_s):
        tanh, sigm = T.tanh, T.nnet.sigmoid
        noise_s_for_H = noise_s if tied_noise else noise_s[0]
        noise_s_for_T = noise_s if tied_noise else noise_s[1]

        s_lm1 = y_tm1
        for l in range(depth):
            s_lm1_for_H = ifelse(_is_training, s_lm1 * noise_s_for_H, s_lm1)
            s_lm1_for_T = ifelse(_is_training, s_lm1 * noise_s_for_T, s_lm1)
            if l == 0:
                # On the first micro-timestep of each timestep we already have bias
                # terms summed into i_for_H_t and into i_for_T_t.
                H = tanh(i_for_H_t + linear(s_lm1_for_H,
                                            params,
                                            in_size=hidden_size,
                                            out_size=hidden_size,
                                            init_scale=init_scale))
                Tr = sigm(i_for_T_t + linear(s_lm1_for_T,
                                             params,
                                             in_size=hidden_size,
                                             out_size=hidden_size,
                                             init_scale=init_scale))
            else:
                H = tanh(
                    linear(s_lm1_for_H,
                           params,
                           in_size=hidden_size,
                           out_size=hidden_size,
                           init_scale=init_scale,
                           bias_init=init_H_bias))
                Tr = sigm(
                    linear(s_lm1_for_T,
                           params,
                           in_size=hidden_size,
                           out_size=hidden_size,
                           init_scale=init_scale,
                           bias_init=init_T_bias))
            s_l = (H - s_lm1) * Tr + s_lm1
            s_lm1 = s_l

        y_t = s_l
        return y_t

    y_0 = shared_zeros((batch_size, hidden_size))

    y, _ = theano.scan(deep_step_fn,
                       sequences=[i_for_H, i_for_T],
                       outputs_info=[y_0],
                       non_sequences=[noise_s])

    y_last = y[-1]
    sticky_state_updates = [(y_0, y_last)]

    return y, y_0, sticky_state_updates
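
Each scan step above runs `depth` highway micro-steps; the update `s_l = (H - s_lm1) * Tr + s_lm1` is algebraically the highway combination `H * Tr + s_lm1 * (1 - Tr)`, with the carry gate tied to `1 - Tr`. An instantiation sketch with illustrative sizes, assuming the input sequence has already been projected to `hidden_size` and that `linear` appends its weights to `params`:

from theano.tensor.shared_randomstreams import RandomStreams

is_training = T.iscalar('is_training')   # 1 during training, 0 at test time
h_seq = T.tensor3('h_seq')               # (time, batch, hidden_size)
params = []                              # populated by linear(...) as layers are built
y, y_0, sticky = model(h_seq, is_training, params, depth=3, batch_size=32,
                       hidden_size=256, drop_i=0.25, drop_s=0.25,
                       init_scale=0.04, init_T_bias=-2.0, init_H_bias=0.0,
                       tied_noise=True, _theano_rng=RandomStreams(seed=1))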