biases += att_to_h1.get_biases()
    biases += att_to_h2.get_biases()
    biases += att_to_h3.get_biases()
    biases += h1_to_h2.get_biases()
    biases += h1_to_h3.get_biases()
    biases += h2_to_h3.get_biases()

    # 3 to include groundtruth, pixel RNN style
    outs_to_v_h1 = GRUFork(3, n_v_proj, random_state)
    params += outs_to_v_h1.get_params()
    biases += outs_to_v_h1.get_biases()
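    # ("3" here is presumably the dimensionality of the previous
    # ground-truth frame fed back as an input, teacher-forcing style,
    # analogous to PixelRNN conditioning on already-generated pixels.)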

    v_cell1 = GRU(n_v_proj, n_v_proj, random_state)
    params += v_cell1.get_params()

    h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
        n_hid, 3 * [att_size], random_state)
    h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
    h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
    h3_to_outs, = make_weights(n_hid, [n_proj], random_state)

    params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]
    params += [h1_to_outs, h2_to_outs, h3_to_outs]
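
    # The (a, b, k) projections above match the (alpha, beta, kappa)
    # parameterization of the Gaussian attention window in Graves (2013),
    # "Generating Sequences with Recurrent Neural Networks". A minimal
    # NumPy sketch of how such a window is typically formed; every name
    # and shape below is illustrative, none of it is from this model:
    def _graves_window_demo():
        import numpy as np
        rng = np.random.RandomState(0)
        n_hid, att_size, seq_len = 8, 3, 20
        h1 = rng.randn(n_hid)
        W_a, W_b, W_k = (rng.randn(n_hid, att_size) for _ in range(3))
        a = np.exp(h1.dot(W_a))              # component importances
        b = np.exp(h1.dot(W_b))              # component widths (precisions)
        k = np.cumsum(np.exp(h1.dot(W_k)))   # centers only move forward
        u = np.arange(seq_len)[:, None]      # positions in the conditioning text
        phi = (a * np.exp(-b * (k - u) ** 2)).sum(axis=1)  # (seq_len,) weights
        return phi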

    pred_proj, = make_weights(n_v_proj, [n_pred_proj], random_state)
    pred_b, = make_biases([n_pred_proj])

    params += [pred_proj, pred_b]
    biases += [pred_b]

    inpt = X_sym[:-1]
    target = X_sym[1:]

# Example 4
    cell1 = GRU(n_kernels, n_hid, random_state)
    cell2 = GRU(n_hid, n_hid, random_state)
    params += cell1.get_params()
    params += cell2.get_params()

    # Use GRU classes only to fork 1 inp to 2 inp:gate pairs
    att_to_h1 = GRUFork(n_chars, n_hid, random_state)
    att_to_h2 = GRUFork(n_chars, n_hid, random_state)
    h1_to_h2 = GRUFork(n_hid, n_hid, random_state)

    params += att_to_h1.get_params()
    params += att_to_h2.get_params()
    params += h1_to_h2.get_params()
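
    # (Each fork projects one input into an (input, gate) pair; the final
    # fragment in this file consumes it as, e.g.:
    #     inp_h1, inpgate_h1 = inp_to_h1.proj(inpt)
    # with the pair feeding the GRU cell's candidate and gate computations.)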

    h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
        n_hid, 3 * [att_size], random_state)
    params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]

    # Need a , on single results since it always returns a list
    # + 1 to force correct upsampling ratio
    h1_to_outs, = make_weights(n_hid, [input_dim + 1], random_state)
    h2_to_outs, = make_weights(n_hid, [input_dim + 1], random_state)
    params += [h1_to_outs, h2_to_outs]
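
    # The trailing comma noted above is ordinary tuple unpacking of a
    # one-element list. A tiny self-contained illustration; the stub is
    # hypothetical, standing in for make_weights:
    def _unpack_demo():
        def make_weights_stub(n_in, n_outs):
            # like make_weights: always returns a list, one entry per size
            return [(n_in, n_out) for n_out in n_outs]
        w, = make_weights_stub(4, [8])           # comma unpacks the single element
        w1, w2 = make_weights_stub(4, [8, 8])    # several results unpack directly
        return w, w1, w2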

    # n_kernels in output tup is an arbitrary number
    w_deconv1, = make_conv_weights(1, (n_kernels, ),
                                   (deconv_size1, input_dim + 1), random_state)
    b_deconv1, = make_biases((n_kernels, ))
    w_deconv2, = make_conv_weights(n_kernels, (n_kernels, ),
                                   (deconv_size2, input_dim + 1), random_state)
    b_deconv2, = make_biases((n_kernels, ))

# Example 5
    h2_to_h3 = GRUFork(n_hid, n_hid, random_state)

    params += inp_to_h1.get_params()
    params += inp_to_h2.get_params()
    params += inp_to_h3.get_params()
    params += att_to_h1.get_params()
    params += att_to_h2.get_params()
    params += att_to_h3.get_params()
    params += h1_to_h2.get_params()
    params += h1_to_h3.get_params()
    params += h2_to_h3.get_params()

    inp_to_v_h1 = GRUFork(1, n_v_hid, random_state)
    params += inp_to_v_h1.get_params()

    h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
        n_hid, 3 * [att_size], random_state)
    params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]

    # Need a , on single results since it always returns a list
    h1_to_outs, = make_weights(n_hid, [n_hid], random_state)
    h2_to_outs, = make_weights(n_hid, [n_hid], random_state)
    h3_to_outs, = make_weights(n_hid, [n_hid], random_state)
    params += [h1_to_outs, h2_to_outs, h3_to_outs]

    # 2 * for mag and phase
    v_outs_to_corr_outs, = make_weights(n_v_hid, [1], random_state)
    corr_outs_to_final_outs, = make_weights(n_hid, [2 * n_density],
                                            random_state)
    params += [v_outs_to_corr_outs, corr_outs_to_final_outs]
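    # (Doubling n_density gives each output density component a magnitude
    # part and a phase part, presumably for complex spectrogram frames.)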

    inpt = X_sym[:-1]

# Example 6
    params += inp_to_h1.get_params()
    params += inp_to_h2.get_params()
    params += inp_to_h3.get_params()
    params += h1_to_h2.get_params()
    params += h1_to_h3.get_params()
    params += h2_to_h3.get_params()

    biases += inp_to_h1.get_biases()
    biases += inp_to_h2.get_biases()
    biases += inp_to_h3.get_biases()
    biases += h1_to_h2.get_biases()
    biases += h1_to_h3.get_biases()
    biases += h2_to_h3.get_biases()

    h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
    h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
    h3_to_outs, = make_weights(n_hid, [n_proj], random_state)
    b_to_outs, = make_biases([n_proj])

    params += [h1_to_outs, h2_to_outs, h3_to_outs]
    biases += [b_to_outs]

    mlp1_w, = make_weights(n_inpt_mlp, [n_hid_mlp], random_state)
    mlp2_w, mlp3_w = make_weights(n_hid_mlp, 2 * [n_hid_mlp], random_state)
    pred_w, = make_weights(n_hid_mlp, [n_bins], random_state)

    mlp1_b, mlp2_b, mlp3_b = make_biases(3 * [n_hid_mlp])
    pred_b, = make_biases([n_bins])

    params += [mlp1_w, mlp1_b]

# Example 7
    init_w = tensor.matrix("init_w")
    init_w.tag.test_value = np_zeros((minibatch_size, n_chars))

    params = []
    biases = []

    cell1 = GRU(input_dim, n_hid, random_state)
    cell2 = GRU(n_hid, n_hid, random_state)
    cell3 = GRU(n_hid, n_hid, random_state)

    params += cell1.get_params()
    params += cell2.get_params()
    params += cell3.get_params()

    # Use GRU classes only to fork 1 inp to 2 inp:gate pairs
    inp_proj, = make_weights(input_dim, [n_hid], random_state)
    inp_b, = make_biases([n_hid])

    params += [inp_proj, inp_b]
    biases += [inp_b]

    inp_to_h1 = GRUFork(n_hid, n_hid, random_state)
    inp_to_h2 = GRUFork(n_hid, n_hid, random_state)
    inp_to_h3 = GRUFork(n_hid, n_hid, random_state)
    att_to_h1 = GRUFork(n_chars, n_hid, random_state)
    att_to_h2 = GRUFork(n_chars, n_hid, random_state)
    att_to_h3 = GRUFork(n_chars, n_hid, random_state)
    h1_to_h2 = GRUFork(n_hid, n_hid, random_state)
    h1_to_h3 = GRUFork(n_hid, n_hid, random_state)
    h2_to_h3 = GRUFork(n_hid, n_hid, random_state)

    n_conv1 = 128
    k_conv1 = (1, 1)
    k_conv1_hid = (1, 3)

    conv1_w, = make_conv_weights(1, [n_conv1,], k_conv1, random_state)
    conv1_b, = make_biases([n_conv1,])
    params += [conv1_w, conv1_b]
    biases += [conv1_b]

    # Might become 3* for GRU or 4* for LSTM
    conv1_hid, = make_conv_weights(n_conv1, [n_conv1,], k_conv1_hid, random_state)
    params += [conv1_hid]
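    # (Gated recurrent variants need one weight block per gate: a GRU uses
    # 3 blocks (reset, update, candidate) and an LSTM uses 4 (input, forget,
    # cell, output), hence the "3*" / "4*" note above.)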

    pred_w, = make_weights(n_conv1, [n_bins,], init="fan",
                           random_state=random_state)
    pred_b, = make_biases([n_bins])
    params += [pred_w, pred_b]
    biases += [pred_b]

    theano.printing.Print("X_sym.shape")(X_sym.shape)
    # add channel dim
    im = X_sym.dimshuffle(1, 'x', 0, 2)
    target = im
    shp = im.shape
    # careful shift to avoid leakage
    conv1 = conv2d(im, conv1_w, conv1_b, border_mode=(0, k_conv1[1]))
    theano.printing.Print("conv1.shape")(conv1.shape)
    conv1 = conv1[:, :, :, :shp[3]]
    theano.printing.Print("conv1.shape")(conv1.shape)
    r_conv1 = conv1.dimshuffle(2, 1, 0, 3)
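
# The "careful shift to avoid leakage" above pads the width axis by the
# kernel size and then slices back to the original width, so each output
# column only sees strictly earlier input columns. A 1-D NumPy sketch of
# the same idea (toy numbers, not from this model):
def _causal_shift_demo():
    import numpy as np
    x = np.arange(6, dtype=float)
    kern = np.array([1.0, 1.0])                       # width-2 kernel
    x_pad = np.concatenate([np.zeros(len(kern)), x])  # left-pad by kernel width
    out = np.convolve(x_pad, kern, mode="valid")[:len(x)]
    # out[t] mixes x[t-2] and x[t-1] only; x[t] itself is excluded, the
    # same strict shift the border_mode padding + slice above produces.
    return out
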
def build_lstm_softmax_rbm(n_classes, n_visible, n_hidden, n_hidden_recurrent):
    '''
    Construct a symbolic LSTM-RBM and initialize parameters.

    n_classes : integer
      Number of classes
    n_visible : integer
      Number of visible units.
    n_hidden : integer
      Number of hidden units of the conditional RBMs.
    n_hidden_recurrent : integer
      Number of hidden units of the RNN.

    Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
              updates_generate) tuple:

    v : Theano matrix
      Symbolic variable holding an input sequence (used during training)
    v_sample : Theano matrix
      Symbolic variable holding the negative particles for CD log-likelihood
      gradient estimation (used during training)
    cost : Theano scalar
      Expression whose gradient (considering v_sample constant) corresponds to
      the LL gradient of the RNN-RBM (used during training)
    monitor : Theano scalar
      Frame-level pseudo-likelihood (useful for monitoring during training)
    params : tuple of Theano shared variables
      The parameters of the model to be optimized during training.
    updates_train : dictionary of Theano variable -> Theano variable
      Update object that should be passed to theano.function when compiling the
      training function.
    v_t : Theano matrix
      Symbolic variable holding a generated sequence (used during sampling)
    updates_generate : dictionary of Theano variable -> Theano variable
      Update object that should be passed to theano.function when compiling the
      generation function.
    '''
    random_state = np.random.RandomState(1999)
    W, = make_weights(n_visible, [n_hidden],
                      random_state,
                      init="normal",
                      scale=0.01)
    bv, bh = make_biases([n_visible, n_hidden])

    scale = 0.0001
    Wuh, Wuv = make_weights(n_hidden_recurrent, [n_hidden, n_visible],
                            random_state,
                            init="normal",
                            scale=scale)
    Wvu, = make_weights(n_visible, [n_hidden_recurrent],
                        random_state,
                        init="normal",
                        scale=scale)

    Wuu, Wui, Wqi, Wci, Wuf, Wqf, Wcf, Wuc, Wqc, Wuo, Wqo, Wco = make_weights(
        n_hidden_recurrent, [n_hidden_recurrent] * 12,
        random_state,
        init="normal",
        scale=scale)
    Wqv, Wqh = make_weights(n_hidden_recurrent, [n_visible, n_hidden],
                            random_state,
                            init="normal",
                            scale=scale)
    bu, bi, bf, bc, bo = make_biases([n_hidden_recurrent] * 5)

    params = [
        W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu, Wui, Wqi, Wci, bi, Wuf, Wqf, Wcf,
        bf, Wuc, Wqc, bc, Wuo, Wqo, Wco, bo, Wqv, Wqh
    ]
    # learned parameters as shared
    # variables

    v = tensor.tensor3()  # a training sequence
    v.tag.test_value = dataset[0][:100]
    u0 = tensor.zeros((n_hidden_recurrent, ))  # initial value for the RNN
    # hidden units
    q0 = tensor.zeros((n_hidden_recurrent, ))
    c0 = tensor.zeros((n_hidden_recurrent, ))

    # If `v_t` is given, deterministic recurrence to compute the variable
    # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
    # but with a separate Gibbs chain at each time step to sample (generate)
    # from the RNN-RBM. The resulting sample v_t is returned in order to be
    # passed down to the sequence history.
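    #
    # In equation form the recurrence computes (note that the gates here
    # use tanh rather than the conventional logistic sigmoid):
    #   bv_t = bv + u_{t-1} Wuv + q_{t-1} Wqv
    #   bh_t = bh + u_{t-1} Wuh + q_{t-1} Wqh
    #   u_t  = tanh(bu + v_t Wvu + u_{t-1} Wuu)
    #   i_t  = tanh(bi + c_{t-1} Wci + q_{t-1} Wqi + u_t Wui)
    #   f_t  = tanh(bf + c_{t-1} Wcf + q_{t-1} Wqf + u_t Wuf)
    #   c_t  = f_t * c_{t-1} + i_t * tanh(u_t Wuc + q_{t-1} Wqc + bc)
    #   o_t  = tanh(bo + c_t Wco + q_{t-1} Wqo + u_t Wuo)
    #   q_t  = o_t * tanh(c_t)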
    def recurrence(v_t, u_tm1, q_tm1, c_tm1):
        bv_t = bv + u_tm1.dot(Wuv) + q_tm1.dot(Wqv)
        bh_t = bh + u_tm1.dot(Wuh) + q_tm1.dot(Wqh)
        generate = v_t is None
        if generate:
            v_t, _, _, updates = build_rbm(tensor.zeros((n_visible, )),
                                           W,
                                           bv_t,
                                           bh_t,
                                           k=25)
        u_t = tensor.tanh(bu + v_t.dot(Wvu) + u_tm1.dot(Wuu))

        i_t = tensor.tanh(bi + c_tm1.dot(Wci) + q_tm1.dot(Wqi) + u_t.dot(Wui))
        f_t = tensor.tanh(bf + c_tm1.dot(Wcf) + q_tm1.dot(Wqf) + u_t.dot(Wuf))
        c_t = (f_t * c_tm1) + (i_t *
                               tensor.tanh(u_t.dot(Wuc) + q_tm1.dot(Wqc) + bc))
        o_t = tensor.tanh(bo + c_t.dot(Wco) + q_tm1.dot(Wqo) + u_t.dot(Wuo))
        q_t = o_t * tensor.tanh(c_t)
        if generate:
            return ([v_t, u_t, q_t, c_t], updates)
        else:
            return [u_t, q_t, c_t, bv_t, bh_t]

    # For training, the deterministic recurrence is used to compute all the
    # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
    # in batches using those parameters.

    (u_t, q_t, c_t, bv_t,
     bh_t), updates_train = theano.scan(lambda v_t, u_tm1, q_tm1, c_tm1, *_:
                                        recurrence(v_t, u_tm1, q_tm1, c_tm1),
                                        sequences=v,
                                        outputs_info=[u0, q0, c0, None, None],
                                        non_sequences=params)
    v_sample, cost, monitor, updates_rbm = build_rbm(v,
                                                     W,
                                                     bv_t[:],
                                                     bh_t[:],
                                                     k=15)
    updates_train.update(updates_rbm)

    # symbolic loop for sequence generation
    (v_t, u_t, q_t, c_t), updates_generate = theano.scan(
        lambda u_tm1, q_tm1, c_tm1, *_: recurrence(None, u_tm1, q_tm1, c_tm1),
        outputs_info=[None, u0, q0, c0],
        non_sequences=params,
        n_steps=200)

    return (v, v_sample, cost, monitor, params, updates_train, v_t,
            updates_generate)
    biases += inp_to_h3.get_biases()
    biases += att_to_h1.get_biases()
    biases += att_to_h2.get_biases()
    biases += att_to_h3.get_biases()
    biases += h1_to_h2.get_biases()
    biases += h1_to_h3.get_biases()
    biases += h2_to_h3.get_biases()

    outs_to_v_h1 = GRUFork(1, n_v_proj, random_state)
    params += outs_to_v_h1.get_params()
    biases += outs_to_v_h1.get_biases()

    v_cell1 = GRU(n_v_proj, n_v_proj, random_state)
    params += v_cell1.get_params()

    h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(n_hid, 3 * [att_size],
                                                         random_state)
    h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
    h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
    h3_to_outs, = make_weights(n_hid, [n_proj], random_state)

    params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]
    params += [h1_to_outs, h2_to_outs, h3_to_outs]

    # Not used
    l1_proj, l2_proj = make_weights(n_proj, [n_proj, n_proj], random_state,
                                    init="fan")
    l1_b, l2_b = make_biases([n_proj, n_proj])
    #params += [l1_proj, l1_b, l2_proj, l2_b]

    pred_proj, = make_weights(n_proj * n_v_proj, [n_pred_proj], random_state)
    pred_b, = make_biases([n_pred_proj])
    att_to_h3 = GRUFork(n_chars, n_hid, random_state)
    h1_to_h2 = GRUFork(n_hid, n_hid, random_state)
    h1_to_h3 = GRUFork(n_hid, n_hid, random_state)
    h2_to_h3 = GRUFork(n_hid, n_hid, random_state)

    params += inp_to_h1.get_params()
    params += inp_to_h2.get_params()
    params += inp_to_h3.get_params()
    params += att_to_h1.get_params()
    params += att_to_h2.get_params()
    params += att_to_h3.get_params()
    params += h1_to_h2.get_params()
    params += h1_to_h3.get_params()
    params += h2_to_h3.get_params()

    h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
        n_hid, 3 * [att_size], random_state)
    params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]

    # Need a , on single results since it always returns a list
    h1_to_outs, = make_weights(n_hid, [n_hid], random_state)
    h2_to_outs, = make_weights(n_hid, [n_hid], random_state)
    h3_to_outs, = make_weights(n_hid, [n_hid], random_state)
    params += [h1_to_outs, h2_to_outs, h3_to_outs]

    l1_proj, = make_weights(n_hid, [n_proj], random_state)
    b_l1_proj, = make_biases([n_proj])
    params += [l1_proj, b_l1_proj]

    l2_proj, = make_weights(n_proj, [n_proj], random_state)
    b_l2_proj, = make_biases([n_proj])
    params += [l2_proj, b_l2_proj]

# Example 15
def build_lstmrbm(n_visible, n_hidden, n_hidden_recurrent):
    '''
    Construct a symbolic LSTM-RBM and initialize parameters.

    n_visible : integer
      Number of visible units.
    n_hidden : integer
      Number of hidden units of the conditional RBMs.
    n_hidden_recurrent : integer
      Number of hidden units of the RNN.

    Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
              updates_generate) tuple:

    v : Theano matrix
      Symbolic variable holding an input sequence (used during training)
    v_sample : Theano matrix
      Symbolic variable holding the negative particles for CD log-likelihood
      gradient estimation (used during training)
    cost : Theano scalar
      Expression whose gradient (considering v_sample constant) corresponds to
      the LL gradient of the RNN-RBM (used during training)
    monitor : Theano scalar
      Frame-level pseudo-likelihood (useful for monitoring during training)
    params : tuple of Theano shared variables
      The parameters of the model to be optimized during training.
    updates_train : dictionary of Theano variable -> Theano variable
      Update object that should be passed to theano.function when compiling the
      training function.
    v_t : Theano matrix
      Symbolic variable holding a generated sequence (used during sampling)
    updates_generate : dictionary of Theano variable -> Theano variable
      Update object that should be passed to theano.function when compiling the
      generation function.
    '''
    random_state = np.random.RandomState(1999)
    W, = make_weights(n_visible, [n_hidden], random_state, init="normal",
                      scale=0.01)
    bv, bh = make_biases([n_visible, n_hidden])

    scale = 0.0001
    Wuh, Wuv = make_weights(n_hidden_recurrent, [n_hidden, n_visible],
                            random_state, init="normal", scale=scale)
    Wvu, = make_weights(n_visible, [n_hidden_recurrent,], random_state,
                        init="normal", scale=scale)

    Wuu, Wui, Wqi, Wci, Wuf, Wqf, Wcf, Wuc, Wqc, Wuo, Wqo, Wco = make_weights(
        n_hidden_recurrent, [n_hidden_recurrent] * 12, random_state,
        init="normal", scale=scale)
    Wqv, Wqh = make_weights(n_hidden_recurrent, [n_visible, n_hidden],
                            random_state, init="normal", scale=scale)
    bu, bi, bf, bc, bo = make_biases([n_hidden_recurrent] * 5)

    params = [W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu, Wui, Wqi, Wci, bi,
              Wuf, Wqf, Wcf, bf, Wuc, Wqc, bc, Wuo, Wqo, Wco, bo, Wqv, Wqh]
    # learned parameters as shared
    # variables

    v = tensor.matrix()  # a training sequence
    u0 = tensor.zeros((n_hidden_recurrent,))  # initial value for the RNN
                                              # hidden units
    q0 = tensor.zeros((n_hidden_recurrent,))
    c0 = tensor.zeros((n_hidden_recurrent,))


    # If `v_t` is given, deterministic recurrence to compute the variable
    # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
    # but with a separate Gibbs chain at each time step to sample (generate)
    # from the RNN-RBM. The resulting sample v_t is returned in order to be
    # passed down to the sequence history.
    def recurrence(v_t, u_tm1, q_tm1, c_tm1):
        bv_t = bv + u_tm1.dot(Wuv) + q_tm1.dot(Wqv)
        bh_t = bh + u_tm1.dot(Wuh) + q_tm1.dot(Wqh)
        generate = v_t is None
        if generate:
            v_t, _, _, updates = build_rbm(tensor.zeros((n_visible,)), W, bv_t,
                                           bh_t, k=25)
        u_t = tensor.tanh(bu + v_t.dot(Wvu) + u_tm1.dot(Wuu))

        i_t = tensor.tanh(bi + c_tm1.dot(Wci) + q_tm1.dot(Wqi) + u_t.dot(Wui))
        f_t = tensor.tanh(bf + c_tm1.dot(Wcf) + q_tm1.dot(Wqf) + u_t.dot(Wuf))
        c_t = (f_t * c_tm1) + (i_t * tensor.tanh(u_t.dot(Wuc) + q_tm1.dot(Wqc) + bc))
        o_t = tensor.tanh(bo + c_t.dot(Wco) + q_tm1.dot(Wqo) + u_t.dot(Wuo))
        q_t = o_t * tensor.tanh(c_t)
        if generate:
            return ([v_t, u_t, q_t, c_t], updates)
        else:
            return [u_t, q_t, c_t, bv_t, bh_t]

    # For training, the deterministic recurrence is used to compute all the
    # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
    # in batches using those parameters.

    (u_t, q_t, c_t, bv_t, bh_t), updates_train = theano.scan(
        lambda v_t, u_tm1, q_tm1, c_tm1, *_: recurrence(v_t, u_tm1, q_tm1, c_tm1),
        sequences=v, outputs_info=[u0, q0, c0, None, None], non_sequences=params)
    v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:],
                                                     k=15)
    updates_train.update(updates_rbm)

    # symbolic loop for sequence generation
    (v_t, u_t, q_t, c_t), updates_generate = theano.scan(
        lambda u_tm1, q_tm1, c_tm1, *_: recurrence(None, u_tm1, q_tm1, c_tm1),
        outputs_info=[None, u0, q0, c0], non_sequences=params, n_steps=200)

    return (v, v_sample, cost, monitor, params, updates_train, v_t,
            updates_generate)
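
# A minimal usage sketch for build_lstmrbm, assuming this module's other
# helpers (make_weights, make_biases, build_rbm) are available and noting
# that parameter updates (e.g. via tensor.grad on `cost`, holding v_sample
# constant) still have to be added, as in the Theano RNN-RBM tutorial this
# code follows; nothing below appears in the original:
def _lstmrbm_usage_demo():
    (v, v_sample, cost, monitor, params, updates_train, v_t,
     updates_generate) = build_lstmrbm(n_visible=88, n_hidden=150,
                                       n_hidden_recurrent=100)
    # monitoring pass over one training sequence (no parameter updates yet)
    monitor_fn = theano.function([v], monitor, updates=updates_train)
    # draw a 200-step sample from the model
    generate_fn = theano.function([], v_t, updates=updates_generate)
    return monitor_fn, generate_fn
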
    att_to_h3 = GRUFork(n_chars, n_hid, random_state)
    h1_to_h2 = GRUFork(n_hid, n_hid, random_state)
    h1_to_h3 = GRUFork(n_hid, n_hid, random_state)
    h2_to_h3 = GRUFork(n_hid, n_hid, random_state)

    params += inp_to_h1.get_params()
    params += inp_to_h2.get_params()
    params += inp_to_h3.get_params()
    params += att_to_h1.get_params()
    params += att_to_h2.get_params()
    params += att_to_h3.get_params()
    params += h1_to_h2.get_params()
    params += h1_to_h3.get_params()
    params += h2_to_h3.get_params()

    h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(n_hid, 3 * [att_size],
                                                         random_state)
    h1_to_outs, = make_weights(n_hid, [n_out], random_state)
    h2_to_outs, = make_weights(n_hid, [n_out], random_state)
    h3_to_outs, = make_weights(n_hid, [n_out], random_state)

    params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]
    params += [h1_to_outs, h2_to_outs, h3_to_outs]

    inpt = X_sym[:-1]
    target = X_sym[1:]
    mask = X_mask_sym[1:]
    context = c_sym * c_mask_sym.dimshuffle(0, 1, 'x')

    inp_h1, inpgate_h1 = inp_to_h1.proj(inpt)
    inp_h2, inpgate_h2 = inp_to_h2.proj(inpt)
    inp_h3, inpgate_h3 = inp_to_h3.proj(inpt)