biases += outs_to_v_h1.get_biases()

v_cell1 = GRU(n_v_proj, n_v_proj, random_state)
params += v_cell1.get_params()

h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
    n_hid, 3 * [att_size], random_state)
h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
h3_to_outs, = make_weights(n_hid, [n_proj], random_state)
params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]
params += [h1_to_outs, h2_to_outs, h3_to_outs]

pred_proj, = make_weights(n_v_proj, [n_pred_proj], random_state)
pred_b, = make_biases([n_pred_proj])
params += [pred_proj, pred_b]
biases += [pred_b]

inpt = X_sym[:-1]
target = X_sym[1:]
mask = X_mask_sym[1:]
context = c_sym * c_mask_sym.dimshuffle(0, 1, 'x')

inp_h1, inpgate_h1 = inp_to_h1.proj(inpt)
inp_h2, inpgate_h2 = inp_to_h2.proj(inpt)
inp_h3, inpgate_h3 = inp_to_h3.proj(inpt)

u = tensor.arange(c_sym.shape[0]).dimshuffle('x', 'x', 0)
u = tensor.cast(u, theano.config.floatX)
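# A hedged sketch (an assumption, not taken from this file) of one common way
# the att_a / att_b / att_k projections and the position index `u` above are
# combined: a Graves-style Gaussian window over the character context.
def attention_sketch(h1_t, k_tm1):
    # h1_t: (minibatch, n_hid), k_tm1: (minibatch, att_size)
    a_t = tensor.exp(h1_t.dot(h1_to_att_a))           # mixture weights
    b_t = tensor.exp(h1_t.dot(h1_to_att_b))           # window widths
    k_t = k_tm1 + tensor.exp(h1_t.dot(h1_to_att_k))   # positions move forward
    # Gaussian window over context positions, summed across att_size mixtures
    phi = (a_t.dimshuffle(0, 1, 'x') *
           tensor.exp(-b_t.dimshuffle(0, 1, 'x') *
                      (k_t.dimshuffle(0, 1, 'x') - u) ** 2)).sum(axis=1)
    # soft-select characters from context: (minibatch, n_chars)
    w_t = (phi.dimshuffle(1, 0, 'x') * context).sum(axis=0)
    return w_t, k_t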
init_h2 = tensor.matrix("init_h2")
init_h2.tag.test_value = np_zeros((minibatch_size, n_hid))
init_h3 = tensor.matrix("init_h3")
init_h3.tag.test_value = np_zeros((minibatch_size, n_hid))
init_kappa = tensor.matrix("init_kappa")
init_kappa.tag.test_value = np_zeros((minibatch_size, att_size))
init_w = tensor.matrix("init_w")
init_w.tag.test_value = np_zeros((minibatch_size, n_chars))

params = []

w_conv1, = make_conv_weights(1, (n_kernels,), (conv_size1, input_dim),
                             random_state)
b_conv1, = make_biases((n_kernels,))
w_conv2, = make_conv_weights(n_kernels, (n_kernels,), (conv_size2, 1),
                             random_state)
b_conv2, = make_biases((n_kernels,))
params += [w_conv1, b_conv1, w_conv2, b_conv2]

# Use GRU classes only to fork 1 inp to 2 inp:gate pairs
conv_to_h1 = GRUFork(n_kernels, n_hid, random_state)
conv_to_h2 = GRUFork(n_kernels, n_hid, random_state)
params += conv_to_h1.get_params()
params += conv_to_h2.get_params()

cell1 = GRU(n_kernels, n_hid, random_state)
cell2 = GRU(n_hid, n_hid, random_state)
params += cell1.get_params()
params += cell2.get_params()
init_w.tag.test_value = np_zeros((minibatch_size, n_chars))

params = []
biases = []

cell1 = GRU(input_dim, n_hid, random_state)
cell2 = GRU(n_hid, n_hid, random_state)
cell3 = GRU(n_hid, n_hid, random_state)
params += cell1.get_params()
params += cell2.get_params()
params += cell3.get_params()

inp_proj, = make_weights(input_dim, [n_hid], random_state)
inp_b, = make_biases([n_hid])
params += [inp_proj, inp_b]
biases += [inp_b]

# Use GRU classes only to fork 1 inp to 2 inp:gate pairs
inp_to_h1 = GRUFork(n_hid, n_hid, random_state)
inp_to_h2 = GRUFork(n_hid, n_hid, random_state)
inp_to_h3 = GRUFork(n_hid, n_hid, random_state)
att_to_h1 = GRUFork(n_chars, n_hid, random_state)
att_to_h2 = GRUFork(n_chars, n_hid, random_state)
att_to_h3 = GRUFork(n_chars, n_hid, random_state)
h1_to_h2 = GRUFork(n_hid, n_hid, random_state)
h1_to_h3 = GRUFork(n_hid, n_hid, random_state)
h2_to_h3 = GRUFork(n_hid, n_hid, random_state)
params += inp_to_h1.get_params()
def build_lstm_softmax_rbm(n_classes, n_visible, n_hidden, n_hidden_recurrent):
    '''
    Construct a symbolic RNN-RBM and initialize parameters.

    n_classes : integer
        Number of classes
    n_visible : integer
        Number of visible units.
    n_hidden : integer
        Number of hidden units of the conditional RBMs.
    n_hidden_recurrent : integer
        Number of hidden units of the RNN.

    Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
    updates_generate) tuple:

    v : Theano matrix
        Symbolic variable holding an input sequence (used during training)
    v_sample : Theano matrix
        Symbolic variable holding the negative particles for CD log-likelihood
        gradient estimation (used during training)
    cost : Theano scalar
        Expression whose gradient (considering v_sample constant) corresponds
        to the LL gradient of the RNN-RBM (used during training)
    monitor : Theano scalar
        Frame-level pseudo-likelihood (useful for monitoring during training)
    params : tuple of Theano shared variables
        The parameters of the model to be optimized during training.
    updates_train : dictionary of Theano variable -> Theano variable
        Update object that should be passed to theano.function when compiling
        the training function.
    v_t : Theano matrix
        Symbolic variable holding a generated sequence (used during sampling)
    updates_generate : dictionary of Theano variable -> Theano variable
        Update object that should be passed to theano.function when compiling
        the generation function.
    '''
    random_state = np.random.RandomState(1999)
    W, = make_weights(n_visible, [n_hidden], random_state, init="normal",
                      scale=0.01)
    bv, bh = make_biases([n_visible, n_hidden])

    scale = 0.0001
    Wuh, Wuv = make_weights(n_hidden_recurrent, [n_hidden, n_visible],
                            random_state, init="normal", scale=scale)
    Wvu, = make_weights(n_visible, [n_hidden_recurrent],
                        random_state, init="normal", scale=scale)
    Wuu, Wui, Wqi, Wci, Wuf, Wqf, Wcf, Wuc, Wqc, Wuo, Wqo, Wco = make_weights(
        n_hidden_recurrent, [n_hidden_recurrent] * 12, random_state,
        init="normal", scale=scale)
    Wqv, Wqh = make_weights(n_hidden_recurrent, [n_visible, n_hidden],
                            random_state, init="normal", scale=scale)
    bu, bi, bf, bc, bo = make_biases([n_hidden_recurrent] * 5)

    # learned parameters as shared variables
    params = [W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu, Wui, Wqi, Wci, bi,
              Wuf, Wqf, Wcf, bf, Wuc, Wqc, bc, Wuo, Wqo, Wco, bo, Wqv, Wqh]

    v = tensor.tensor3()  # a training sequence
    v.tag.test_value = dataset[0][:100]
    # initial values for the RNN hidden units
    u0 = tensor.zeros((n_hidden_recurrent,))
    q0 = tensor.zeros((n_hidden_recurrent,))
    c0 = tensor.zeros((n_hidden_recurrent,))

    # If `v_t` is given, deterministic recurrence to compute the variable
    # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
    # but with a separate Gibbs chain at each time step to sample (generate)
    # from the RNN-RBM. The resulting sample v_t is returned in order to be
    # passed down to the sequence history.
    def recurrence(v_t, u_tm1, q_tm1, c_tm1):
        bv_t = bv + u_tm1.dot(Wuv) + q_tm1.dot(Wqv)
        bh_t = bh + u_tm1.dot(Wuh) + q_tm1.dot(Wqh)
        generate = v_t is None
        if generate:
            v_t, _, _, updates = build_rbm(tensor.zeros((n_visible,)), W,
                                           bv_t, bh_t, k=25)
        u_t = tensor.tanh(bu + v_t.dot(Wvu) + u_tm1.dot(Wuu))
        i_t = tensor.tanh(bi + c_tm1.dot(Wci) + q_tm1.dot(Wqi) + u_t.dot(Wui))
        f_t = tensor.tanh(bf + c_tm1.dot(Wcf) + q_tm1.dot(Wqf) + u_t.dot(Wuf))
        c_t = (f_t * c_tm1) + (i_t * tensor.tanh(u_t.dot(Wuc) +
                                                 q_tm1.dot(Wqc) + bc))
        o_t = tensor.tanh(bo + c_t.dot(Wco) + q_tm1.dot(Wqo) + u_t.dot(Wuo))
        q_t = o_t * tensor.tanh(c_t)
        if generate:
            return ([v_t, u_t, q_t, c_t], updates)
        else:
            return [u_t, q_t, c_t, bv_t, bh_t]

    # For training, the deterministic recurrence is used to compute all the
    # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
    # in batches using those parameters.
    (u_t, q_t, c_t, bv_t, bh_t), updates_train = theano.scan(
        lambda v_t, u_tm1, q_tm1, c_tm1, *_: recurrence(v_t, u_tm1, q_tm1,
                                                        c_tm1),
        sequences=v, outputs_info=[u0, q0, c0, None, None],
        non_sequences=params)
    v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:],
                                                     k=15)
    updates_train.update(updates_rbm)

    # symbolic loop for sequence generation
    (v_t, u_t, q_t, c_t), updates_generate = theano.scan(
        lambda u_tm1, q_tm1, c_tm1, *_: recurrence(None, u_tm1, q_tm1, c_tm1),
        outputs_info=[None, u0, q0, c0], non_sequences=params, n_steps=200)

    return (v, v_sample, cost, monitor, params, updates_train, v_t,
            updates_generate)
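# A minimal usage sketch (an assumption, not part of the original file):
# compile training and generation functions from the tuple returned above,
# with plain SGD on `params`. The size arguments are example values only.
(v, v_sample, cost, monitor, params, updates_train, v_t,
 updates_generate) = build_lstm_softmax_rbm(n_classes=2, n_visible=88,
                                            n_hidden=150,
                                            n_hidden_recurrent=100)
gradients = tensor.grad(cost, params, consider_constant=[v_sample])
learning_rate = 0.001
updates_train.update({p: p - learning_rate * g
                      for p, g in zip(params, gradients)})
train_function = theano.function([v], monitor, updates=updates_train)
generate_function = theano.function([], v_t, updates=updates_generate)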
params += inp_to_h3.get_params()
params += h1_to_h2.get_params()
params += h1_to_h3.get_params()
params += h2_to_h3.get_params()

biases += inp_to_h1.get_biases()
biases += inp_to_h2.get_biases()
biases += inp_to_h3.get_biases()
biases += h1_to_h2.get_biases()
biases += h1_to_h3.get_biases()
biases += h2_to_h3.get_biases()

h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
h3_to_outs, = make_weights(n_hid, [n_proj], random_state)
b_to_outs, = make_biases([n_proj])
params += [h1_to_outs, h2_to_outs, h3_to_outs]
biases += [b_to_outs]

mlp1_w, = make_weights(n_inpt_mlp, [n_hid_mlp], random_state)
mlp2_w, mlp3_w = make_weights(n_hid_mlp, 2 * [n_hid_mlp], random_state)
pred_w, = make_weights(n_hid_mlp, [n_bins], random_state)
mlp1_b, mlp2_b, mlp3_b = make_biases(3 * [n_hid_mlp])
pred_b, = make_biases([n_bins])
params += [mlp1_w, mlp1_b]
params += [mlp2_w, mlp2_b]
params += [mlp3_w, mlp3_b]
params += [pred_w, pred_b]
X_sym = tensor.tensor3("X_sym")
X_sym.tag.test_value = X_mb
X_mask_sym = tensor.matrix("X_mask_sym")
X_mask_sym.tag.test_value = X_mb_mask

params = []
biases = []

n_conv1 = 128
k_conv1 = (1, 1)
k_conv1_hid = (1, 3)

conv1_w, = make_conv_weights(1, [n_conv1], k_conv1, random_state)
conv1_b, = make_biases([n_conv1])
params += [conv1_w, conv1_b]
biases += [conv1_b]

# Might become 3* for GRU or 4* for LSTM
conv1_hid, = make_conv_weights(n_conv1, [n_conv1], k_conv1_hid, random_state)
params += [conv1_hid]

pred_w, = make_weights(n_conv1, [n_bins], init="fan",
                       random_state=random_state)
pred_b, = make_biases([n_bins])
params += [pred_w, pred_b]
biases += [pred_b]

theano.printing.Print("X_sym.shape")(X_sym.shape)
# add channel dim
v_cell1 = GRU(n_v_proj, n_v_proj, random_state)
params += v_cell1.get_params()

h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
    n_hid, 3 * [att_size], random_state)
h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
h3_to_outs, = make_weights(n_hid, [n_proj], random_state)
params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]
params += [h1_to_outs, h2_to_outs, h3_to_outs]

# Not used
l1_proj, l2_proj = make_weights(n_proj, [n_proj, n_proj], random_state,
                                init="fan")
l1_b, l2_b = make_biases([n_proj, n_proj])
# params += [l1_proj, l1_b, l2_proj, l2_b]

pred_proj, = make_weights(n_proj * n_v_proj, [n_pred_proj], random_state)
pred_b, = make_biases([n_pred_proj])
params += [pred_proj, pred_b]
biases += [pred_b]

inpt = X_sym[:-1]
target = X_sym[1:]
mask = X_mask_sym[1:]
context = c_sym * c_mask_sym.dimshuffle(0, 1, 'x')

inp_h1, inpgate_h1 = inp_to_h1.proj(inpt)
inp_h2, inpgate_h2 = inp_to_h2.proj(inpt)
params += h1_to_h2.get_params()
params += h1_to_h3.get_params()
params += h2_to_h3.get_params()

h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
    n_hid, 3 * [att_size], random_state)
params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]

# Need a , on single results since it always returns a list
h1_to_outs, = make_weights(n_hid, [n_hid], random_state)
h2_to_outs, = make_weights(n_hid, [n_hid], random_state)
h3_to_outs, = make_weights(n_hid, [n_hid], random_state)
params += [h1_to_outs, h2_to_outs, h3_to_outs]

l1_proj, = make_weights(n_hid, [n_proj], random_state)
b_l1_proj, = make_biases([n_proj])
params += [l1_proj, b_l1_proj]
l2_proj, = make_weights(n_proj, [n_proj], random_state)
b_l2_proj, = make_biases([n_proj])
params += [l2_proj, b_l2_proj]
l3_proj, = make_weights(n_proj, [n_proj], random_state)
b_l3_proj, = make_biases([n_proj])
params += [l3_proj, b_l3_proj]

softmax_proj, = make_weights(n_proj, [n_out], random_state)
b_softmax_proj, = make_biases([n_out])
params += [softmax_proj, b_softmax_proj]

inpt = X_sym[:-1]
biases += h1_to_h3.get_biases()
biases += h2_to_h3.get_biases()

h1_to_att_a, h1_to_att_b, h1_to_att_k = make_weights(
    n_hid, 3 * [att_size], random_state)
h1_to_outs, = make_weights(n_hid, [n_proj], random_state)
h2_to_outs, = make_weights(n_hid, [n_proj], random_state)
h3_to_outs, = make_weights(n_hid, [n_proj], random_state)
params += [h1_to_att_a, h1_to_att_b, h1_to_att_k]
params += [h1_to_outs, h2_to_outs, h3_to_outs]

# Not used
l1_proj, l2_proj = make_weights(n_proj, [n_proj, n_proj], random_state,
                                init="fan")
l1_b, l2_b = make_biases([n_proj, n_proj])
# params += [l1_proj, l1_b, l2_proj, l2_b]

softmax_proj, = make_weights(n_proj, [n_softmax_proj], random_state)
softmax_b, = make_biases([n_softmax_proj])
params += [softmax_proj, softmax_b]
biases += [softmax_b]

inpt = X_sym[:-1]
target = X_sym[1:]
mask = X_mask_sym[1:]
context = c_sym * c_mask_sym.dimshuffle(0, 1, 'x')

inp_h1, inpgate_h1 = inp_to_h1.proj(inpt)
inp_h2, inpgate_h2 = inp_to_h2.proj(inpt)
def build_lstmrbm(n_visible, n_hidden, n_hidden_recurrent):
    '''
    Construct a symbolic RNN-RBM and initialize parameters.

    n_visible : integer
        Number of visible units.
    n_hidden : integer
        Number of hidden units of the conditional RBMs.
    n_hidden_recurrent : integer
        Number of hidden units of the RNN.

    Return a (v, v_sample, cost, monitor, params, updates_train, v_t,
    updates_generate) tuple:

    v : Theano matrix
        Symbolic variable holding an input sequence (used during training)
    v_sample : Theano matrix
        Symbolic variable holding the negative particles for CD log-likelihood
        gradient estimation (used during training)
    cost : Theano scalar
        Expression whose gradient (considering v_sample constant) corresponds
        to the LL gradient of the RNN-RBM (used during training)
    monitor : Theano scalar
        Frame-level pseudo-likelihood (useful for monitoring during training)
    params : tuple of Theano shared variables
        The parameters of the model to be optimized during training.
    updates_train : dictionary of Theano variable -> Theano variable
        Update object that should be passed to theano.function when compiling
        the training function.
    v_t : Theano matrix
        Symbolic variable holding a generated sequence (used during sampling)
    updates_generate : dictionary of Theano variable -> Theano variable
        Update object that should be passed to theano.function when compiling
        the generation function.
    '''
    random_state = np.random.RandomState(1999)
    W, = make_weights(n_visible, [n_hidden], random_state, init="normal",
                      scale=0.01)
    bv, bh = make_biases([n_visible, n_hidden])

    scale = 0.0001
    Wuh, Wuv = make_weights(n_hidden_recurrent, [n_hidden, n_visible],
                            random_state, init="normal", scale=scale)
    Wvu, = make_weights(n_visible, [n_hidden_recurrent],
                        random_state, init="normal", scale=scale)
    Wuu, Wui, Wqi, Wci, Wuf, Wqf, Wcf, Wuc, Wqc, Wuo, Wqo, Wco = make_weights(
        n_hidden_recurrent, [n_hidden_recurrent] * 12, random_state,
        init="normal", scale=scale)
    Wqv, Wqh = make_weights(n_hidden_recurrent, [n_visible, n_hidden],
                            random_state, init="normal", scale=scale)
    bu, bi, bf, bc, bo = make_biases([n_hidden_recurrent] * 5)

    # learned parameters as shared variables
    params = [W, bv, bh, Wuh, Wuv, Wvu, Wuu, bu, Wui, Wqi, Wci, bi,
              Wuf, Wqf, Wcf, bf, Wuc, Wqc, bc, Wuo, Wqo, Wco, bo, Wqv, Wqh]

    v = tensor.matrix()  # a training sequence
    # initial values for the RNN hidden units
    u0 = tensor.zeros((n_hidden_recurrent,))
    q0 = tensor.zeros((n_hidden_recurrent,))
    c0 = tensor.zeros((n_hidden_recurrent,))

    # If `v_t` is given, deterministic recurrence to compute the variable
    # biases bv_t, bh_t at each time step. If `v_t` is None, same recurrence
    # but with a separate Gibbs chain at each time step to sample (generate)
    # from the RNN-RBM. The resulting sample v_t is returned in order to be
    # passed down to the sequence history.
    def recurrence(v_t, u_tm1, q_tm1, c_tm1):
        bv_t = bv + u_tm1.dot(Wuv) + q_tm1.dot(Wqv)
        bh_t = bh + u_tm1.dot(Wuh) + q_tm1.dot(Wqh)
        generate = v_t is None
        if generate:
            v_t, _, _, updates = build_rbm(tensor.zeros((n_visible,)), W,
                                           bv_t, bh_t, k=25)
        u_t = tensor.tanh(bu + v_t.dot(Wvu) + u_tm1.dot(Wuu))
        i_t = tensor.tanh(bi + c_tm1.dot(Wci) + q_tm1.dot(Wqi) + u_t.dot(Wui))
        f_t = tensor.tanh(bf + c_tm1.dot(Wcf) + q_tm1.dot(Wqf) + u_t.dot(Wuf))
        c_t = (f_t * c_tm1) + (i_t * tensor.tanh(u_t.dot(Wuc) +
                                                 q_tm1.dot(Wqc) + bc))
        o_t = tensor.tanh(bo + c_t.dot(Wco) + q_tm1.dot(Wqo) + u_t.dot(Wuo))
        q_t = o_t * tensor.tanh(c_t)
        if generate:
            return ([v_t, u_t, q_t, c_t], updates)
        else:
            return [u_t, q_t, c_t, bv_t, bh_t]

    # For training, the deterministic recurrence is used to compute all the
    # {bv_t, bh_t, 1 <= t <= T} given v. Conditional RBMs can then be trained
    # in batches using those parameters.
    (u_t, q_t, c_t, bv_t, bh_t), updates_train = theano.scan(
        lambda v_t, u_tm1, q_tm1, c_tm1, *_: recurrence(v_t, u_tm1, q_tm1,
                                                        c_tm1),
        sequences=v, outputs_info=[u0, q0, c0, None, None],
        non_sequences=params)
    v_sample, cost, monitor, updates_rbm = build_rbm(v, W, bv_t[:], bh_t[:],
                                                     k=15)
    updates_train.update(updates_rbm)

    # symbolic loop for sequence generation
    (v_t, u_t, q_t, c_t), updates_generate = theano.scan(
        lambda u_tm1, q_tm1, c_tm1, *_: recurrence(None, u_tm1, q_tm1, c_tm1),
        outputs_info=[None, u0, q0, c0], non_sequences=params, n_steps=200)

    return (v, v_sample, cost, monitor, params, updates_train, v_t,
            updates_generate)
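# A hedged usage sketch (assumed, not from the original file): the matrix-input
# variant is driven the same way, one (timesteps, n_visible) sequence per
# training call. Size arguments are example values only.
(v, v_sample, cost, monitor, params, updates_train, v_t,
 updates_generate) = build_lstmrbm(n_visible=88, n_hidden=150,
                                   n_hidden_recurrent=100)
gradients = tensor.grad(cost, params, consider_constant=[v_sample])
updates_train.update({p: p - 0.001 * g for p, g in zip(params, gradients)})
train_function = theano.function([v], monitor, updates=updates_train)
generate_function = theano.function([], v_t, updates=updates_generate)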