Code Example #1
File: nn_word.py  Project: afcarl/neural-pos-tagger
    def __init__(self, name, x, y, lr, init_emb, vocab_size, emb_dim,
                 hidden_dim, output_dim, window, opt):

        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.name = name
        self.x = x
        self.y = y
        self.lr = lr
        self.input = [self.x, self.y, self.lr]

        n_words = x.shape[0]
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(
            sample_weights(hidden_dim, 1, window, emb_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim, 1))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]
        """ pad """
        self.zero = theano.shared(
            np.zeros(shape=(1, 1, window // 2, emb_dim),  # integer division keeps the pad width an int
                     dtype=theano.config.floatX))
        """ look up embedding """
        self.x_emb = self.emb[self.x]  # x_emb: 1D: n_words, 2D: n_emb
        """ convolution """
        self.x_in = self.conv(self.x_emb)
        """ feed-forward computation """
        self.h = relu(
            self.x_in.reshape((self.x_in.shape[1], self.x_in.shape[2])) +
            T.repeat(self.b_in, T.cast(self.x_in.shape[2], 'int32'), 1)).T
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ prediction """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ cost function """
        self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)
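
The examples in this listing rely on a few helpers that are not shown, starting with `sample_weights`. A minimal stand-in consistent with how it is called above (a scaled uniform initializer returning Theano's configured float dtype) might look like this; it is an assumption, not the project's actual code:

# Hypothetical stand-in for the sample_weights helper used throughout these examples.
import numpy as np
import theano

def sample_weights(*shape):
    # scaled uniform initialization, returned in Theano's configured float dtype
    s = np.sqrt(6.0 / np.sum(shape))
    return np.asarray(np.random.uniform(low=-s, high=s, size=shape),
                      dtype=theano.config.floatX)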
Code Example #2
File: nn_word.py  Project: hiroki13/neural-pos-tagger
    def __init__(self, name, x, y, lr, init_emb, vocab_size, emb_dim, hidden_dim, output_dim, window, opt):

        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.name = name
        self.x = x
        self.y = y
        self.lr = lr
        self.input = [self.x, self.y, self.lr]

        n_words = x.shape[0]

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(sample_weights(hidden_dim, 1, window, emb_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim, 1))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

        """ pad """
        self.zero = theano.shared(np.zeros(shape=(1, 1, window // 2, emb_dim), dtype=theano.config.floatX))

        """ look up embedding """
        self.x_emb = self.emb[self.x]  # x_emb: 1D: n_words, 2D: n_emb

        """ convolution """
        self.x_in = self.conv(self.x_emb)

        """ feed-forward computation """
        self.h = relu(self.x_in.reshape((self.x_in.shape[1], self.x_in.shape[2])) + T.repeat(self.b_in, T.cast(self.x_in.shape[2], 'int32'), 1)).T
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ prediction """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ cost function """
        self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
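
The `sgd` and `ada_grad` update builders are likewise not part of the excerpt. Judging from the call sites (`sgd(cost, params, emb, x_emb, lr)` and `ada_grad(cost, params, emb, x_emb, x, lr)`), they return Theano update lists, with the embedding matrix updated sparsely through the looked-up rows. The following is only a sketch of that usual recipe, not the repository's implementation:

# Sketch of plausible update builders matching the call sites above (assumptions).
import numpy as np
import theano
import theano.tensor as T

def sgd(cost, params, emb, x_emb, lr):
    # dense updates for the ordinary parameters
    updates = [(p, p - lr * T.grad(cost, p)) for p in params]
    # sparse update: only the embedding rows selected as x_emb = emb[x] change
    updates.append((emb, T.inc_subtensor(x_emb, -lr * T.grad(cost, x_emb))))
    return updates

def ada_grad(cost, params, emb, x_emb, x, lr, eps=1.0):
    # per-parameter squared-gradient accumulators (dense version only; the real
    # helper presumably also updates the embedding rows indexed by x sparsely)
    updates = []
    for p in params:
        g = T.grad(cost, p)
        acc = theano.shared(np.zeros(p.get_value(borrow=True).shape,
                                     dtype=theano.config.floatX))
        acc_new = acc + g ** 2
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g / T.sqrt(acc_new + eps)))
    return updates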
Code Example #3
    def __init__(self, x, y, n_words, batch_size, lr, init_emb, vocab_size,
                 emb_dim, hidden_dim, output_dim, window, opt):
        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(sample_weights(emb_dim * window, hidden_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]
        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # x_emb: 1D: batch_size * n_words * window, 2D: emb_dim
        """ forward """
        self.h = relu(
            T.dot(self.x_emb.reshape((batch_size * n_words, emb_dim *
                                      window)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words),
                                             self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)
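
These constructors only build the symbolic graph; training requires compiling the cost and updates with `theano.function`. The toy softmax layer below is a self-contained sketch of that compile step, not the tagger itself:

# Self-contained sketch of the compile step the constructors above feed into.
import numpy as np
import theano
import theano.tensor as T

x = T.matrix('x')
y = T.ivector('y')
lr = T.scalar('lr')

W = theano.shared(np.zeros((50, 45), dtype=theano.config.floatX))
b = theano.shared(np.zeros(45, dtype=theano.config.floatX))

p_y_given_x = T.nnet.softmax(T.dot(x, W) + b)
nll = -T.sum(T.log(p_y_given_x)[T.arange(y.shape[0]), y])
updates = [(p, p - lr * T.grad(nll, p)) for p in (W, b)]

train = theano.function(inputs=[x, y, lr], outputs=nll, updates=updates)

# one toy step: 8 examples with 50 features, 45 output classes
cost = train(np.random.randn(8, 50).astype(theano.config.floatX),
             np.random.randint(0, 45, size=8).astype('int32'),
             np.asarray(0.1, dtype=theano.config.floatX))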
Code Example #4
    def __init__(self, x, y, n_words, batch_size, lr, init_emb, vocab_size, emb_dim, hidden_dim, output_dim, window, opt):
        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

        self.W_in = theano.shared(sample_weights(emb_dim * window, hidden_dim))
        self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # x_emb: 1D: batch_size * n_words * window, 2D: emb_dim

        """ forward """
        self.h = relu(T.dot(self.x_emb.reshape((batch_size * n_words, emb_dim * window)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
Code Example #5
            return args.lr * (args.decay_factor**2)
        else:
            return args.lr * args.decay_factor

    lr = lr_schedule
elif args.linear_lr_decay:

    def lr_schedule(epoch):
        return args.lr * (args.epochs - epoch) / args.epochs

    lr = lr_schedule
else:
    lr = args.lr

if args.optimizer == 'sgd':
    opt_init, opt_apply, get_params = myopt.sgd(lr)
elif args.optimizer == 'momentum':
    opt_init, opt_apply, get_params = myopt.momentum(
        lr, args.momentum, weight_decay=args.weight_decay)
elif args.optimizer == 'adagrad':
    opt_init, opt_apply, get_params = optimizers.adagrad(lr, args.momentum)
elif args.optimizer == 'adam':
    opt_init, opt_apply, get_params = optimizers.adam(lr)

state = opt_init(params)

if args.loss == 'logistic':
    loss = lambda fx, y: np.mean(-np.sum(logsoftmax(fx) * y, axis=1))
elif args.loss == 'squared':
    loss = lambda fx, y: np.mean(np.sum((fx - y)**2, axis=1))
value_and_grad_loss = jit(
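
The `opt_init` / `opt_apply` / `get_params` triple above follows the stax-style optimizer interface (as in `jax.example_libraries.optimizers`); `myopt` is presumably a local module with the same convention. A minimal self-contained sketch of driving such a triple, with a toy least-squares loss standing in for the snippet's model, looks like this:

# Minimal sketch of the init/update/get_params optimizer interface (toy model).
import jax.numpy as jnp
from jax import grad, jit
from jax.example_libraries import optimizers

def loss(params, batch):
    x, y = batch
    w, b = params
    return jnp.mean((x @ w + b - y) ** 2)

opt_init, opt_update, get_params = optimizers.sgd(1e-2)

@jit
def step(i, opt_state, batch):
    params = get_params(opt_state)
    return opt_update(i, grad(loss)(params, batch), opt_state)

params = (jnp.zeros((3, 1)), jnp.zeros(1))
opt_state = opt_init(params)
batch = (jnp.ones((4, 3)), jnp.ones((4, 1)))
for i in range(10):
    opt_state = step(i, opt_state, batch)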
Code Example #6
    def __init__(self, x_span, x_word, x_ctx, x_dist, x_slen, y, init_emb, n_vocab, dim_w, dim_d, dim_h, L2_reg):
        """
        :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
        :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
        :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
        :param x_dist: 1D: batch; 2D: 2; elem=[sent dist, ment dist]
        :param x_slen: 1D: batch; 2D: 3; elem=[m_span_len, a_span_len, head_match]
        :param y     : 1D: batch
        """

        self.input  = [x_span, x_word, x_ctx, x_dist, y]
        self.x_span = x_span
        self.x_word = x_word
        self.x_ctx  = x_ctx
        self.x_dist = x_dist
        self.x_slen = x_slen
        self.y      = y

        dim_x = dim_w * (10 + 4 + 4 + 2 + 3)
        batch = y.shape[0]

        """ Params """
        if init_emb is None:
            self.emb = theano.shared(sample_weights(n_vocab, dim_w))
        else:
            self.emb = theano.shared(init_emb)

        self.W_d = theano.shared(sample_weights(dim_d, dim_w))
        self.W_l = theano.shared(sample_weights(7, dim_w))
        self.W_i = theano.shared(sample_weights(dim_x, dim_h))
        self.W_h = theano.shared(sample_weights(dim_h, dim_h))
        self.W_o = theano.shared(sample_weights(dim_h))
        self.params = [self.W_d, self.W_l, self.W_i, self.W_h, self.W_o]

        """ Input Layer """
        x_vec = T.concatenate([x_span, x_word, x_ctx], 1).flatten()  # 1D: batch * (limit * 2 + 4 + 20)
        x_in = self.emb[x_vec]     # 1D: batch * (limit * 2 + 4 + 20), 2D: dim_w
        x_d = self.W_d[x_dist]     # 1D: batch, 2D: 2, 3D: dim_w
        x_l = self.W_l[x_slen]     # 1D: batch, 2D: 3, 3D: dim_w
        x = T.concatenate([x_in.reshape((batch, -1)), x_d.reshape((batch, -1)), x_l.reshape((batch, -1))], 1)

        """ Intermediate Layers """
        h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
        h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

        """ Output Layer """
        p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

        """ Cost Function """
        self.nll = - T.sum(y * T.log(p_y) + (1. - y) * T.log((1. - p_y)))  # TODO: ranking criterion
        self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

        """ Update """
        self.updates = sgd(self.cost, self.params, self.emb, x_in)

        """ Predicts """
        self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], dtype=theano.config.floatX))
        self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
        self.y_hat_index = T.argmax(p_y)
        self.p_y_hat = p_y[self.y_hat_index]

        """ Check Results """
        self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
        self.total_p = T.sum(self.y_hat, 0)
        self.total_r = T.sum(y, keepdims=True)
        self.correct = T.sum(self.result, 0)
        self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
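
`binary_predict` is not included in this listing; given the shape comments (1D: batch, 2D: 9 thresholds), a plausible definition is a broadcasted comparison of `p_y` against `self.thresholds`. This is an assumption about the method, not its actual body:

# Hypothetical binary_predict, consistent with the shape comments above.
import theano.tensor as T

def binary_predict(self, p_y):
    # p_y: 1D: batch  ->  y_hat: 1D: batch, 2D: 9 (one column per threshold)
    return T.cast(T.ge(p_y.dimshuffle(0, 'x'), self.thresholds), 'int32')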
Code Example #7
    def __init__(self, name, w, c, b, y, lr,
                 init_w_emb, vocab_w_size, vocab_c_size,
                 w_emb_dim, c_emb_dim, w_hidden_dim, c_hidden_dim, output_dim,
                 window, opt):

        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.name = name
        self.w = w
        self.c = c
        self.b = b
        self.y = y
        self.lr = lr
        self.input = [self.w, self.c, self.b, self.y, self.lr]

        n_phi = w_emb_dim + c_emb_dim * window
        n_words = w.shape[0]

        """ params """
        if init_w_emb is not None:
            self.emb = theano.shared(init_w_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.emb_c = theano.shared(sample_norm_dist(vocab_c_size, c_emb_dim))
        self.W_in = theano.shared(sample_weights(w_hidden_dim, 1, window, n_phi))
        self.W_c = theano.shared(sample_weights(c_hidden_dim, 1, window, c_emb_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim, 1))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        """ pad """
        self.zero = theano.shared(np.zeros(shape=(1, 1, window // 2, n_phi), dtype=theano.config.floatX))
        self.zero_c = theano.shared(np.zeros(shape=(1, 1, window // 2, c_emb_dim), dtype=theano.config.floatX))

        self.params = [self.emb_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

        """ look up embedding """
        x_emb = self.emb[self.w]  # x_emb: 1D: n_words, 2D: w_emb_dim
        c_emb = self.emb_c[self.c]  # c_emb: 1D: n_chars, 2D: c_emb_dim

        """ create feature """
        c_phi = self.create_char_feature(self.b, c_emb, self.zero_c) + self.b_c  # 1D: n_words, 2D: c_hidden_dim(50)
        x_phi = T.concatenate([x_emb, c_phi], axis=1)  # 1D: n_words, 2D: w_emb_dim(100) + c_hidden_dim(50)

        """ convolution """
        x_padded = T.concatenate([self.zero, x_phi.reshape((1, 1, x_phi.shape[0], x_phi.shape[1])), self.zero], axis=2)  # x_padded: 1D: n_words + n_pad, 2D: n_phi
        x_in = conv2d(input=x_padded, filters=self.W_in)  # 1D: 1, 2D: w_hidden_dim(300), 3D: n_words, 4D: 1

        """ feed-forward computation """
        h = relu(x_in.reshape((x_in.shape[1], x_in.shape[2])) + T.repeat(self.b_in, T.cast(x_in.shape[2], 'int32'), 1)).T
        self.o = T.dot(h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ prediction """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ cost function """
        self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, x_emb, self.w, self.lr)
Code Example #8
import metrics as mt
import numpy as np
import optimizers as op
import utils

trials = 1000

gradsum = 0.0
sgdsum = 0.0
batchsum = 0.0
coordsum = 0.0

for i in range(trials):
    Xtrain, ytrain, Xval, yval, Xtest, Ytest = utils.load_data()

    wgrad, bgrad = op.grad_desc(Xtrain, ytrain, 0.01, 0.000001)
    wsgd, bsgd = op.sgd(Xtrain, ytrain, 0.01, 0.000001)
    wbsgd, bbsgd = op.batch_sgd(Xtrain, ytrain, 0.01, 0.000001, 50)
    wcord, bcord = op.coorddesc(Xtrain, ytrain, 0.01)

    gradsum += mt.error(Xval, yval, wgrad, bgrad)
    sgdsum += mt.error(Xval, yval, wsgd, bsgd)
    batchsum += mt.error(Xval, yval, wbsgd, bbsgd)
    coordsum += mt.error(Xval, yval, wcord, bcord)

print('Average Grad Misclassification: ', gradsum / float(trials))
print('Average SGD Misclassification: ', sgdsum / float(trials))
print('Average Batch SGD Misclassification: ', batchsum / float(trials))
print('Average LASSO Misclassification: ', coordsum / float(trials))
Code Example #9
        init_log_std = -5 * np.ones(D)
        init_var_params = np.concatenate([init_mean, init_log_std])
        variational_params = optfun(num_iters, init_var_params, callback)
        return np.array(elbos)

    # let's optimize this with a few different step sizes
    elbo_lists = []
    step_sizes = [.1, .25, .5]
    for step_size in step_sizes:
        # optimize with standard gradient + adam
        optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size,
                                                    num_iters=n, callback=cb)
        standard_lls = optimize_and_lls(optfun)

        # optimize with natural gradient + sgd, no momentum
        optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size,
                                         num_iters=n, callback=cb, mass=.001)
        natural_lls = optimize_and_lls(optnat)
        elbo_lists.append((standard_lls, natural_lls))

    # visually compare the ELBO
    plt.figure(figsize=(12,8))
    colors = ['b', 'k', 'g']
    for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
        plt.plot(np.arange(len(stand_lls)), stand_lls,
                 '--', label="standard (adam, step-size = %2.2f)"%ss, alpha=.5, c=col)
        plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
                 label="natural (sgd, step-size = %2.2f)"%ss, c=col)

    llrange = natural_lls.max() - natural_lls.min()
    plt.ylim((natural_lls.max() - llrange*.1, natural_lls.max() + 10))
    plt.xlabel("optimization iteration")
Code Example #10
rng = random.PRNGKey(0)
in_shape = (-1, 784)
out_shape, net_params = net_init(rng, in_shape)

###
# Loss calculation, negative log likelihood
###

def loss(params, batch):
  inputs, targets = batch
  preds = net_apply(params, inputs)
  return -np.mean(preds * targets)

lr = 0.00025
# Use optimizers to set optimizer initialization and update functions
opt_init, opt_update, get_params = optimizers.sgd(1.0) #optimizers.exponential_decay(lr, 1000, 0.95))

###
# Update step
###

from adacurv.jax.utils.cg import cg_solve_jax_hvp

def hvp(loss, params, batch, v):
  """Computes the hessian vector product Hv.
  This implementation uses forward-over-reverse mode for computing the hvp.
  Args:
    loss: function computing the loss with signature
      loss(params, batch).
    params: pytree for the parameters of the model.
    batch:  A batch of data. Any format is fine as long as it is a valid input
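
The docstring above describes the standard forward-over-reverse construction; with that signature, a Hessian-vector product can be written in a few lines of JAX. This is a sketch of the usual pattern, not necessarily the project's implementation:

# Forward-over-reverse Hessian-vector product (standard JAX pattern, shown as a sketch).
from jax import grad, jvp

def hvp(loss, params, batch, v):
    # reverse mode builds the gradient; forward mode differentiates it along v
    return jvp(lambda p: grad(loss)(p, batch), (params,), (v,))[1]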
Code Example #11
    def __init__(self, x, c, y, n_words, batch_size, lr, init_emb,
                 vocab_w_size, w_emb_dim, w_hidden_dim, c_emb_dim,
                 c_hidden_dim, output_dim, vocab_c_size, window, opt):
        assert window % 2 == 1, 'Window size must be odd'
        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        n_phi = (w_emb_dim + c_hidden_dim) * window
        max_len_char = T.cast(self.c.shape[2], 'int32')
        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.pad = build_shared_zeros((1, c_emb_dim))
        self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
        self.emb_c = T.concatenate([self.pad, self.e_c], 0)

        self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
        self.W_c = theano.shared(
            sample_weights(c_emb_dim * window, c_hidden_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [
            self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c,
            self.b_y
        ]
        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
        self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
        self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))
        """ convolution """
        self.c_phi = T.max(
            T.dot(
                self.c_emb.reshape(
                    (batch_size * n_words, window, max_len_char, -1)),
                self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
        self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)
        """ forward """
        self.h = relu(
            T.dot(self.x_phi.reshape((batch_size * n_words,
                                      n_phi)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)
        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)
        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words),
                                             self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb,
                               self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb,
                                    self.x_emb, self.x, self.lr)
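
`build_shared_zeros` also does not appear in the excerpt; it is presumably just a zero-initialized shared variable, along these lines (an assumption):

# Presumed build_shared_zeros helper (an assumption, not the project's code).
import numpy as np
import theano

def build_shared_zeros(shape):
    return theano.shared(np.zeros(shape, dtype=theano.config.floatX))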
Code Example #12
    def __init__(self, x, c, y, n_words, batch_size, lr, init_emb, vocab_w_size, w_emb_dim, w_hidden_dim,
                 c_emb_dim, c_hidden_dim, output_dim, vocab_c_size, window, opt):
        assert window % 2 == 1, 'Window size must be odd'

        """ input """
        self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
        self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
        self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
        self.y = y
        self.batch_size = batch_size
        self.n_words = n_words
        self.lr = lr

        n_phi = (w_emb_dim + c_hidden_dim) * window
        max_len_char = T.cast(self.c.shape[2], 'int32')

        """ params """
        if init_emb is not None:
            self.emb = theano.shared(init_emb)
        else:
            self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

        self.pad = build_shared_zeros((1, c_emb_dim))
        self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
        self.emb_c = T.concatenate([self.pad, self.e_c], 0)

        self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
        self.W_c = theano.shared(sample_weights(c_emb_dim * window, c_hidden_dim))
        self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

        self.b_in = theano.shared(sample_weights(w_hidden_dim))
        self.b_c = theano.shared(sample_weights(c_hidden_dim))
        self.b_y = theano.shared(sample_weights(output_dim))

        self.params = [self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

        """ look up embedding """
        self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
        self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
        self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))

        """ convolution """
        self.c_phi = T.max(T.dot(self.c_emb.reshape((batch_size * n_words, window, max_len_char, -1)), self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
        self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)

        """ forward """
        self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)), self.W_in) + self.b_in)
        self.o = T.dot(self.h, self.W_out) + self.b_y
        self.p_y_given_x = T.nnet.softmax(self.o)

        """ predict """
        self.y_pred = T.argmax(self.o, axis=1)
        self.result = T.eq(self.y_pred, self.y)

        """ loss """
        self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
        self.nll = -T.sum(self.log_p)
        self.cost = self.nll

        if opt == 'sgd':
            self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
        else:
            self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)