def __init__(self, name, x, y, lr, init_emb, vocab_size, emb_dim, hidden_dim, output_dim, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.name = name
    self.x = x
    self.y = y
    self.lr = lr
    self.input = [self.x, self.y, self.lr]

    n_words = x.shape[0]

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

    self.W_in = theano.shared(sample_weights(hidden_dim, 1, window, emb_dim))
    self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(hidden_dim, 1))
    self.b_y = theano.shared(sample_weights(output_dim))

    self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

    """ pad """
    # integer division so the shape stays an int under Python 3
    self.zero = theano.shared(np.zeros(shape=(1, 1, window // 2, emb_dim), dtype=theano.config.floatX))

    """ look up embedding """
    self.x_emb = self.emb[self.x]  # x_emb: 1D: n_words, 2D: n_emb

    """ convolution """
    self.x_in = self.conv(self.x_emb)

    """ feed-forward computation """
    self.h = relu(self.x_in.reshape((self.x_in.shape[1], self.x_in.shape[2]))
                  + T.repeat(self.b_in, T.cast(self.x_in.shape[2], 'int32'), 1)).T
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ prediction """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ cost function """
    self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
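# The `sgd` and `ada_grad` helpers called above are not included in this snippet.
# Below is a minimal, hedged sketch (an assumption, not the repository's actual
# implementation) of what such update builders commonly look like in Theano:
# dense updates for ordinary parameters plus a sparse T.inc_subtensor update for
# only the embedding rows that were looked up in the current example.
import theano
import theano.tensor as T


def sgd_updates_sketch(cost, params, emb, x_emb, lr):
    """Plain SGD; `x_emb` is the sub-tensor emb[x] that actually feeds `cost`."""
    updates = [(p, p - lr * T.grad(cost, p)) for p in params]
    # Touch only the embedding rows used in this step.
    updates.append((emb, T.inc_subtensor(x_emb, -lr * T.grad(cost, x_emb))))
    return updates


def ada_grad_updates_sketch(cost, params, emb, x_emb, x, lr, eps=1.):
    """AdaGrad with per-parameter accumulators; the embedding matrix keeps a
    full-size accumulator, but only the looked-up rows are updated each step."""
    updates = []
    for p in params:
        acc = theano.shared(p.get_value(borrow=True) * 0.)
        g = T.grad(cost, p)
        acc_new = acc + g ** 2
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g / T.sqrt(acc_new + eps)))

    ids = x.flatten()
    g_e = T.grad(cost, x_emb)  # gradient w.r.t. the looked-up embedding rows
    acc_e = theano.shared(emb.get_value(borrow=True) * 0.)
    acc_e_new = T.inc_subtensor(acc_e[ids], g_e ** 2)
    updates.append((acc_e, acc_e_new))
    updates.append((emb, T.inc_subtensor(x_emb, -lr * g_e / T.sqrt(acc_e_new[ids] + eps))))
    return updates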
def __init__(self, x, y, n_words, batch_size, lr, init_emb, vocab_size, emb_dim, hidden_dim, output_dim, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
    self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
    self.y = y
    self.batch_size = batch_size
    self.n_words = n_words
    self.lr = lr

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_size, emb_dim))

    self.W_in = theano.shared(sample_weights(emb_dim * window, hidden_dim))
    self.W_out = theano.shared(sample_weights(hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))

    self.params = [self.W_in, self.W_out, self.b_in, self.b_y]

    """ look up embedding """
    self.x_emb = self.emb[self.x_v]  # x_emb: 1D: batch_size * n_words * window, 2D: emb_dim

    """ forward """
    self.h = relu(T.dot(self.x_emb.reshape((batch_size * n_words, emb_dim * window)), self.W_in) + self.b_in)
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ predict """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ loss """
    self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
    self.nll = -T.sum(self.log_p)
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
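# The comments above describe `x` as a matrix of word-id windows. A small NumPy
# sketch (an illustrative assumption, not part of the original preprocessing code)
# of how such centered windows are typically built from a padded id sequence:
import numpy as np


def make_windows_sketch(word_ids, window, pad_id=0):
    """word_ids: 1D array of word ids for one sentence.
    Returns an array of shape (len(word_ids), window), one centered window per word."""
    assert window % 2 == 1, 'Window size must be odd'
    pad = window // 2
    padded = np.concatenate([np.full(pad, pad_id), word_ids, np.full(pad, pad_id)])
    return np.stack([padded[i:i + window] for i in range(len(word_ids))])


# Example: a 5-word sentence with a window of 3.
# make_windows_sketch(np.array([3, 7, 2, 9, 4]), window=3)
# -> [[0, 3, 7], [3, 7, 2], [7, 2, 9], [2, 9, 4], [9, 4, 0]]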
            return args.lr * (args.decay_factor ** 2)
        else:
            return args.lr * args.decay_factor
    lr = lr_schedule
elif args.linear_lr_decay:
    def lr_schedule(epoch):
        return args.lr * (args.epochs - epoch) / args.epochs
    lr = lr_schedule
else:
    lr = args.lr

if args.optimizer == 'sgd':
    opt_init, opt_apply, get_params = myopt.sgd(lr)
elif args.optimizer == 'momentum':
    opt_init, opt_apply, get_params = myopt.momentum(lr, args.momentum, weight_decay=args.weight_decay)
elif args.optimizer == 'adagrad':
    opt_init, opt_apply, get_params = optimizers.adagrad(lr, args.momentum)
elif args.optimizer == 'adam':
    opt_init, opt_apply, get_params = optimizers.adam(lr)

state = opt_init(params)

if args.loss == 'logistic':
    loss = lambda fx, y: np.mean(-np.sum(logsoftmax(fx) * y, axis=1))
elif args.loss == 'squared':
    loss = lambda fx, y: np.mean(np.sum((fx - y) ** 2, axis=1))

value_and_grad_loss = jit(
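# `opt_init`/`opt_apply`/`get_params` appear to follow the (init, update, get_params)
# triple convention of jax.example_libraries.optimizers (the custom `myopt` module and
# the `args` namespace above are not redefined here). A minimal, self-contained usage
# sketch of that triple on a toy quadratic, with fresh names to avoid shadowing:
from jax import grad, jit
import jax.numpy as jnp
from jax.example_libraries import optimizers as jax_opt


def toy_loss(params):
    return jnp.sum((params - 3.0) ** 2)


opt_init_t, opt_update_t, get_params_t = jax_opt.sgd(step_size=0.1)
state_t = opt_init_t(jnp.zeros(4))


@jit
def step(i, state):
    g = grad(toy_loss)(get_params_t(state))
    return opt_update_t(i, g, state)


for i in range(100):
    state_t = step(i, state_t)
# get_params_t(state_t) is now close to [3., 3., 3., 3.]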
def __init__(self, x_span, x_word, x_ctx, x_dist, x_slen, y, init_emb, n_vocab, dim_w, dim_d, dim_h, L2_reg):
    """
    :param x_span: 1D: batch, 2D: limit * 2 (10); elem=word id
    :param x_word: 1D: batch, 2D: 4 (m_first, m_last, a_first, a_last); elem=word id
    :param x_ctx : 1D: batch, 2D: window * 2 * 2 (20); elem=word id
    :param x_dist: 1D: batch; 2D: 2; elem=[sent dist, ment dist]
    :param x_slen: 1D: batch; 2D: 3; elem=[m_span_len, a_span_len, head_match]
    :param y     : 1D: batch
    """
    self.input = [x_span, x_word, x_ctx, x_dist, y]

    self.x_span = x_span
    self.x_word = x_word
    self.x_ctx = x_ctx
    self.x_dist = x_dist
    self.x_slen = x_slen
    self.y = y

    dim_x = dim_w * (10 + 4 + 4 + 2 + 3)
    batch = y.shape[0]

    """ Params """
    if init_emb is None:
        self.emb = theano.shared(sample_weights(n_vocab, dim_w))
    else:
        self.emb = theano.shared(init_emb)

    self.W_d = theano.shared(sample_weights(dim_d, dim_w))
    self.W_l = theano.shared(sample_weights(7, dim_w))
    self.W_i = theano.shared(sample_weights(dim_x, dim_h))
    self.W_h = theano.shared(sample_weights(dim_h, dim_h))
    self.W_o = theano.shared(sample_weights(dim_h))
    self.params = [self.W_d, self.W_l, self.W_i, self.W_h, self.W_o]

    """ Input Layer """
    x_vec = T.concatenate([x_span, x_word, x_ctx], 1).flatten()  # 1D: batch * (limit * 2 + 4 + 20)
    x_in = self.emb[x_vec]   # 1D: batch, 2D: limit * 2, 3D: dim_w
    x_d = self.W_d[x_dist]   # 1D: batch, 2D: 2, 3D: dim_w
    x_l = self.W_l[x_slen]   # 1D: batch, 2D: 2, 3D: dim_w
    x = T.concatenate([x_in.reshape((batch, -1)), x_d.reshape((batch, -1)), x_l.reshape((batch, -1))], 1)

    """ Intermediate Layers """
    h1 = relu(T.dot(x, self.W_i))   # h1: 1D: batch, 2D: dim_h
    h2 = relu(T.dot(h1, self.W_h))  # h2: 1D: batch, 2D: dim_h

    """ Output Layer """
    p_y = sigmoid(T.dot(h2, self.W_o))  # p_y: 1D: batch

    """ Cost Function """
    self.nll = -T.sum(y * T.log(p_y) + (1. - y) * T.log(1. - p_y))  # TODO: ranking criterion
    self.cost = self.nll + L2_reg * L2_sqr(params=self.params) / 2

    """ Update """
    self.updates = sgd(self.cost, self.params, self.emb, x_in)

    """ Predicts """
    self.thresholds = theano.shared(np.asarray([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
                                               dtype=theano.config.floatX))
    self.y_hat = self.binary_predict(p_y)  # 1D: batch, 2D: 9 (thresholds)
    self.y_hat_index = T.argmax(p_y)
    self.p_y_hat = p_y[self.y_hat_index]

    """ Check Results """
    self.result = T.eq(self.y_hat, y.reshape((y.shape[0], 1)))  # 1D: batch, 2D: 9 (thresholds)
    self.total_p = T.sum(self.y_hat, 0)
    self.total_r = T.sum(y, keepdims=True)
    self.correct = T.sum(self.result, 0)
    self.correct_t, self.correct_f = correct_tf(self.result, y.reshape((y.shape[0], 1)))
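# `binary_predict` is not shown in this snippet. A hedged sketch (an assumption,
# not the class's actual method) of what it plausibly does, given that
# `self.thresholds` holds nine cut-offs and `self.result` compares a (batch, 9)
# matrix of hard decisions against y: broadcast p_y against the thresholds and
# emit one 0/1 decision per threshold.
import theano
import theano.tensor as T


def binary_predict_sketch(p_y, thresholds):
    """p_y: 1D (batch,) probabilities; thresholds: 1D (n_thresholds,) shared variable.
    Returns 1D: batch, 2D: n_thresholds; elem in {0, 1}."""
    return T.cast(T.gt(p_y.reshape((p_y.shape[0], 1)), thresholds), theano.config.floatX)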
def __init__(self, name, w, c, b, y, lr, init_w_emb, vocab_w_size, vocab_c_size, w_emb_dim, c_emb_dim, w_hidden_dim, c_hidden_dim, output_dim, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.name = name
    self.w = w
    self.c = c
    self.b = b
    self.y = y
    self.lr = lr
    self.input = [self.w, self.c, self.b, self.y, self.lr]

    n_phi = w_emb_dim + c_emb_dim * window
    n_words = w.shape[0]

    """ params """
    if init_w_emb is not None:
        self.emb = theano.shared(init_w_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

    self.emb_c = theano.shared(sample_norm_dist(vocab_c_size, c_emb_dim))
    self.W_in = theano.shared(sample_weights(w_hidden_dim, 1, window, n_phi))
    self.W_c = theano.shared(sample_weights(c_hidden_dim, 1, window, c_emb_dim))
    self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(w_hidden_dim, 1))
    self.b_c = theano.shared(sample_weights(c_hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))

    """ pad """
    # integer division so the shapes stay ints under Python 3
    self.zero = theano.shared(np.zeros(shape=(1, 1, window // 2, n_phi), dtype=theano.config.floatX))
    self.zero_c = theano.shared(np.zeros(shape=(1, 1, window // 2, c_emb_dim), dtype=theano.config.floatX))

    self.params = [self.emb_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

    """ look up embedding """
    x_emb = self.emb[self.w]    # x_emb: 1D: n_words, 2D: w_emb_dim
    c_emb = self.emb_c[self.c]  # c_emb: 1D: n_chars, 2D: c_emb_dim

    """ create feature """
    c_phi = self.create_char_feature(self.b, c_emb, self.zero_c) + self.b_c  # 1D: n_words, 2D: c_hidden_dim(50)
    x_phi = T.concatenate([x_emb, c_phi], axis=1)  # 1D: n_words, 2D: w_emb_dim(100) + c_hidden_dim(50)

    """ convolution """
    x_padded = T.concatenate([self.zero, x_phi.reshape((1, 1, x_phi.shape[0], x_phi.shape[1])), self.zero],
                             axis=2)  # x_padded: 1D: n_words + n_pad, 2D: n_phi
    x_in = conv2d(input=x_padded, filters=self.W_in)  # 1D: 1, 2D: w_hidden_dim(300), 3D: n_words, 4D: 1

    """ feed-forward computation """
    h = relu(x_in.reshape((x_in.shape[1], x_in.shape[2]))
             + T.repeat(self.b_in, T.cast(x_in.shape[2], 'int32'), 1)).T
    self.o = T.dot(h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ prediction """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ cost function """
    self.nll = -T.sum(T.log(self.p_y_given_x)[T.arange(n_words), self.y])
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, x_emb, self.w, self.lr)
import metrics as mt
import numpy as np
import optimizers as op
import utils

trials = 1000
gradsum = 0.0
sgdsum = 0.0
batchsum = 0.0
coordsum = 0.0

for i in range(trials):
    Xtrain, ytrain, Xval, yval, Xtest, Ytest = utils.load_data()

    wgrad, bgrad = op.grad_desc(Xtrain, ytrain, 0.01, 0.000001)
    wsgd, bsgd = op.sgd(Xtrain, ytrain, 0.01, 0.000001)
    wbsgd, bbsgd = op.batch_sgd(Xtrain, ytrain, 0.01, 0.000001, 50)
    wcord, bcord = op.coorddesc(Xtrain, ytrain, 0.01)

    gradsum += mt.error(Xval, yval, wgrad, bgrad)
    sgdsum += mt.error(Xval, yval, wsgd, bsgd)
    batchsum += mt.error(Xval, yval, wbsgd, bbsgd)
    coordsum += mt.error(Xval, yval, wcord, bcord)

print('Average Grad Misclassification: ', gradsum / float(trials))
print('Average SGD Misclassification: ', sgdsum / float(trials))
print('Average Batch SGD Misclassification: ', batchsum / float(trials))
print('Average LASSO Misclassification: ', coordsum / float(trials))
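# `optimizers as op` and `metrics as mt` are local modules not included in this
# snippet. A hedged sketch (an assumption inferred only from the calls above,
# not the project's actual code) of a misclassification-rate metric for a linear
# model with +/-1 labels, matching the mt.error(X, y, w, b) call signature:
import numpy as np


def error_sketch(X, y, w, b):
    """Fraction of examples whose predicted sign disagrees with the label."""
    preds = np.sign(X.dot(w) + b)
    return np.mean(preds != y)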
    init_log_std = -5 * np.ones(D)
    init_var_params = np.concatenate([init_mean, init_log_std])
    variational_params = optfun(num_iters, init_var_params, callback)
    return np.array(elbos)


# let's optimize this with a few different step sizes
elbo_lists = []
step_sizes = [.1, .25, .5]
for step_size in step_sizes:
    # optimize with standard gradient + adam
    optfun = lambda n, init, cb: adam(gradient, init, step_size=step_size, num_iters=n, callback=cb)
    standard_lls = optimize_and_lls(optfun)

    # optimize with natural gradient + sgd, no momentum
    optnat = lambda n, init, cb: sgd(natural_gradient, init, step_size=step_size, num_iters=n, callback=cb, mass=.001)
    natural_lls = optimize_and_lls(optnat)

    elbo_lists.append((standard_lls, natural_lls))

# visually compare the ELBO
plt.figure(figsize=(12, 8))
colors = ['b', 'k', 'g']
for col, ss, (stand_lls, nat_lls) in zip(colors, step_sizes, elbo_lists):
    plt.plot(np.arange(len(stand_lls)), stand_lls, '--',
             label="standard (adam, step-size = %2.2f)" % ss, alpha=.5, c=col)
    plt.plot(np.arange(len(nat_lls)), nat_lls, '-',
             label="natural (sgd, step-size = %2.2f)" % ss, c=col)

llrange = natural_lls.max() - natural_lls.min()
plt.ylim((natural_lls.max() - llrange * .1, natural_lls.max() + 10))
plt.xlabel("optimization iteration")
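# `natural_gradient` is defined elsewhere in this example. A hedged sketch
# (an assumption, not the example's actual helper) of how it can be obtained
# from the ordinary ELBO gradient when the variational family is a mean-field
# Gaussian with parameters [mean, log_std]: the Fisher information is diagonal,
# exp(-2 * log_std) on the mean block and 2 on the log_std block, so the natural
# gradient is just an elementwise rescaling of the ordinary gradient.
import autograd.numpy as np


def make_natural_gradient_sketch(gradient, D):
    """gradient: callable (params, t) -> dELBO/dparams; D: latent dimension."""
    def natural_gradient(params, t):
        log_std = params[D:]
        fisher_diag = np.concatenate([np.exp(-2.0 * log_std), 2.0 * np.ones(D)])
        return gradient(params, t) / fisher_diag
    return natural_gradient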
rng = random.PRNGKey(0)
in_shape = (-1, 784)
out_shape, net_params = net_init(rng, in_shape)


###
# Loss calculation, negative log likelihood
###
def loss(params, batch):
    inputs, targets = batch
    preds = net_apply(params, inputs)
    return -np.mean(preds * targets)


lr = 0.00025
# Use optimizers to set optimizer initialization and update functions
opt_init, opt_update, get_params = optimizers.sgd(1.0)  # optimizers.exponential_decay(lr, 1000, 0.95))

###
# Update step
###
from adacurv.jax.utils.cg import cg_solve_jax_hvp


def hvp(loss, params, batch, v):
    """Computes the hessian vector product Hv.

    This implementation uses forward-over-reverse mode for computing the hvp.

    Args:
      loss: function computing the loss with signature loss(params, batch).
      params: pytree for the parameters of the model.
      batch: A batch of data. Any format is fine as long as it is a valid input
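# The body of `hvp` is cut off above. A standard forward-over-reverse
# Hessian-vector product in JAX looks like the sketch below (a self-contained
# example, not necessarily the original function's exact body): differentiate
# the gradient of the loss along the direction v with jax.jvp.
import jax
import jax.numpy as jnp


def hvp_sketch(loss, params, batch, v):
    """Hv via forward-over-reverse: jvp of grad(loss) at params along v."""
    grad_fn = lambda p: jax.grad(loss)(p, batch)
    return jax.jvp(grad_fn, (params,), (v,))[1]


# Tiny check on a quadratic whose Hessian is 2 * diag(scale):
scale = jnp.array([1.0, 2.0, 3.0])
quad_loss = lambda p, batch: jnp.sum(scale * p ** 2)
print(hvp_sketch(quad_loss, jnp.ones(3), None, jnp.ones(3)))  # -> [2. 4. 6.]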
def __init__(self, x, c, y, n_words, batch_size, lr, init_emb, vocab_w_size, w_emb_dim, w_hidden_dim, c_emb_dim, c_hidden_dim, output_dim, vocab_c_size, window, opt):
    assert window % 2 == 1, 'Window size must be odd'

    """ input """
    self.x = x  # 1D: n_words * batch_size, 2D: window; elem=word id
    self.x_v = x.flatten()  # 1D: n_words * batch_size * window; elem=word id
    self.c = c  # 1D: n_words * batch_size, 2D: window, 3D: max_len_char, 4D: window; elem=char id
    self.y = y
    self.batch_size = batch_size
    self.n_words = n_words
    self.lr = lr

    n_phi = (w_emb_dim + c_hidden_dim) * window
    max_len_char = T.cast(self.c.shape[2], 'int32')

    """ params """
    if init_emb is not None:
        self.emb = theano.shared(init_emb)
    else:
        self.emb = theano.shared(sample_weights(vocab_w_size, w_emb_dim))

    self.pad = build_shared_zeros((1, c_emb_dim))
    self.e_c = theano.shared(sample_norm_dist(vocab_c_size - 1, c_emb_dim))
    self.emb_c = T.concatenate([self.pad, self.e_c], 0)

    self.W_in = theano.shared(sample_weights(n_phi, w_hidden_dim))
    self.W_c = theano.shared(sample_weights(c_emb_dim * window, c_hidden_dim))
    self.W_out = theano.shared(sample_weights(w_hidden_dim, output_dim))

    self.b_in = theano.shared(sample_weights(w_hidden_dim))
    self.b_c = theano.shared(sample_weights(c_hidden_dim))
    self.b_y = theano.shared(sample_weights(output_dim))

    self.params = [self.e_c, self.W_in, self.W_c, self.W_out, self.b_in, self.b_c, self.b_y]

    """ look up embedding """
    self.x_emb = self.emb[self.x_v]  # 1D: batch_size*n_words * window, 2D: emb_dim
    self.c_emb = self.emb_c[self.c]  # 1D: batch_size*n_words, 2D: window, 3D: max_len_char, 4D: window, 5D: n_c_emb
    self.x_emb_r = self.x_emb.reshape((x.shape[0], x.shape[1], -1))

    """ convolution """
    self.c_phi = T.max(T.dot(self.c_emb.reshape((batch_size * n_words, window, max_len_char, -1)),
                             self.W_c) + self.b_c, 2)  # 1D: n_words, 2D: window, 3D: n_h_c
    self.x_phi = T.concatenate([self.x_emb_r, self.c_phi], axis=2)

    """ forward """
    self.h = relu(T.dot(self.x_phi.reshape((batch_size * n_words, n_phi)), self.W_in) + self.b_in)
    self.o = T.dot(self.h, self.W_out) + self.b_y
    self.p_y_given_x = T.nnet.softmax(self.o)

    """ predict """
    self.y_pred = T.argmax(self.o, axis=1)
    self.result = T.eq(self.y_pred, self.y)

    """ loss """
    self.log_p = T.log(self.p_y_given_x)[T.arange(batch_size * n_words), self.y]
    self.nll = -T.sum(self.log_p)
    self.cost = self.nll

    if opt == 'sgd':
        self.updates = sgd(self.cost, self.params, self.emb, self.x_emb, self.lr)
    else:
        self.updates = ada_grad(self.cost, self.params, self.emb, self.x_emb, self.x, self.lr)
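# A small NumPy sketch (illustration only, not part of the model code) of the
# character feature computed by `self.c_phi` above: project each character's
# windowed embedding and max-pool over character positions, yielding one
# c_hidden_dim vector per (token, window slot).
import numpy as np


def char_feature_sketch(c_emb, W_c, b_c):
    """c_emb: (n_tokens, window, max_len_char, c_emb_dim * window)
    W_c:   (c_emb_dim * window, c_hidden_dim)
    b_c:   (c_hidden_dim,)
    Returns: (n_tokens, window, c_hidden_dim)."""
    projected = c_emb @ W_c + b_c  # (n_tokens, window, max_len_char, c_hidden_dim)
    return projected.max(axis=2)   # max over character positions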