def sgd_updates_adadelta(self, params, cost, rho=0.95, epsilon=1e-6,
                         norm_lim=9, word_vec_name='Words'):
    """
    Adadelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta)
    """
    updates = OrderedDict({})
    exp_sqr_grads = OrderedDict({})
    exp_sqr_ups = OrderedDict({})
    gparams = []
    for param in params:
        empty = np.zeros_like(param.get_value())
        exp_sqr_grads[param] = shared_common(as_floatX(empty), "exp_grad_%s" % param.name)
        gp = T.grad(cost, param)
        exp_sqr_ups[param] = shared_common(as_floatX(empty), "exp_ups_%s" % param.name)
        gparams.append(gp)
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        # decaying average of squared gradients
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        # Adadelta step: scale the gradient by RMS[delta x] / RMS[g]
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # clip column norms of non-embedding matrices to sqrt(norm_lim)
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
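# A minimal NumPy sketch of the same Adadelta rule, written outside Theano to make the
# update equations explicit. The names (grad_acc, delta_acc) are illustrative only and
# not part of the original code.
import numpy as np

def adadelta_step_np(param, grad, grad_acc, delta_acc, rho=0.95, epsilon=1e-6):
    """One Adadelta update on plain NumPy arrays; returns the new param and state."""
    grad_acc = rho * grad_acc + (1 - rho) * grad ** 2                # E[g^2]
    step = -np.sqrt(delta_acc + epsilon) / np.sqrt(grad_acc + epsilon) * grad
    delta_acc = rho * delta_acc + (1 - rho) * step ** 2              # E[dx^2]
    return param + step, grad_acc, delta_acc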
def __init__(self, n_in, n_hidden, n_out, activation=T.tanh,
             inner_activation=T.nnet.sigmoid, output_type='real', batch_size=200):
    self.activation = activation
    self.inner_activation = inner_activation
    self.output_type = output_type
    self.batch_size = batch_size
    self.n_hidden = n_hidden
    # input gate
    self.W_i = shared_common(gloroat_uniform((n_in, n_hidden)))
    self.U_i = shared_common(ortho_weight(n_hidden))
    self.b_i = shared_zeros((n_hidden, ))
    # forget gate
    self.W_f = shared_common(gloroat_uniform((n_in, n_hidden)))
    self.U_f = shared_common(ortho_weight(n_hidden))
    self.b_f = shared_zeros((n_hidden, ))
    # cell (candidate memory)
    self.W_c = shared_common(gloroat_uniform((n_in, n_hidden)))
    self.U_c = shared_common(ortho_weight(n_hidden))
    self.b_c = shared_zeros((n_hidden, ))
    # output gate
    self.W_o = shared_common(gloroat_uniform((n_in, n_hidden)))
    self.U_o = shared_common(ortho_weight(n_hidden))
    self.b_o = shared_zeros((n_hidden, ))
    self.params = [
        self.W_i, self.U_i, self.b_i,
        self.W_c, self.U_c, self.b_c,
        self.W_f, self.U_f, self.b_f,
        self.W_o, self.U_o, self.b_o
    ]
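# The parameter blocks above follow the standard LSTM gate layout. For reference, here
# is one LSTM step in plain NumPy using the same W/U/b naming; the class's real step
# function lives elsewhere in the repo, so treat this as a generic sketch of the
# standard formulation, not a copy of it.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step_np(x_t, h_prev, c_prev, p):
    """p is a dict holding W_i, U_i, b_i, ..., W_o, U_o, b_o as NumPy arrays."""
    i = sigmoid(x_t @ p['W_i'] + h_prev @ p['U_i'] + p['b_i'])       # input gate
    f = sigmoid(x_t @ p['W_f'] + h_prev @ p['U_f'] + p['b_f'])       # forget gate
    c_tilde = np.tanh(x_t @ p['W_c'] + h_prev @ p['U_c'] + p['b_c'])  # candidate cell
    c = f * c_prev + i * c_tilde                                      # new cell state
    o = sigmoid(x_t @ p['W_o'] + h_prev @ p['U_o'] + p['b_o'])       # output gate
    h = o * np.tanh(c)                                                # new hidden state
    return h, c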
def __init__(self, rng, n_in, n_out, tensor_num=3, activation=T.tanh):
    self.tensor_num = tensor_num
    self.W = []
    for i in range(tensor_num):
        self.W.append(shared_common(ortho_weight(100)))
    self.activation = activation
    self.hidden_layer = HiddenLayer2(rng, tensor_num * 5 * n_in, n_out)
    self.params = self.W + self.hidden_layer.params
def __init__(self, input_value, n_in, n_out, rng):
    # softmax classifier initialised with a uniform Glorot-style range
    self.W = shared_common(np.asarray(
        rng.uniform(
            low=-np.sqrt(6. / (n_in + n_out)),
            high=np.sqrt(6. / (n_in + n_out)),
            size=(n_in, n_out)
        ),
        dtype=floatX))
    self.b = shared_zeros(n_out, 'b')
    self.predict_prob = T.nnet.softmax(T.dot(input_value, self.W) + self.b)
    self.predict_y = T.argmax(self.predict_prob, axis=1)
    self.params = [self.W, self.b]
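# For reference, the forward computation above in plain NumPy: class probabilities are a
# softmax over an affine transform, and the loss used later (negative_log_likelihood) is
# the mean cross-entropy. Function and array names here are illustrative only.
import numpy as np

def softmax_np(z):
    z = z - z.max(axis=1, keepdims=True)        # subtract row max for numerical stability
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def logreg_forward_np(x, W, b, y=None):
    prob = softmax_np(x @ W + b)                # predict_prob
    pred = prob.argmax(axis=1)                  # predict_y
    nll = None
    if y is not None:
        nll = -np.log(prob[np.arange(len(y)), y]).mean()   # negative log-likelihood
    return prob, pred, nll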
def __init__(self, rng, n_in, n_out, W=None, b=None, activation=T.tanh, hidden_size=100):
    self.W = shared_common(ortho_weight(hidden_size), 'W')
    self.activation = activation
    self.hidden_layer = HiddenLayer2(rng, 2 * 5 * n_in, n_out)
    self.params = [self.W] + self.hidden_layer.params
def __init__(self, input_l, input_r, n_in, n_hidden, n_out, activation=T.tanh,
             output_type='real', batch_size=200, input_lm=None, input_rm=None):
    if input_lm is None:
        input_lm = shared_ones((batch_size, 20))
    if input_rm is None:
        input_rm = shared_ones((batch_size, 20))
    self.activation = activation
    self.output_type = output_type
    self.W = shared_common(ortho_weight(n_hidden), 'W')
    self.W_in = shared_common(gloroat_uniform((n_in, n_hidden)), 'W_in')
    self.h0 = shared_zeros((batch_size, n_hidden), 'h0')
    self.bh = shared_zeros((batch_size, n_hidden), 'bh')
    self.params = [self.W, self.W_in, self.bh]

    def step(x_t, mask, h_tm1):
        # zero the previous hidden state on padded positions, then apply a
        # vanilla tanh recurrence
        h_tm1 = mask * h_tm1
        h_t = T.tanh(T.dot(x_t, self.W_in) + T.dot(h_tm1, self.W) + self.bh)
        return h_t

    self.h_l, _ = scan_dimshuffle(step, input_l, input_lm,
                                  shared_zeros((batch_size, n_hidden)))
    self.h_r, _ = scan_dimshuffle(step, input_r, input_rm,
                                  shared_zeros((batch_size, n_hidden)))
    # back to (batch, time, hidden)
    self.h_l = self.h_l.dimshuffle(1, 0, 2)
    self.h_r = self.h_r.dimshuffle(1, 0, 2)
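# The recurrence above in plain NumPy, to make the masking explicit: at each time step
# the previous hidden state is multiplied by the mask (zeroing it on padded positions)
# before the tanh update. Shapes and names below are illustrative assumptions.
import numpy as np

def masked_rnn_np(x, mask, W_in, W, bh):
    """x: (time, batch, n_in); mask: (time, batch, 1) of 0/1 values."""
    h = np.zeros((x.shape[1], W.shape[0]))           # (batch, n_hidden)
    hs = []
    for t in range(x.shape[0]):
        h = mask[t] * h                              # reset state where mask is 0
        h = np.tanh(x[t] @ W_in + h @ W + bh)
        hs.append(h)
    return np.stack(hs, axis=1)                      # (batch, time, n_hidden)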
def __init__(self, rng, n_in, n_out, W=None, b=None, activation=T.tanh, hidden_size=100):
    self.W = shared_common(ortho_weight(hidden_size))
    self.activation = activation
    self.conv_layer = LeNetConvPoolLayer(rng, filter_shape=(8, 1, 3, 3),
                                         image_shape=(200, 1, 50, 50),
                                         poolsize=(3, 3), non_linear='relu')
    self.hidden_layer = HiddenLayer2(rng, 2048, n_out)
    self.params = self.conv_layer.params + self.hidden_layer.params
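# ConvSim's forward pass is defined elsewhere; in the SMN architecture the layer is fed
# both raw word embeddings and recurrent hidden states of an utterance/response pair,
# which suggests matching matrices of the following form. This is a hedged NumPy sketch
# of that idea, not a copy of the layer's actual __call__.
import numpy as np

def matching_matrices_np(u_emb, r_emb, u_hid, r_hid, A):
    """u_emb/r_emb: (len_u, d) and (len_r, d) word embeddings;
    u_hid/r_hid: (len_u, h) and (len_r, h) recurrent states; A: (h, h) bilinear weight."""
    m1 = u_emb @ r_emb.T            # word-word dot-product similarity
    m2 = u_hid @ A @ r_hid.T        # bilinear similarity between hidden states
    return np.stack([m1, m2])       # two "channels" for a small CNN to consume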
def predict(
        datasets,
        U,  # pre-trained word embeddings
        n_epochs=5,
        batch_size=20,
        max_l=100,
        hidden_size=100,
        word_embedding_size=100,
        block_size=50,
        session_hidden_size=50,
        session_input_size=50,
        model_name='SMN/data/model_4.pkl',
        result_file='SMN/data/result_4.txt'):
    """
    Run a trained model over the dev set and write the positive-class probability of
    each candidate response to `result_file`, one score per line; the mean loss is
    printed at the end.
    """
    hiddensize = hidden_size
    U = U.astype(dtype=floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l, max_l
    sessionmask = T.matrix()
    lx = []
    lxmask = []
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())
    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = shared_common(U, "Words")
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(
            Words[T.cast(lx[i].flatten(), dtype="int32")].reshape(
                (lx[i].shape[0], lx[i].shape[1], Words.shape[1])))
    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1], Words.shape[1]))
    # input: word embeddings of the mini batch

    dev_set, test_set = datasets[1], datasets[2]

    q_embedding = []
    offset = 2 * lsize

    val_set_lx = []
    val_set_lx_mask = []
    for i in range(max_turn):
        val_set_lx.append(
            shared_common(
                np.asarray(dev_set[:, offset * i:offset * i + lsize],
                           dtype=floatX)))
        val_set_lx_mask.append(
            shared_common(
                np.asarray(dev_set[:, offset * i + lsize:offset * i + 2 * lsize],
                           dtype=floatX)))

    val_set_rx = shared_common(
        np.asarray(dev_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=floatX))
    val_set_rx_mask = shared_common(
        np.asarray(dev_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=floatX))
    val_set_session_mask = shared_common(
        np.asarray(dev_set[:, -max_turn - 1:-1], dtype=floatX))
    val_set_y = shared_common(np.asarray(dev_set[:, -1], dtype="int32"))

    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index * batch_size:(index + 1) * batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index * batch_size:(index + 1) * batch_size]
    val_dic[rx] = val_set_rx[index * batch_size:(index + 1) * batch_size]
    val_dic[sessionmask] = val_set_session_mask[index * batch_size:(index + 1) * batch_size]
    val_dic[rxmask] = val_set_rx_mask[index * batch_size:(index + 1) * batch_size]
    val_dic[y] = val_set_y[index * batch_size:(index + 1) * batch_size]

    sentence2vec = GRU(n_in=word_embedding_size, n_hidden=hiddensize, n_out=hiddensize)
    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng, max_l, session_input_size, hidden_size=hiddensize)

    poolingoutput = []
    # compiled but never used below (appears to be leftover debug code)
    test = theano.function([index],
                           pooling_layer(llayer0_input[-4], rlayer0_input,
                                         q_embedding[i], r_embedding),
                           givens=val_dic,
                           on_unused_input='ignore')

    for i in range(max_turn):
        poolingoutput.append(
            pooling_layer(llayer0_input[i], rlayer0_input, q_embedding[i],
                          r_embedding))

    session2vec = GRU(n_in=session_input_size, n_hidden=session_hidden_size,
                      n_out=session_hidden_size)
    res = session2vec(T.stack(poolingoutput, 1), sessionmask)
    classifier = LogisticRegression(res, session_hidden_size, 2, rng)
    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)
    opt = Adam()
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words]
    load_params(params, model_name)

    predict = classifier.predict_prob
    val_model = theano.function([index], [y, predict, cost, error],
                                givens=val_dic,
                                on_unused_input='ignore')

    with open(result_file, 'w') as f:
        loss = 0.
        for minibatch_index in range(int(datasets[1].shape[0] / batch_size)):
            a, b, c, d = val_model(minibatch_index)
            loss += c
            f.write(str(list(b[:, 1]))[1:-1].replace(', ', '\n') + '\n')
        print(loss / (datasets[1].shape[0] / batch_size))
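# The loop above writes one positive-class probability per candidate response to
# result_file. If, as in the usual retrieval setup, every context comes with a fixed
# block of candidates (10 is assumed here) and the first candidate in each block is the
# true response, R_n@k can be computed from that file alone. The path, group size, and
# positive-first convention are assumptions for illustration; adjust them to your data.
def recall_at_k(result_path, k=1, candidates_per_context=10):
    with open(result_path) as f:
        scores = [float(line) for line in f if line.strip()]
    hits, total = 0, 0
    for start in range(0, len(scores), candidates_per_context):
        group = scores[start:start + candidates_per_context]
        if len(group) < candidates_per_context:
            break  # ignore a trailing partial block
        ranked = sorted(range(len(group)), key=lambda j: group[j], reverse=True)
        hits += int(0 in ranked[:k])   # index 0 assumed to be the ground-truth response
        total += 1
    return hits / max(total, 1)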
def train(
        datasets,
        U,  # pre-trained word embeddings
        n_epochs=3,
        batch_size=20,
        max_l=100,
        hidden_size=100,
        word_embedding_size=100,
        session_hidden_size=50,
        session_input_size=50,
        model_name='SMN/data/model_11',
        exicted_model=None):  # path of an existing checkpoint to warm-start from
    hiddensize = hidden_size
    U = U.astype(dtype=floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l, max_l
    sessionmask = T.matrix()
    lx = []
    lxmask = []
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())
    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = shared_common(U, "Words")
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(
            Words[T.cast(lx[i].flatten(), dtype="int32")].reshape(
                (lx[i].shape[0], lx[i].shape[1], Words.shape[1])))
    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1], Words.shape[1]))
    # input: word embeddings of the mini batch

    train_set, dev_set, test_set = datasets[0], datasets[1], datasets[2]

    train_set_lx = []
    train_set_lx_mask = []
    q_embedding = []
    offset = 2 * lsize
    for i in range(max_turn):
        train_set_lx.append(
            shared_common(
                np.asarray(train_set[:, offset * i:offset * i + lsize],
                           dtype=floatX)))
        train_set_lx_mask.append(
            shared_common(
                np.asarray(train_set[:, offset * i + lsize:offset * i + 2 * lsize],
                           dtype=floatX)))
    train_set_rx = shared_common(
        np.asarray(train_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=floatX))
    train_set_rx_mask = shared_common(
        np.asarray(train_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=floatX))
    train_set_session_mask = shared_common(
        np.asarray(train_set[:, -max_turn - 1:-1], dtype=floatX))
    train_set_y = shared_common(np.asarray(train_set[:, -1], dtype="int32"))

    val_set_lx = []
    val_set_lx_mask = []
    for i in range(max_turn):
        val_set_lx.append(
            shared_common(
                np.asarray(dev_set[:, offset * i:offset * i + lsize],
                           dtype=floatX)))
        val_set_lx_mask.append(
            shared_common(
                np.asarray(dev_set[:, offset * i + lsize:offset * i + 2 * lsize],
                           dtype=floatX)))
    val_set_rx = shared_common(
        np.asarray(dev_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=floatX))
    val_set_rx_mask = shared_common(
        np.asarray(dev_set[:, offset * max_turn + lsize:offset * max_turn + 2 * lsize],
                   dtype=floatX))
    val_set_session_mask = shared_common(
        np.asarray(dev_set[:, -max_turn - 1:-1], dtype=floatX))
    val_set_y = shared_common(np.asarray(dev_set[:, -1], dtype="int32"))

    dic = {}
    for i in range(max_turn):
        dic[lx[i]] = train_set_lx[i][index * batch_size:(index + 1) * batch_size]
        dic[lxmask[i]] = train_set_lx_mask[i][index * batch_size:(index + 1) * batch_size]
    dic[rx] = train_set_rx[index * batch_size:(index + 1) * batch_size]
    dic[sessionmask] = train_set_session_mask[index * batch_size:(index + 1) * batch_size]
    dic[rxmask] = train_set_rx_mask[index * batch_size:(index + 1) * batch_size]
    dic[y] = train_set_y[index * batch_size:(index + 1) * batch_size]

    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index * batch_size:(index + 1) * batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index * batch_size:(index + 1) * batch_size]
    val_dic[rx] = val_set_rx[index * batch_size:(index + 1) * batch_size]
    val_dic[sessionmask] = val_set_session_mask[index * batch_size:(index + 1) * batch_size]
    val_dic[rxmask] = val_set_rx_mask[index * batch_size:(index + 1) * batch_size]
    val_dic[y] = val_set_y[index * batch_size:(index + 1) * batch_size]

    sentence2vec = GRU(n_in=word_embedding_size, n_hidden=hiddensize, n_out=hiddensize)
    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng, max_l, session_input_size, hidden_size=hiddensize)

    poolingoutput = []
    # compiled but never used below (appears to be leftover debug code)
    test = theano.function([index],
                           pooling_layer(llayer0_input[-4], rlayer0_input,
                                         q_embedding[i], r_embedding),
                           givens=val_dic,
                           on_unused_input='ignore')

    for i in range(max_turn):
        poolingoutput.append(
            pooling_layer(llayer0_input[i], rlayer0_input, q_embedding[i],
                          r_embedding))

    session2vec = GRU(n_in=session_input_size, n_hidden=session_hidden_size,
                      n_out=session_hidden_size)
    res = session2vec(T.stack(poolingoutput, 1), sessionmask)
    classifier = LogisticRegression(res, session_hidden_size, 2, rng)
    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)
    opt = Adam()
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words]
    if exicted_model is not None:
        load_params(params, exicted_model)

    grad_updates = opt.Adam(
        cost=cost, params=params,
        lr=0.001)  # opt.sgd_updates_adadelta(params, cost, lr_decay, 1e-8, sqr_norm_lim)

    train_model = theano.function([index], cost, updates=grad_updates,
                                  givens=dic, on_unused_input='ignore')
    val_model = theano.function([index], [cost, error],
                                givens=val_dic, on_unused_input='ignore')

    best_dev = 1.
    n_train_batches = int(datasets[0].shape[0] / batch_size)
    for i in range(n_epochs):
        cost = 0
        total = 0.
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            batch_cost = train_model(minibatch_index)
            total += 1
            cost += batch_cost
            if not total % 50:
                print(total, cost / total)
        cost = cost / n_train_batches
        print("epoch %d loss %f" % (i, cost))

        cost = 0
        errors = 0
        j = 0
        for minibatch_index in range(int(datasets[1].shape[0] / batch_size)):
            tcost, terr = val_model(minibatch_index)
            cost += tcost
            errors += terr
            j += 1
        if not j:
            j = 1
        cost /= j
        errors /= j
        if cost < best_dev:
            best_dev = cost
            temp_model_name = model_name + str(i) + '.pkl'
            save_params(params, temp_model_name)
            correct = test_model(model_name=temp_model_name)
            print("epoch %d dev_correct %f" % (i, float(correct)))
        print("epoch %d dev_loss %f" % (i, cost))
        print("epoch %d dev_accuracy %f" % (i, 1 - errors))
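# A minimal driver sketch for the two entry points above. The pickle path, the layout of
# the pickled object, and the checkpoint/result file names are assumptions for
# illustration; substitute whatever your preprocessing step actually produces (a
# `datasets` tuple of train/dev/test matrices plus the embedding matrix `U`).
import pickle

if __name__ == '__main__':
    with open('SMN/data/dataset.pkl', 'rb') as f:        # hypothetical path
        datasets, U = pickle.load(f)
    train(datasets, U, n_epochs=3, batch_size=20, max_l=100,
          model_name='SMN/data/model_11', exicted_model=None)
    predict(datasets, U, batch_size=20, max_l=100,
            model_name='SMN/data/model_110.pkl',         # e.g. the epoch-0 checkpoint
            result_file='SMN/data/result_4.txt')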