# Assumes numpy as np, theano.tensor as T, collections.OrderedDict, and the
# repo helpers shared_common / as_floatX are in scope.
def sgd_updates_adadelta(self, params, cost, rho=0.95, epsilon=1e-6, norm_lim=9, word_vec_name='Words'):
    """
    AdaDelta update rule, mostly from
    https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4
    """
    updates = OrderedDict()
    exp_sqr_grads = OrderedDict()
    exp_sqr_ups = OrderedDict()
    gparams = []
    for param in params:
        empty = np.zeros_like(param.get_value())
        # separate accumulators (and names) for the squared gradients and squared updates
        exp_sqr_grads[param] = shared_common(as_floatX(empty), "exp_grad_%s" % param.name)
        exp_sqr_ups[param] = shared_common(as_floatX(empty), "exp_ups_%s" % param.name)
        gparams.append(T.grad(cost, param))
    for param, gp in zip(params, gparams):
        exp_sg = exp_sqr_grads[param]
        exp_su = exp_sqr_ups[param]
        up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp)
        updates[exp_sg] = up_exp_sg
        step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp
        updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step)
        stepped_param = param + step
        # clip column norms of weight matrices, but never rescale the embedding matrix
        if (param.get_value(borrow=True).ndim == 2) and (param.name != word_vec_name):
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param
    return updates
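A minimal usage sketch, assuming Theano is installed and the function above is in scope. The stand-ins for shared_common / as_floatX exist only so the snippet runs on its own; the repo's real helpers live elsewhere.

import numpy as np
import theano
import theano.tensor as T
from collections import OrderedDict  # needed by sgd_updates_adadelta itself

def as_floatX(v):                     # stand-in for the repo helper
    return np.asarray(v, dtype=theano.config.floatX)

def shared_common(value, name=None):  # stand-in for the repo helper
    return theano.shared(value, name=name)

x = T.matrix('x')
w = theano.shared(as_floatX(np.random.randn(3, 1)), name='w')
cost = T.sum(T.sqr(T.dot(x, w)))      # toy quadratic cost
updates = sgd_updates_adadelta(None, [w], cost)  # self is unused by the method body
train_step = theano.function([x], cost, updates=updates)
print(train_step(as_floatX(np.ones((2, 3)))))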
Example #2
    def __init__(self,
                 n_in,
                 n_hidden,
                 n_out,
                 activation=T.tanh,
                 inner_activation=T.nnet.sigmoid,
                 output_type='real',
                 batch_size=200):
        self.activation = activation
        self.inner_activation = inner_activation
        self.output_type = output_type

        self.batch_size = batch_size
        self.n_hidden = n_hidden

        self.W_i = shared_common(gloroat_uniform((n_in, n_hidden)))
        self.U_i = shared_common(ortho_weight(n_hidden))
        self.b_i = shared_zeros((n_hidden, ))

        self.W_f = shared_common(gloroat_uniform((n_in, n_hidden)))
        self.U_f = shared_common(ortho_weight(n_hidden))
        self.b_f = shared_zeros((n_hidden, ))

        self.W_c = shared_common(gloroat_uniform((n_in, n_hidden)))
        self.U_c = shared_common(ortho_weight(n_hidden))
        self.b_c = shared_zeros((n_hidden, ))

        self.W_o = shared_common(gloroat_uniform((n_in, n_hidden)))
        self.U_o = shared_common(ortho_weight(n_hidden))
        self.b_o = shared_zeros((n_hidden, ))

        self.params = [
            self.W_i, self.U_i, self.b_i, self.W_c, self.U_c, self.b_c,
            self.W_f, self.U_f, self.b_f, self.W_o, self.U_o, self.b_o
        ]
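    These twelve parameters are the standard LSTM input, forget, cell, and output gates. The actual recurrence is not part of this extract; below is a hedged sketch of the step function these weights imply (the name lstm_step is hypothetical).

    # Sketch only; the repo's real step function is defined elsewhere.
    def lstm_step(self, x_t, h_tm1, c_tm1):
        i = self.inner_activation(T.dot(x_t, self.W_i) + T.dot(h_tm1, self.U_i) + self.b_i)
        f = self.inner_activation(T.dot(x_t, self.W_f) + T.dot(h_tm1, self.U_f) + self.b_f)
        o = self.inner_activation(T.dot(x_t, self.W_o) + T.dot(h_tm1, self.U_o) + self.b_o)
        c_t = f * c_tm1 + i * self.activation(T.dot(x_t, self.W_c) + T.dot(h_tm1, self.U_c) + self.b_c)
        h_t = o * self.activation(c_t)
        return h_t, c_t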
Example #3

    def __init__(self, rng, n_in, n_out, tensor_num=3, activation=T.tanh):
        self.tensor_num = tensor_num
        self.W = []
        for i in range(tensor_num):
            self.W.append(shared_common(ortho_weight(100)))
        self.activation = activation
        self.hidden_layer = HiddenLayer2(rng, tensor_num * 5 * n_in, n_out)

        self.params = self.W + self.hidden_layer.params
Example #4
    def __init__(self, input_value, n_in, n_out, rng):
        self.W = shared_common(np.asarray(
            rng.uniform(
                low=-np.sqrt(6. / (n_in + n_out)),
                high=np.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)
            ), dtype=floatX))
        self.b = shared_zeros(n_out, 'b')
        self.predict_prob = T.nnet.softmax(T.dot(input_value, self.W) + self.b)
        self.predict_y = T.argmax(self.predict_prob, axis=1)
        self.params = [self.W, self.b]
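    The predict and train functions below call negative_log_likelihood and errors on this classifier, but those methods are missing from the extract. A minimal sketch following the standard Theano logistic-regression recipe (the bodies are an assumption; the names come from the call sites below):

    def negative_log_likelihood(self, y):
        # mean negative log-probability of the true labels
        return -T.mean(T.log(self.predict_prob)[T.arange(y.shape[0]), y])

    def errors(self, y):
        # fraction of examples in the minibatch that are misclassified
        return T.mean(T.neq(self.predict_y, y))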
Example #5

    def __init__(self,
                 rng,
                 n_in,
                 n_out,
                 W=None,
                 b=None,
                 activation=T.tanh,
                 hidden_size=100):
        self.W = shared_common(ortho_weight(hidden_size), 'W')
        self.activation = activation
        self.hidden_layer = HiddenLayer2(rng, 2 * 5 * n_in, n_out)

        self.params = [self.W] + self.hidden_layer.params
Example #6
    def __init__(self,
                 input_l,
                 input_r,
                 n_in,
                 n_hidden,
                 n_out,
                 activation=T.tanh,
                 output_type='real',
                 batch_size=200,
                 input_lm=None,
                 input_rm=None):
        if input_lm is None:
            input_lm = shared_ones((batch_size, 20))
        if input_rm is None:
            input_rm = shared_ones((batch_size, 20))
        self.activation = activation
        self.output_type = output_type

        self.W = shared_common(ortho_weight(n_hidden), 'W')
        self.W_in = shared_common(gloroat_uniform((n_in, n_hidden)), 'W_in')

        self.h0 = shared_zeros((batch_size, n_hidden), 'h0')
        self.bh = shared_zeros((batch_size, n_hidden), 'bh')

        self.params = [self.W, self.W_in, self.bh]

        def step(x_t, mask, h_tm1):
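            # a zero mask entry resets the recurrent state at padded time steps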
            h_tm1 = mask * h_tm1
            h_t = T.tanh(
                T.dot(x_t, self.W_in) + T.dot(h_tm1, self.W) + self.bh)
            return h_t

        self.h_l, _ = scan_dimshuffle(step, input_l, input_lm,
                                      shared_zeros((batch_size, n_hidden)))
        self.h_r, _ = scan_dimshuffle(step, input_r, input_rm,
                                      shared_zeros((batch_size, n_hidden)))
        self.h_l = self.h_l.dimshuffle(1, 0, 2)
        self.h_r = self.h_r.dimshuffle(1, 0, 2)
Example #7

    def __init__(self,
                 rng,
                 n_in,
                 n_out,
                 W=None,
                 b=None,
                 activation=T.tanh,
                 hidden_size=100):
        self.W = shared_common(ortho_weight(hidden_size))
        self.activation = activation

        self.conv_layer = LeNetConvPoolLayer(rng,
                                             filter_shape=(8, 1, 3, 3),
                                             image_shape=(200, 1, 50, 50),
                                             poolsize=(3, 3),
                                             non_linear='relu')

        self.hidden_layer = HiddenLayer2(rng, 2048, n_out)
        self.params = self.conv_layer.params + self.hidden_layer.params
Example #8
def predict(
        datasets,
        U,  # pre-trained word embeddings
        n_epochs=5,
        batch_size=20,
        max_l=100,
        hidden_size=100,
        word_embedding_size=100,
        block_size=50,
        session_hidden_size=50,
        session_input_size=50,
        model_name='SMN/data/model_4.pkl',
        result_file='SMN/data/result_4.txt'):
    """
    return: a list of dicts of lists, each list contains (ansId, groundTruth, prediction) for a question
    """
    hiddensize = hidden_size
    U = U.astype(dtype=floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l, max_l

    sessionmask = T.matrix()
    lx = []
    lxmask = []
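    # max_turn (the number of context turns) is a module-level constant defined elsewhere in the repo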
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())

    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = shared_common(U, "Words")
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(Words[T.cast(lx[i].flatten(),
                                          dtype="int32")].reshape(
                                              (lx[i].shape[0], lx[i].shape[1],
                                               Words.shape[1])))

    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1],
         Words.shape[1]))  # input: word embeddings of the mini batch

    dev_set, test_set = datasets[1], datasets[2]

    q_embedding = []
    offset = 2 * lsize
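    # each turn occupies 2 * lsize columns of a data row: lsize word ids, then lsize mask values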

    val_set_lx = []
    val_set_lx_mask = []
    for i in range(max_turn):
        val_set_lx.append(
            shared_common(
                np.asarray(dev_set[:, offset * i:offset * i + lsize],
                           dtype=floatX)))
        val_set_lx_mask.append(
            shared_common(
                np.asarray(dev_set[:,
                                   offset * i + lsize:offset * i + 2 * lsize],
                           dtype=floatX)))

    val_set_rx = shared_common(
        np.asarray(dev_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=floatX))
    val_set_rx_mask = shared_common(
        np.asarray(dev_set[:, offset * max_turn + lsize:offset * max_turn +
                           2 * lsize],
                   dtype=floatX))
    val_set_session_mask = shared_common(
        np.asarray(dev_set[:, -max_turn - 1:-1], dtype=floatX))
    val_set_y = shared_common(np.asarray(dev_set[:, -1], dtype="int32"))

    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index * batch_size:(index + 1) *
                                       batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index *
                                                batch_size:(index + 1) *
                                                batch_size]
    val_dic[rx] = val_set_rx[index * batch_size:(index + 1) * batch_size]
    val_dic[sessionmask] = val_set_session_mask[index *
                                                batch_size:(index + 1) *
                                                batch_size]
    val_dic[rxmask] = val_set_rx_mask[index * batch_size:(index + 1) *
                                      batch_size]
    val_dic[y] = val_set_y[index * batch_size:(index + 1) * batch_size]

    sentence2vec = GRU(n_in=word_embedding_size,
                       n_hidden=hiddensize,
                       n_out=hiddensize)

    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng,
                            max_l,
                            session_input_size,
                            hidden_size=hiddensize)

    poolingoutput = []

    for i in range(max_turn):
        poolingoutput.append(
            pooling_layer(llayer0_input[i], rlayer0_input, q_embedding[i],
                          r_embedding))

    session2vec = GRU(n_in=session_input_size,
                      n_hidden=session_hidden_size,
                      n_out=session_hidden_size)
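    # stack the per-turn matching vectors along axis 1 and encode the turn sequence with the session-level GRU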
    res = session2vec(T.stack(poolingoutput, 1), sessionmask)
    classifier = LogisticRegression(res, session_hidden_size, 2, rng)

    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words]

    load_params(params, model_name)

    predict_prob = classifier.predict_prob

    val_model = theano.function([index], [y, predict_prob, cost, error],
                                givens=val_dic,
                                on_unused_input='ignore')
    with open(result_file, 'w') as f:
        loss = 0.
        for minibatch_index in range(int(datasets[1].shape[0] / batch_size)):
            _y, probs, batch_loss, _err = val_model(minibatch_index)
            loss += batch_loss
            # one positive-class probability per line
            f.write(str(list(probs[:, 1]))[1:-1].replace(', ', '\n') + '\n')
        print(loss / (datasets[1].shape[0] / batch_size))
Example #9
def train(
        datasets,
        U,  # pre-trained word embeddings
        n_epochs=3,
        batch_size=20,
        max_l=100,
        hidden_size=100,
        word_embedding_size=100,
        session_hidden_size=50,
        session_input_size=50,
        model_name='SMN/data/model_11',
        existing_model=None):
    hiddensize = hidden_size
    U = U.astype(dtype=floatX)
    rng = np.random.RandomState(3435)
    lsize, rsize = max_l, max_l
    sessionmask = T.matrix()
    lx = []
    lxmask = []
    for i in range(max_turn):
        lx.append(T.matrix())
        lxmask.append(T.matrix())

    index = T.lscalar()
    rx = T.matrix('rx')
    rxmask = T.matrix()
    y = T.ivector('y')
    Words = shared_common(U, "Words")
    llayer0_input = []
    for i in range(max_turn):
        llayer0_input.append(Words[T.cast(lx[i].flatten(),
                                          dtype="int32")].reshape(
                                              (lx[i].shape[0], lx[i].shape[1],
                                               Words.shape[1])))

    rlayer0_input = Words[T.cast(rx.flatten(), dtype="int32")].reshape(
        (rx.shape[0], rx.shape[1],
         Words.shape[1]))  # input: word embeddings of the mini batch

    train_set, dev_set, test_set = datasets[0], datasets[1], datasets[2]

    train_set_lx = []
    train_set_lx_mask = []
    q_embedding = []
    offset = 2 * lsize
    for i in range(max_turn):
        train_set_lx.append(
            shared_common(
                np.asarray(train_set[:, offset * i:offset * i + lsize],
                           dtype=floatX)))
        train_set_lx_mask.append(
            shared_common(
                np.asarray(train_set[:, offset * i + lsize:offset * i +
                                     2 * lsize],
                           dtype=floatX)))
    train_set_rx = shared_common(
        np.asarray(train_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=floatX))
    train_set_rx_mask = shared_common(
        np.asarray(train_set[:, offset * max_turn + lsize:offset * max_turn +
                             2 * lsize],
                   dtype=floatX))
    train_set_session_mask = shared_common(
        np.asarray(train_set[:, -max_turn - 1:-1], dtype=floatX))
    train_set_y = shared_common(np.asarray(train_set[:, -1], dtype="int32"))

    val_set_lx = []
    val_set_lx_mask = []
    for i in range(max_turn):
        val_set_lx.append(
            shared_common(
                np.asarray(dev_set[:, offset * i:offset * i + lsize],
                           dtype=floatX)))
        val_set_lx_mask.append(
            shared_common(
                np.asarray(dev_set[:,
                                   offset * i + lsize:offset * i + 2 * lsize],
                           dtype=floatX)))

    val_set_rx = shared_common(
        np.asarray(dev_set[:, offset * max_turn:offset * max_turn + lsize],
                   dtype=floatX))
    val_set_rx_mask = shared_common(
        np.asarray(dev_set[:, offset * max_turn + lsize:offset * max_turn +
                           2 * lsize],
                   dtype=floatX))
    val_set_session_mask = shared_common(
        np.asarray(dev_set[:, -max_turn - 1:-1], dtype=floatX))
    val_set_y = shared_common(np.asarray(dev_set[:, -1], dtype="int32"))

    dic = {}
    for i in range(max_turn):
        dic[lx[i]] = train_set_lx[i][index * batch_size:(index + 1) *
                                     batch_size]
        dic[lxmask[i]] = train_set_lx_mask[i][index * batch_size:(index + 1) *
                                              batch_size]
    dic[rx] = train_set_rx[index * batch_size:(index + 1) * batch_size]
    dic[sessionmask] = train_set_session_mask[index * batch_size:(index + 1) *
                                              batch_size]
    dic[rxmask] = train_set_rx_mask[index * batch_size:(index + 1) *
                                    batch_size]
    dic[y] = train_set_y[index * batch_size:(index + 1) * batch_size]

    val_dic = {}
    for i in range(max_turn):
        val_dic[lx[i]] = val_set_lx[i][index * batch_size:(index + 1) *
                                       batch_size]
        val_dic[lxmask[i]] = val_set_lx_mask[i][index *
                                                batch_size:(index + 1) *
                                                batch_size]
    val_dic[rx] = val_set_rx[index * batch_size:(index + 1) * batch_size]
    val_dic[sessionmask] = val_set_session_mask[index *
                                                batch_size:(index + 1) *
                                                batch_size]
    val_dic[rxmask] = val_set_rx_mask[index * batch_size:(index + 1) *
                                      batch_size]
    val_dic[y] = val_set_y[index * batch_size:(index + 1) * batch_size]

    sentence2vec = GRU(n_in=word_embedding_size,
                       n_hidden=hiddensize,
                       n_out=hiddensize)

    for i in range(max_turn):
        q_embedding.append(sentence2vec(llayer0_input[i], lxmask[i], True))
    r_embedding = sentence2vec(rlayer0_input, rxmask, True)

    pooling_layer = ConvSim(rng,
                            max_l,
                            session_input_size,
                            hidden_size=hiddensize)

    poolingoutput = []

    for i in range(max_turn):
        poolingoutput.append(
            pooling_layer(llayer0_input[i], rlayer0_input, q_embedding[i],
                          r_embedding))

    session2vec = GRU(n_in=session_input_size,
                      n_hidden=session_hidden_size,
                      n_out=session_hidden_size)
    res = session2vec(T.stack(poolingoutput, 1), sessionmask)
    classifier = LogisticRegression(res, session_hidden_size, 2, rng)

    cost = classifier.negative_log_likelihood(y)
    error = classifier.errors(y)
    opt = Adam()
    params = classifier.params
    params += sentence2vec.params
    params += session2vec.params
    params += pooling_layer.params
    params += [Words]

    if existing_model is not None:
        load_params(params, existing_model)

    grad_updates = opt.Adam(
        cost=cost, params=params, lr=0.001
    )  # opt.sgd_updates_adadelta(params, cost, lr_decay, 1e-8, sqr_norm_lim)
    train_model = theano.function([index],
                                  cost,
                                  updates=grad_updates,
                                  givens=dic,
                                  on_unused_input='ignore')
    val_model = theano.function([index], [cost, error],
                                givens=val_dic,
                                on_unused_input='ignore')
    best_dev = 1.
    n_train_batches = int(datasets[0].shape[0] / batch_size)
    for i in range(n_epochs):
        cost = 0
        total = 0.
        for minibatch_index in np.random.permutation(range(n_train_batches)):
            batch_cost = train_model(minibatch_index)
            total += 1
            cost += batch_cost
            if not total % 50:
                print(total, cost / total)
        cost = cost / n_train_batches
        print("echo %d loss %f" % (i, cost))

        cost = 0
        errors = 0
        j = 0
        for minibatch_index in range(int(datasets[1].shape[0] / batch_size)):
            tcost, terr = val_model(minibatch_index)
            cost += tcost
            errors += terr
            j += 1
        if not j:
            j = 1
        cost /= j
        errors /= j
        if cost < best_dev:
            best_dev = cost
        temp_model_name = model_name + str(i) + '.pkl'
        save_params(params, temp_model_name)
        correct = test_model(model_name=temp_model_name)  # test_model is defined elsewhere in the repo
        print("epoch %d dev_correct %f" % (i, float(correct)))
        print("epoch %d dev_loss %f" % (i, cost))
        print("epoch %d dev_accuracy %f" % (i, 1 - errors))