Example #1
    def new_episode(self, mem, all_h=False):
        '''
        Create a new episode:
        compute the attention gates g with the attention mechanism,
        compute the new states h_t^i of the episode update mechanism,
        and use them to form e^i = h_{T_C}^i (see the Ask Me Anything: DMN for QA paper)
        :param mem: current memory
        :param all_h: not used in this snippet
        :return e[-1]: latest episode
        '''
        #g_updates (the theano.scan update dictionary) is not used here.
        g, g_updates = theano.scan(fn=self.new_attention_step,
                                   sequences=self.inp_c,
                                   non_sequences=[mem, self.q_q],
                                   outputs_info=T.zeros_like(self.inp_c[0][0]))

        #Optionally normalize the attention gates with a softmax.
        if (self.normalize_attention):
            g = nn_utils.softmax(g)

        #e_updates is likewise unused.
        e, e_updates = theano.scan(fn=self.new_episode_step,
                                   sequences=[self.inp_c, g],
                                   outputs_info=T.zeros_like(self.inp_c[0]))

        return e[-1]
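Every snippet in this listing goes through a project-local nn_utils.softmax helper rather than calling T.nnet.softmax directly. As a rough orientation, here is a minimal numerically stable sketch of such a helper for the Theano-based examples, assuming the distributions live along axis 0 (as the attention gates and the (vocab_size,) / (vocab_size, batch) prediction vectors in the examples below suggest); the repository's actual helper may differ.

import theano.tensor as T

def softmax(x):
    #Subtract the per-column max for numerical stability, then normalize along axis 0.
    e_x = T.exp(x - x.max(axis=0, keepdims=True))
    return e_x / e_x.sum(axis=0, keepdims=True)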
Example #2
            def answer_step(prev_a, prev_y):
                #One step of the recurrent answer module: update the answer GRU
                #state from the previous output, the question encoding and the
                #last memory, then emit a softmax distribution over the vocabulary.
                a = nn_utils.GRU_update(
                    prev_a, T.concatenate([prev_y, self.q_q, self.last_mem]),
                    self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                    self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                    self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]
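The GRU_update helper used above (and throughout the DMN examples) is the standard GRU cell written out with explicit weight matrices. Below is a hedged sketch of the update it presumably implements for a single example, with parameter names taken from the call sites; the real helper may broadcast the biases differently in the batch variants and may use the opposite interpolation convention.

import theano.tensor as T

def GRU_update(h, x, W_res_in, W_res_hid, b_res,
               W_upd_in, W_upd_hid, b_upd,
               W_hid_in, W_hid_hid, b_hid):
    #Reset gate, update gate, candidate state, then interpolation with the previous state.
    r = T.nnet.sigmoid(T.dot(W_res_in, x) + T.dot(W_res_hid, h) + b_res)
    z = T.nnet.sigmoid(T.dot(W_upd_in, x) + T.dot(W_upd_hid, h) + b_upd)
    h_tilde = T.tanh(T.dot(W_hid_in, x) + r * T.dot(W_hid_hid, h) + b_hid)
    return z * h_tilde + (1 - z) * h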
Example #3
def transformer(src, target, n):
    #Run the source through n encoder blocks.
    z = src
    for i in range(n):
        z = encoder(z)
    z_enc = z  #encoder output (unused here; a full transformer feeds it to the decoder's cross-attention)
    #Run the target through n decoder blocks.
    z = target
    for i in range(n):
        z = decoder(z)
    #Project to logits and normalize into probabilities.
    z = linear(z)
    z = nn_utils.softmax(z)
    return z
Example #4
    def new_episode(self, mem):
        g, g_updates = theano.scan(fn=self.new_attention_step,
                                   sequences=self.inp_c,
                                   non_sequences=[mem, self.q_q, self.c_vecs],
                                   outputs_info=T.zeros_like(self.inp_c[0][0]))

        if (self.normalize_attention):
            g = nn_utils.softmax(g)

        e, e_updates = theano.scan(fn=self.new_episode_step,
                                   sequences=[self.inp_c, g],
                                   outputs_info=T.zeros_like(self.inp_c[0]))

        return e[-1]
Example #5
    def new_episode(self, mem):
        g, g_updates = theano.scan(fn=self.new_attention_step,
                                   sequences=self.inp_c,
                                   non_sequences=[mem, self.q_q],
                                   outputs_info=T.zeros_like(self.inp_c[0][0]))

        if (self.normalize_attention):
            g = nn_utils.softmax(g)

        e, e_updates = theano.scan(fn=self.new_episode_step,
                                   sequences=[self.inp_c, g],
                                   outputs_info=T.zeros_like(self.inp_c[0]))

        e_list = []
        for index in range(self.batch_size):
            e_list.append(e[self.fact_count_var[index] - 1, :, index])
        return T.stack(e_list).dimshuffle((1, 0))
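The new_attention_step and new_episode_step callbacks scanned over in Examples #1, #4 and #5 are not shown in this listing. Below is a hedged sketch of what they typically compute in a DMN, following the gating formulation of the Ask Me Anything paper: the 2-layer feed-forward scorer matches the W_1/W_2/b_1/b_2 parameters in the larger examples, while the exact feature vector z varies between implementations (the differing W_1 shapes, 7*dim versus 7*dim + 2, hint at variants with or without the bilinear W_b terms). Written as methods of the same class in single-example form; the batch variants broadcast over a trailing batch axis.

import theano.tensor as T

def new_attention_step(self, ct, prev_g, mem, q_q):
    #Similarity features between the fact c_t, the question q and the current memory m.
    z = T.concatenate([ct, mem, q_q, ct * q_q, ct * mem,
                       T.abs_(ct - q_q), T.abs_(ct - mem)])  #~7*dim features
    #Two-layer feed-forward scorer producing a scalar gate g_t.
    l_1 = T.tanh(T.dot(self.W_1, z) + self.b_1)
    l_2 = T.nnet.sigmoid(T.dot(self.W_2, l_1) + self.b_2)
    return l_2[0]

def new_episode_step(self, ct, g, prev_h):
    #Gated GRU step: move toward fact c_t only in proportion to its gate g.
    gru = self.GRU_update(prev_h, ct,
                          self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
                          self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                          self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)
    return g * gru + (1 - g) * prev_h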
Example #6
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, input_mask_mode,
                 memory_hops, batch_size, l2, normalize_attention, batch_norm,
                 dropout, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()

        self.vocab = {}
        self.ivocab = {}

        self.type = "batch"

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.batch_size = batch_size
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.tensor3(
            'input_var')  # (batch_size, seq_len, glove_dim)
        self.q_var = T.tensor3('question_var')  # as self.input_var
        self.answer_var = T.ivector(
            'answer_var')  # answer of example in minibatch
        self.fact_count_var = T.ivector(
            'fact_count_var')  # number of facts in the example of minibatch
        self.input_mask_var = T.imatrix(
            'input_mask_var')  # (batch_size, indices)

        print "==> building input module"
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        input_var_shuffled = self.input_var.dimshuffle(1, 2, 0)
        inp_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))
        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=input_var_shuffled,
                                       outputs_info=T.zeros_like(inp_dummy))

        inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1)

        inp_c_list = []
        inp_c_mask_list = []
        for batch_index in range(self.batch_size):
            taken = inp_c_history_shuffled[batch_index].take(
                self.input_mask_var[
                    batch_index, :self.fact_count_var[batch_index]],
                axis=0)
            inp_c_list.append(
                T.concatenate([
                    taken,
                    T.zeros((self.input_mask_var.shape[1] - taken.shape[0],
                             self.dim), floatX)
                ]))
            inp_c_mask_list.append(
                T.concatenate([
                    T.ones((taken.shape[0], ), np.int32),
                    T.zeros((self.input_mask_var.shape[1] - taken.shape[0], ),
                            np.int32)
                ]))

        self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0)
        inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0)

        q_var_shuffled = self.q_var.dimshuffle(1, 2, 0)
        q_dummy = theano.shared(
            np.zeros((self.dim, self.batch_size), dtype=floatX))
        q_q_history, _ = theano.scan(fn=self.input_gru_step,
                                     sequences=q_var_shuffled,
                                     outputs_info=T.zeros_like(q_dummy))
        self.q_q = q_q_history[-1]

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle((1, 0))

        net = layers.InputLayer(shape=(self.batch_size, self.dim),
                                input_var=last_mem_raw)
        if self.batch_norm:
            net = layers.BatchNormLayer(incoming=net)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net).dimshuffle((1, 0))

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # TODO: add conditional ending
            dummy = theano.shared(
                np.zeros((self.vocab_size, self.batch_size), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,  #the locally defined answer_step above
                outputs_info=[last_mem, T.zeros_like(dummy)],  #(last_mem, y)
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        self.prediction = self.prediction.dimshuffle(1, 0)

        self.params = [
            self.W_inp_res_in,
            self.W_inp_res_hid,
            self.b_inp_res,
            self.W_inp_upd_in,
            self.W_inp_upd_hid,
            self.b_inp_upd,
            self.W_inp_hid_in,
            self.W_inp_hid_hid,
            self.b_inp_hid,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(self.prediction,
                                                       self.answer_var).mean()

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.fact_count_var, self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.fact_count_var,
            self.input_mask_var
        ],
                                       outputs=[self.prediction, self.loss])
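For orientation, here is a sketch of how one minibatch would be fed to the batch variant's compiled functions above, with invented toy shapes; the real preprocessing lives in _process_input, and the dmn instance name is only illustrative.

import numpy as np

batch_size, seq_len, q_len, glove_dim, max_facts = 2, 12, 3, 50, 4

inp = np.zeros((batch_size, seq_len, glove_dim), dtype='float32')   #input word vectors
q = np.zeros((batch_size, q_len, glove_dim), dtype='float32')       #question word vectors
ans = np.array([7, 3], dtype='int32')                               #answer word index per example
fact_count = np.array([2, 4], dtype='int32')                        #number of facts per example
input_mask = np.zeros((batch_size, max_facts), dtype='int32')       #<eos> positions per example

#prediction, loss = dmn.train_fn(inp, q, ans, fact_count, input_mask)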
Example #7
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, input_mask_mode,
                 memory_hops, l2, normalize_attention, batch_norm, dropout,
                 **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {}
        self.ivocab = {}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout

        self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        self.attentions = []

        print "==> building input module"
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.input_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle(('x', 0))

        net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net)[0]

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # TODO: add conditional ending
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in,
            self.W_inp_res_hid,
            self.b_inp_res,
            self.W_inp_upd_in,
            self.W_inp_upd_hid,
            self.b_inp_upd,
            self.W_inp_hid_in,
            self.W_inp_hid_hid,
            self.b_inp_hid,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0]

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        self.attentions = T.stack(self.attentions)
        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var
            ],
            outputs=[self.prediction, self.loss, self.attentions])
Example #8
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, answer_step_nbr,
                 input_mask_mode, memory_hops, l2, normalize_attention,
                 max_input_size, **kwargs):
        '''
        Build the DMN
        :param babi_train_raw: train dataset
        :param babi_test_raw: test dataset
        :param word2vec: a dictionary containing the word embeddings (TODO: check if right)
        :param word_vector_size: dimension of the word embeddings (50, 100, 200 or 300)
        :param dim: number of hidden units in the input module GRU
        :param mode: train or test mode
        :param answer_module: answer module type: feedforward or recurrent
        :param answer_step_nbr: number of steps of the recurrent answer module
        :param input_mask_mode: input mask mode: word or sentence
        :param memory_hops: number of memory GRU steps
        :param l2: L2 regularization weight
        :param normalize_attention: enable softmax on the attention vector
        :param max_input_size: maximal input size, used as the pointer dimension
        :param **kwargs: unused parameters
        '''

        print("==> not used params in DMN class:", kwargs.keys())
        self.type = "pointer"
        self.vocab = {}
        self.ivocab = {}

        print(max_input_size)

        #save params
        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim  #number of hidden units in input layer GRU
        self.pointer_dim = max_input_size  #maximal size for the input, used as hyperparameter
        self.mode = mode
        self.answer_module = answer_module
        self.answer_step_nbr = answer_step_nbr
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention

        self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_pointers_s, self.train_pointers_e = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_pointers_s, self.test_pointers_e = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.ivector('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')
        self.pointers_s_var = T.ivector('pointers_s_var')
        self.pointers_e_var = T.ivector('pointer_e_var')

        print("==> building input module")
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        #The three sets of weights & biases are the GRU reset gate, update gate and candidate hidden state.

        #Run the input GRU over the input word vectors.
        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.input_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        #In case of multiple sentences, keep only the hidden states whose index matches an <eos> token.
        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        #Encode the question with the same GRU; its final state later initializes the memory.
        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]  #keep only the last hidden state

        print("==> creating parameters for memory module")
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        #Attention mechanism's 2-layer FFNN weights & biases
        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 2))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print(
            "==> building episodic memory module (fixed number of steps: %d)" %
            self.memory_hops)
        memory = [self.q_q.copy()]  #the question encoding q_q initializes the memory
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                nn_utils.GRU_update(memory[iter - 1], current_episode,
                                    self.W_mem_res_in, self.W_mem_res_hid,
                                    self.b_mem_res, self.W_mem_upd_in,
                                    self.W_mem_upd_hid, self.b_mem_upd,
                                    self.W_mem_hid_in, self.W_mem_hid_hid,
                                    self.b_mem_hid))

        self.last_mem = memory[-1]

        print("==> building answer module")
        self.Ws_p = nn_utils.normal_param(
            std=0.1,
            shape=(self.pointer_dim,
                   self.dim))  #shape must be size_input * mem_size = self.dim
        self.We_p = nn_utils.normal_param(std=0.1,
                                          shape=(self.pointer_dim, self.dim))
        self.Wh_p = nn_utils.normal_param(std=0.1,
                                          shape=(self.pointer_dim, self.dim))
        self.Ws_pr = nn_utils.normal_param(
            std=0.1,
            shape=(self.pointer_dim,
                   self.dim))  #shape must be size_input * mem_size = self.dim
        self.We_pr = nn_utils.normal_param(std=0.1,
                                           shape=(self.pointer_dim, self.dim))
        self.Wh_pr = nn_utils.normal_param(std=0.1,
                                           shape=(self.pointer_dim, self.dim))

        self.Psp = nn_utils.softmax(T.dot(
            self.Ws_p, self.last_mem))  #size must be == size_input
        self.Pepr = nn_utils.softmax(T.dot(self.We_pr, self.last_mem))

        #TODO:
        self.start_idx = T.argmax(self.Psp)
        self.end_idxr = T.argmax(self.Pepr)

        self.start_idx_state = inp_c_history[
            self.start_idx]  #hidden state at the predicted start index
        self.end_idx_state = inp_c_history[self.end_idxr]  #hidden state at the predicted end index
        #temp1 = T.dot(self.We_p, self.last_mem)
        #temp2 = T.dot(self.Wh_p, self.start_idx_state)
        #temp3 = temp1 + temp2
        self.Pep = nn_utils.softmax(
            T.dot(self.We_p, self.last_mem) + T.dot(
                self.Wh_p, self.start_idx_state))  #size must be == size_input
        self.Pspr = nn_utils.softmax(
            T.dot(self.Ws_pr, self.last_mem) +
            T.dot(self.Wh_pr, self.end_idx_state))

        Ps = (self.Psp + self.Pspr) / 2
        Pe = (self.Pep + self.Pepr) / 2
        self.start_idxr = T.argmax(self.Pspr)
        self.end_idx = T.argmax(self.Pep)

        self.start_idx_f = T.argmax(Ps)  #(self.start_idx + self.start_idxr)/2
        self.end_idx_f = T.argmax(Pe)  #(self.end_idx + self.end_idxr)/2

        #multiple_answers = []
        #bboole = T.lt(self.start_idx_f, self.end_idx_f)

        #trange = ifelse(bboole, T.arange(self.start_idx_f, self.end_idx_f), T.arange(self.start_idx_f - 1, self.start_idx_f))

        #        self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim))
        #
        #        if self.answer_module == 'recurrent':
        #            self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.dim + self.vocab_size))
        #            self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #            self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        #
        #            self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.dim + self.vocab_size))
        #            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #            self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        #
        #            self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.dim + self.vocab_size))
        #            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #            self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,))
        #
        #            def answer_step(prev_a, prev_y):
        #                a = nn_utils.GRU_update(prev_a, T.concatenate([prev_y, self.q_q, self.last_mem]),
        #                                  self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        #                                  self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        #                                  self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)
        #
        #                y = nn_utils.softmax(T.dot(self.W_a, a))
        #                return [a, y]
        #
        #            # TODO: add conditional ending
        #            dummy_ = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
        #            results, updates = theano.scan(fn=answer_step,
        #                outputs_info=[self.last_mem, T.zeros_like(dummy_)],
        #                n_steps=self.answer_step_nbr)
        #
        #            self.multiple_predictions = results[1] #don't get the memory (i.e. a)
        #
        #
        #        else:
        #            raise Exception("invalid answer_module")

        print("==> collecting all parameters")
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.Ws_p, self.We_p,
            self.Wh_p, self.Ws_pr, self.We_pr, self.Wh_pr
        ]

        #        if self.answer_module == 'recurrent':
        #            self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        #                              self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        #                              self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid]
        #
        #
        print("==> building loss layer and computing updates")
        #        def temp_loss(curr_pred, curr_ans, loss):
        #            temp = T.nnet.categorical_crossentropy(curr_pred.dimshuffle("x",0),T.stack([curr_ans]))[0]
        #            return loss + temp
        #
        #        outputs, updates = theano.scan(fn=temp_loss,
        #                                            sequences=[self.multiple_predictions, self.answer_var],
        #                                            outputs_info = [np.float64(0.0)],
        #                                            n_steps=self.answer_step_nbr)

        loss_start = T.nnet.categorical_crossentropy(
            Ps.dimshuffle("x", 0), T.stack([self.pointers_s_var[0]]))[0]
        loss_end = T.nnet.categorical_crossentropy(
            Pe.dimshuffle("x", 0), T.stack([self.pointers_e_var[0]]))[0]
        #loss_1 = Ps
        #        def temp_loss(curr_idx, curr_ans, loss):
        #            curr_pred = self.input_var[curr_idx]
        #            temp = T.nnet.catergorical_crossentropy(curr_pred, curr_ans)[0]
        #            return loss + temp
        #
        #        outputs, udpates = theano.scan(fn=temp_loss,
        #                                       sequences = [answers_range, self.answer_var],
        #                                        outputs_info = [np.float64(0.0)],
        #                                        n_steps = ???)

        #        self.loss_ce = outputs[-1]
        #temp1 = (self.end_idx_f - self.pointers_e_var)
        #temp2 = T.abs_(temp1) #* temp1
        #temp3 = (self.start_idx_f)# - self.pointers_s_var)
        #temp4 = T.abs_(temp3) #* temp3
        self.loss_ce = loss_start + loss_end  #(temp2 + temp4)
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'train':
            print("==> compiling train_fn")
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.input_mask_var,
                    self.pointers_s_var, self.pointers_e_var
                ],
                outputs=[self.start_idx_f, self.end_idx_f, self.loss],
                updates=updates,
                allow_input_downcast=True)
        if self.mode != 'minitest':
            print("==> compiling test_fn")
            self.test_fn = theano.function(inputs=[
                self.input_var, self.q_var, self.input_mask_var,
                self.pointers_s_var, self.pointers_e_var
            ],
                                           outputs=[
                                               self.start_idx_f,
                                               self.end_idx_f, self.loss,
                                               self.inp_c, self.q_q
                                           ],
                                           allow_input_downcast=True)

        if self.mode == 'minitest':
            print("==> compiling minitest_fn")
            self.minitest_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.input_mask_var,
                    self.pointers_s_var, self.pointers_e_var
                ],
                outputs=[self.start_idx_f, self.end_idx_f])
Example #9
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, answer_module, input_mask_mode,
                 memory_hops, l2, normalize_attention, **kwargs):
        '''
        Build the DMN
        :param babi_train_raw: train dataset
        :param babi_test_raw: test dataset
        :param word2vec: a dictionary containing the word embeddings (TODO: check if right)
        :param word_vector_size: dimension of the word embeddings (50, 100, 200 or 300)
        :param dim: number of hidden units in the input module GRU
        :param mode: train or test mode
        :param answer_module: answer module type: feedforward or recurrent
        :param input_mask_mode: input mask mode: word or sentence
        :param memory_hops: number of memory GRU steps
        :param l2: L2 regularization weight
        :param normalize_attention: enable softmax on the attention vector
        :param **kwargs: unused parameters
        '''

        print("==> not used params in DMN class:", kwargs.keys())
        self.vocab = {}
        self.ivocab = {}

        self.type = "basic"

        #save params
        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention

        self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        print("==> building input module")
        #Input-to-hidden weights of the input GRU (reset gate)
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        #Hidden-to-hidden weights of the input GRU (reset gate)
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        #Bias, initialized to a constant (zero)
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        #The three sets of weights & biases are the GRU reset gate, update gate and candidate hidden state.

        #Run the input GRU over the input word vectors.
        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.input_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        #Keep only the hidden states whose index matches an <eos> token.
        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        #Encode the question with the same GRU; its final state later initializes the memory.
        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]  #keep only the last hidden state

        print("==> creating parameters for memory module")
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        #Attention mechanism's 2-layer FFNN weights & biases
        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 2))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print(
            "==> building episodic memory module (fixed number of steps: %d)" %
            self.memory_hops)
        memory = [self.q_q.copy()]  #the question encoding q_q initializes the memory
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                nn_utils.GRU_update(memory[iter - 1], current_episode,
                                    self.W_mem_res_in, self.W_mem_res_hid,
                                    self.b_mem_res, self.W_mem_upd_in,
                                    self.W_mem_upd_hid, self.b_mem_upd,
                                    self.W_mem_hid_in, self.W_mem_hid_hid,
                                    self.b_mem_hid))

        last_mem = memory[-1]

        print("==> building answer module")
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = nn_utils.GRU_update(prev_a,
                                        T.concatenate([prev_y, self.q_q
                                                       ]), self.W_ans_res_in,
                                        self.W_ans_res_hid, self.b_ans_res,
                                        self.W_ans_upd_in, self.W_ans_upd_hid,
                                        self.b_ans_upd, self.W_ans_hid_in,
                                        self.W_ans_hid_hid, self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # TODO: add conditional ending
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        print("==> collecting all parameters")
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print("==> building loss layer and computing updates")
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0]
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'train':
            print("==> compiling train_fn")
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print("==> compiling test_fn")
        self.test_fn = theano.function(inputs=[
            self.input_var, self.q_var, self.answer_var, self.input_mask_var
        ],
                                       outputs=[
                                           self.prediction, self.loss,
                                           self.inp_c, self.q_q, last_mem
                                       ])

        if self.mode == 'train':
            print("==> computing gradients (for debugging)")
            gradient = T.grad(self.loss, self.params)
            self.get_gradient_fn = theano.function(inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var
            ],
                                                   outputs=gradient)
Example #10
def feed_forward(z, W1, W2):
    #Two linear layers followed by a softmax over the final projection
    #(no intermediate nonlinearity is applied in this sketch).
    h = z.dot(W1)
    u = h.dot(W2)
    z = nn_utils.softmax(u)
    return z
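A toy NumPy driver for the feed_forward sketch above; the nn_utils stub is only a stand-in for whatever softmax helper these NumPy-flavoured examples (#3, #10, #11) actually import, and all shapes are made up for illustration.

import numpy as np

class nn_utils:
    @staticmethod
    def softmax(x):
        #Numerically stable softmax over the last axis.
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

z = np.random.randn(5, 16)     #batch of 5 inputs with 16 features
W1 = np.random.randn(16, 32)
W2 = np.random.randn(32, 10)   #10 output classes
probs = feed_forward(z, W1, W2)
print(probs.shape)             #(5, 10); each row sums to 1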
Example #11
def attention(Q, K, V, f, d_k=1):
    #Scaled dot-product style attention: softmax(f(Q, K^T) / d_k) . V,
    #where f is the scoring function and d_k the scaling factor
    #(commonly the square root of the key dimension).
    col = [i for i in range(len(K.shape))]
    col[-2], col[-1] = col[-1], col[-2]  #swap the last two axes to transpose K
    return nn_utils.softmax(f(Q, K.transpose(tuple(col))) / d_k).dot(V)
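A usage sketch for the attention helper above, reusing the NumPy softmax stub from the previous snippet; np.matmul plays the role of the scoring function f, and the key dimension supplies the usual 1/sqrt(d_k) scaling.

import numpy as np

Q = np.random.randn(5, 8)    #5 queries of dimension 8
K = np.random.randn(7, 8)    #7 keys of dimension 8
V = np.random.randn(7, 16)   #7 values of dimension 16

out = attention(Q, K, V, np.matmul, d_k=np.sqrt(K.shape[-1]))
print(out.shape)             #(5, 16): one weighted mixture of the values per query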
Example #12
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, dim, mode, input_mask_mode, memory_hops, l2,
                 normalize_attention, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {}
        self.ivocab = {}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.dim = dim
        self.mode = mode
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        #self.batch_size = 1
        self.l2 = l2
        self.normalize_attention = normalize_attention

        self.train_input, self.train_q, self.train_answer, self.train_choices, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_choices, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = 4  # number of answer choices

        self.inp_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.ca_var = T.matrix('ca_var')
        self.cb_var = T.matrix('cb_var')
        self.cc_var = T.matrix('cc_var')
        self.cd_var = T.matrix('cd_var')
        self.ans_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        print "==> building input module"
        self.W_inp_res_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_res_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_res = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_inp_upd_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_upd = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_inp_hid_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.word_vector_size)),
                                          borrow=True)
        self.W_inp_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_inp_hid = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.inp_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]

        self.c_vecs = []
        for choice in [self.ca_var, self.cb_var, self.cc_var, self.cd_var]:
            history, _ = theano.scan(fn=self.input_gru_step,
                                     sequences=choice,
                                     outputs_info=T.zeros_like(self.b_inp_hid))
            self.c_vecs.append(history[-1])

        self.c_vecs = T.stack(self.c_vecs).transpose((1, 0))  # (dim, 4)
        self.inp_c = T.stack([self.inp_c] * 4).transpose(
            (1, 2, 0))  # (fact_cnt, dim, 4)
        self.q_q = T.stack([self.q_q] * 4).transpose((1, 0))  # (dim, 4)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_res_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_res = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_mem_upd_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_upd_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_upd = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_mem_hid_in = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                          borrow=True)
        self.W_mem_hid_hid = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                           borrow=True)
        self.b_mem_hid = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                       borrow=True)

        self.W_b = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, self.dim)),
                                 borrow=True)
        self.W_1 = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.dim, 10 * self.dim + 3)),
                                 borrow=True)
        self.W_2 = theano.shared(lasagne.init.Normal(0.1).sample(
            (1, self.dim)),
                                 borrow=True)
        self.b_1 = theano.shared(lasagne.init.Constant(0.0).sample(
            (self.dim, )),
                                 borrow=True)
        self.b_2 = theano.shared(lasagne.init.Constant(0.0).sample((1, )),
                                 borrow=True)

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]  # (dim, 4)
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update_batch(memory[iter - 1], current_episode,
                                      self.W_mem_res_in, self.W_mem_res_hid,
                                      self.b_mem_res, self.W_mem_upd_in,
                                      self.W_mem_upd_hid, self.b_mem_upd,
                                      self.W_mem_hid_in, self.W_mem_hid_hid,
                                      self.b_mem_hid))

        last_mem = memory[-1].flatten()

        print "==> building answer module"
        self.W_a = theano.shared(lasagne.init.Normal(0.1).sample(
            (self.vocab_size, 4 * self.dim)),
                                 borrow=True)
        self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.ans_var]))[0]
        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adadelta(self.loss, self.params)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.inp_var, self.q_var, self.ans_var, self.ca_var,
                    self.cb_var, self.cc_var, self.cd_var, self.input_mask_var
                ],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(inputs=[
            self.inp_var, self.q_var, self.ans_var, self.ca_var, self.cb_var,
            self.cc_var, self.cd_var, self.input_mask_var
        ],
                                       outputs=[
                                           self.prediction, self.loss,
                                           self.inp_c, self.q_q, last_mem
                                       ])

        if self.mode == 'train':
            print "==> computing gradients (for debugging)"
            gradient = T.grad(self.loss, self.params)
            self.get_gradient_fn = theano.function(inputs=[
                self.inp_var, self.q_var, self.ans_var, self.ca_var,
                self.cb_var, self.cc_var, self.cd_var, self.input_mask_var
            ],
                                                   outputs=gradient)