def __init__(self, data_dir, word2vec, word_vector_size, truncate_gradient, learning_rate, dim, cnn_dim, cnn_dim_fc, story_len, patches, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.learning_rate = learning_rate self.truncate_gradient = truncate_gradient self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.cnn_dim_fc = cnn_dim_fc self.story_len = story_len self.mode = mode self.patches = patches self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout #self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.vocab, self.ivocab = self._ext_vocab_from_word2vec() self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind( self.data_dir, 'train') self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind( self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) # Since this is pretty expensive, we will pass a story each time. # We assume that the input has been processed such that the sequences of patches # are snake like path. self.input_var = T.tensor4( 'input_var') # (batch_size, seq_len, patches, cnn_dim) self.q_var = T.matrix('q_var') # Now, it's a batch * image_sieze. self.answer_var = T.imatrix( 'answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_inp_var = T.tensor3( 'answer_inp_var') # answer of example in minibatch print "==> building input module" self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) #self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) # First, we embed the visual features before sending it to the bi-GRUs. inp_rhp = T.reshape( self.input_var, (self.batch_size * self.story_len * self.patches, self.cnn_dim)) inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0) inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled) inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0) inp_emb_raw = T.reshape( inp_rhp_emb_dimshuffled, (self.batch_size, self.story_len, self.patches, self.cnn_dim)) inp_emb = T.tanh( inp_emb_raw ) # Just follow the paper DMN for visual and textual QA. # Now, we use a bi-directional GRU to produce the input. # Forward GRU. self.inp_dim = self.dim / 2 # since we have forward and backward self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) # Backward GRU. 
self.W_inpb_res_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_res_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_res = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpb_upd_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_upd = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) self.W_inpb_hid_in = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.cnn_dim)) self.W_inpb_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.inp_dim, self.inp_dim)) self.b_inpb_hid = nn_utils.constant_param(value=0.0, shape=(self.inp_dim, )) # Now, we use the GRU to build the inputs. # Two-level of nested scan is unnecessary. It will become too complicated. Just use this one. inp_dummy = theano.shared( np.zeros((self.inp_dim, self.story_len), dtype=floatX)) for i in range(self.batch_size): if i == 0: inp_1st_f, _ = theano.scan( fn=self.input_gru_step_forward, sequences=inp_emb[i, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) inp_1st_b, _ = theano.scan( fn=self.input_gru_step_backward, sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) # Now, combine them. inp_1st = T.concatenate([ inp_1st_f.dimshuffle(2, 0, 1), inp_1st_b.dimshuffle(2, 0, 1) ], axis=-1) self.inp_c = inp_1st.dimshuffle('x', 0, 1, 2) else: inp_f, _ = theano.scan( fn=self.input_gru_step_forward, sequences=inp_emb[i, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) inp_b, _ = theano.scan( fn=self.input_gru_step_backward, sequences=inp_emb[i, :, ::-1, :].dimshuffle(1, 2, 0), outputs_info=T.zeros_like(inp_dummy), truncate_gradient=self.truncate_gradient) # Now, combine them. inp_fb = T.concatenate( [inp_f.dimshuffle(2, 0, 1), inp_b.dimshuffle(2, 0, 1)], axis=-1) self.inp_c = T.concatenate( [self.inp_c, inp_fb.dimshuffle('x', 0, 1, 2)], axis=0) # Done, now self.inp_c should be batch_size x story_len x patches x cnn_dim # Eventually, we can flattern them. # Now, the input dimension is 1024 because we have forward and backward. inp_c_t = T.reshape( self.inp_c, (self.batch_size, self.story_len * self.patches, self.dim)) inp_c_t_dimshuffled = inp_c_t.dimshuffle(0, 'x', 1, 2) inp_batch = T.repeat(inp_c_t_dimshuffled, self.story_len, axis=1) # Now, its ready for all the 5 images in the same story. # 50 * 980 * 512 self.inp_batch = T.reshape(inp_batch, (inp_batch.shape[0] * inp_batch.shape[1], inp_batch.shape[2], inp_batch.shape[3])) self.inp_batch_dimshuffled = self.inp_batch.dimshuffle( 1, 2, 0) # 980 x 512 x 50 # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) print "==> building question module" # Now, share the parameter with the input module. self.W_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.b_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, )) q_var_shuffled = self.q_var.dimshuffle(1, 0) inp_q = T.dot( self.W_inp_emb_q, q_var_shuffled) + self.b_inp_emb_q.dimshuffle( 0, 'x') # 512 x 50 self.q_q = T.tanh( inp_q ) # Since this is used to initialize the memory, we need to make it tanh. 
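# NOTE: input_gru_step_forward / input_gru_step_backward are defined elsewhere in this
# class and are not shown in this snippet. Assuming they follow the same GRU cell as
# GRU_update in this repo, one step over a patch embedding x_t with previous hidden
# state h would be roughly (illustrative sketch, not the exact implementation):
#
#   r      = sigmoid(W_res_in . x_t + W_res_hid . h + b_res)        # reset gate
#   z      = sigmoid(W_upd_in . x_t + W_upd_hid . h + b_upd)        # update gate
#   h_cand = tanh(W_hid_in . x_t + r * (W_hid_hid . h) + b_hid)     # candidate state
#   h_new  = z * h + (1 - z) * h_cand
#
# using the W_inpf_* weights on the patch sequence and the W_inpb_* weights on the
# reversed sequence; the two directions are concatenated, which is why self.inp_dim
# is set to self.dim / 2 above.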
print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) #current_episode = self.new_episode(m) #current_episode = printing.Print('current_episode')(current_episode) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) logging.info('last_mem size') print last_mem.shape.eval({ self.input_var: np.random.rand(10, 5, 196, 512).astype('float32'), self.q_var: np.random.rand(50, 4096).astype('float32') }) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0) # Sounds good. Now, we need to map last_mem to a new space. self.W_mem_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.W_inp_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) def _dot2(x, W): return T.dot(W, x) answer_inp_var_shuffled_emb, _ = theano.scan( fn=_dot2, sequences=answer_inp_var_shuffled, non_sequences=self.W_inp_emb) # seq x dim x batch # Now, we also need to embed the image and use it to do the memory. #q_q_shuffled = self.q_q.dimshuffle(1,0) # dim * batch. init_ans = T.concatenate([self.q_q, last_mem], axis=0) mem_ans = T.dot(self.W_mem_emb, init_ans) # dim x batchsize. mem_ans_dim = mem_ans.dimshuffle('x', 0, 1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis=0) # Now, we have both embedding. We can let them go to the rnn. # We also need to map the input layer as well. 
dummy = theano.shared( np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) logging.info('answer_inp size') #print answer_inp.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32')}) #last_mem = printing.Print('prob_sm')(last_mem) results, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp, outputs_info=[dummy]) # Assume there is a start token #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') results = results[ 1: -1, :, :] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. 
prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x), sequences=results, non_sequences=self.W_a) prob_shuffled = prob.dimshuffle(2, 0, 1) # b * len * vocab logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) self.prediction = prob_sm mask = T.reshape(self.answer_mask, (n, )) lbl = T.reshape(self.answer_var, (n, )) self.params = [ self.W_inp_emb_in, #self.b_inp_emb_in, self.W_inpf_res_in, self.W_inpf_res_hid, self.b_inpf_res, self.W_inpf_upd_in, self.W_inpf_upd_hid, self.b_inpf_upd, self.W_inpf_hid_in, self.W_inpf_hid_hid, self.b_inpf_hid, self.W_inpb_res_in, self.W_inpb_res_hid, self.b_inpb_res, self.W_inpb_upd_in, self.W_inpb_upd_hid, self.b_inpb_upd, self.W_inpb_hid_in, self.W_inpb_hid_hid, self.b_inpb_hid, self.W_inp_emb_q, self.b_inp_emb_q, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, self.W_mem_emb, self.W_inp_emb, self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, ] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.adam(self.loss, self.params, learning_rate = self.learning_rate) updates = lasagne.updates.rmsprop(self.loss, self.params, learning_rate=self.learning_rate) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss], updates=updates) #profile = True) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss])
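# NOTE: an illustrative call pattern for the compiled functions, using the debug shapes
# from the shape.eval() check above (batch_size=10, story_len=5, patches=196, cnn_dim=512,
# cnn_dim_fc=4096); the exact answer tensors come from _process_input_sind, not shown here:
#
#   pred, loss = model.train_fn(conv_feats,   # (10, 5, 196, 512) float32 patch features
#                               fc_feats,     # (50, 4096) float32, one row per image
#                               answers,      # int32 target token ids (batch-flattened)
#                               answer_mask,  # 0/1 float mask matching `answers`
#                               answer_inp)   # decoder inputs for teacher forcing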
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, batch_size, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.max_fact_count = 0 self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.tensor3( 'input_var') # (batch_size, seq_len, glove_dim) self.q_var = T.tensor3('question_var') # as self.input_var self.answer_var = T.ivector( 'answer_var') # answer of example in minibatch self.fact_count_var = T.ivector( 'fact_count_var') # number of facts in the example of minibatch self.input_mask_var = T.imatrix( 'input_mask_var') # (batch_size, indices) print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) input_var_shuffled = self.input_var.dimshuffle(1, 2, 0) inp_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=input_var_shuffled, outputs_info=T.zeros_like(inp_dummy)) inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1) inp_c_list = [] inp_c_mask_list = [] for batch_index in range(self.batch_size): taken = inp_c_history_shuffled[batch_index].take( self.input_mask_var[ batch_index, :self.fact_count_var[batch_index]], axis=0) inp_c_list.append( T.concatenate([ taken, T.zeros((self.max_fact_count - taken.shape[0], self.dim), floatX) ])) inp_c_mask_list.append( T.concatenate([ T.ones((taken.shape[0], ), np.int32), T.zeros((self.max_fact_count - taken.shape[0], ), np.int32) ])) self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0) inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0) q_var_shuffled = self.q_var.dimshuffle(1, 2, 0) q_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) q_q_history, _ = theano.scan(fn=self.input_gru_step, sequences=q_var_shuffled, outputs_info=T.zeros_like(q_dummy)) self.q_q = q_q_history[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) 
self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
memory = [self.q_q.copy()]
for iter in range(1, self.memory_hops + 1):
    current_episode = self.new_episode(memory[iter - 1])
    memory.append(
        self.GRU_update(memory[iter - 1], current_episode,
                        self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
                        self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
                        self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid))

last_mem = memory[-1]

print "==> building answer module"
self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim))

if self.answer_module == 'feedforward':
    self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

elif self.answer_module == 'recurrent':
    self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
    self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
    self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size))
    self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    def answer_step(prev_a, prev_y):
        a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                            self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                            self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                            self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid)
        y = nn_utils.softmax(T.dot(self.W_a, a))
        return [a, y]

    # TODO: add conditional ending
    dummy = theano.shared(np.zeros((self.vocab_size, self.batch_size), dtype=floatX))

    results, updates = theano.scan(fn=self.answer_step,
                                   outputs_info=[last_mem, T.zeros_like(dummy)],  # (last_mem, y)
                                   n_steps=1)
    self.prediction = results[1][-1]

else:
    raise Exception("invalid answer_module")

self.prediction = self.prediction.dimshuffle(1, 0)

self.params = [
    self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
    self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
    self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
    self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
    self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
    self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid,
    #self.W_b
    self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
]

if self.answer_module == 'recurrent':
    self.params = self.params + [
        self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
    ]

print "==> building loss layer and computing updates"
self.loss_ce = T.nnet.categorical_crossentropy(self.prediction,
                                               self.answer_var).mean()

if self.l2 > 0:
    self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
else:
    self.loss_l2 = 0

self.loss = self.loss_ce + self.loss_l2

updates = lasagne.updates.adadelta(self.loss, self.params)

if self.mode == 'train':
    print "==> compiling train_fn"
    self.train_fn = theano.function(
        inputs=[self.input_var, self.q_var, self.answer_var,
                self.fact_count_var, self.input_mask_var],
        outputs=[self.prediction, self.loss],
        updates=updates)

print "==> compiling test_fn"
self.test_fn = theano.function(
    inputs=[self.input_var, self.q_var, self.answer_var,
            self.fact_count_var, self.input_mask_var],
    outputs=[self.prediction, self.loss])
def __init__(self, data_dir, word2vec, truncate_gradient, learning_rate, dim,
             cnn_dim, story_len, mode, answer_module, batch_size, l2, dropout,
             **kwargs):
    print "==> not used params in DMN class:", kwargs.keys()
    self.data_dir = data_dir
    self.learning_rate = learning_rate
    self.word_vector_size = 300
    self.truncate_gradient = truncate_gradient
    self.word2vec = word2vec
    self.dim = dim
    self.cnn_dim = cnn_dim
    self.story_len = story_len
    self.mode = mode
    self.answer_module = answer_module
    self.batch_size = batch_size
    self.l2 = l2
    self.dropout = dropout

    self.vocab, self.ivocab = self._load_vocab(self.data_dir)

    self.train_story = None
    self.test_story = None
    self.train_dict_story, self.train_lmdb_env_fc = self._process_input_sind(
        self.data_dir, 'train')
    self.test_dict_story, self.test_lmdb_env_fc = self._process_input_sind(
        self.data_dir, 'val')

    self.train_story = self.train_dict_story.keys()
    self.test_story = self.test_dict_story.keys()
    self.vocab_size = len(self.vocab)

    self.q_var = T.tensor3('q_var')  # Now it's batch_size * story_len * image_size.
    self.answer_var = T.imatrix('answer_var')  # answer of example in minibatch
    self.answer_mask = T.matrix('answer_mask')
    self.answer_inp_var = T.tensor3('answer_inp_var')  # decoder input of example in minibatch

    q_shuffled = self.q_var.dimshuffle(1, 2, 0)  # now: story_len * image_size * batch_size

    print "==> building input module"
    # Now, we use a GRU to produce the input.
    # Forward GRU.
    self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    # Now, run the GRU over the inputs.
    # Two levels of nested scan are unnecessary and would only complicate things; a single scan is enough.
    inp_dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX))
    q_inp, _ = theano.scan(fn=self.input_gru_step_forward,
                           sequences=q_shuffled,
                           outputs_info=[T.zeros_like(inp_dummy)])
    q_inp_shuffled = q_inp.dimshuffle(2, 0, 1)
    q_inp_last = q_inp_shuffled[:, -1, :].dimshuffle(1, 0)  # dim * batch_size

    # Now, share the parameter with the input module.
    self.W_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_inp_emb_q = nn_utils.normal_param(std=0.1, shape=(self.dim, ))
    inp_q = T.dot(self.W_inp_emb_q, q_inp_last) + self.b_inp_emb_q.dimshuffle(0, 'x')  # 512 x 50
    self.q_q = T.tanh(inp_q)  # Since this is used to initialize the decoder, we need to make it tanh.

    print "==> building answer module"
    answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0)
    # Next, embed the answer tokens into the hidden space.
    self.W_inp_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.vocab_size + 1))

    def _dot2(x, W):
        return T.dot(W, x)

    answer_inp_var_shuffled_emb, _ = theano.scan(fn=_dot2,
                                                 sequences=answer_inp_var_shuffled,
                                                 non_sequences=self.W_inp_emb)  # seq x dim x batch

    self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim))

    self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
    self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    results, _ = theano.scan(fn=self.answer_gru_step,
                             sequences=answer_inp_var_shuffled_emb,
                             outputs_info=[self.q_q])
    # Assume there is a start token
    #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
    #                          self.q_var: np.random.rand(10, 4096).astype('float32'),
    #                          self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')
    #results = results[:-1,:,:] # get rid of the last token as well as the first one (image)
    #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
    #                          self.q_var: np.random.rand(10, 4096).astype('float32'),
    #                          self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore')

    # Now, we need to transform it to the probabilities.
    prob_f, _ = theano.scan(fn=lambda x, w: T.dot(w, x),
                            sequences=results,
                            non_sequences=self.W_a)
    prob = prob_f[:-1, :, :]
    prob_shuffled = prob.dimshuffle(2, 0, 1)  # b * len * vocab
    prob_f_shuffled = prob_f.dimshuffle(2, 0, 1)

    logging.info("prob shape.")
    #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'),
    #                       self.q_var: np.random.rand(10, 4096).astype('float32'),
    #                       self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')})

    n = prob_shuffled.shape[0] * prob_shuffled.shape[1]
    prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2]))
    prob_sm = nn_utils.softmax_(prob_rhp)
    self.pred = prob_sm

    n_f = prob_f_shuffled.shape[0] * prob_f_shuffled.shape[1]
    prob_f_rhp = T.reshape(prob_f_shuffled, (n_f, prob_f_shuffled.shape[2]))
    prob_f_sm = nn_utils.softmax_(prob_f_rhp)
    self.prediction = T.reshape(prob_f_sm,
                                (prob_f_shuffled.shape[0], prob_f_shuffled.shape[1],
                                 prob_f_shuffled.shape[2]))

    mask = T.reshape(self.answer_mask, (n, ))
    lbl = T.reshape(self.answer_var, (n, ))

    self.params = [
        self.W_inpf_res_in, self.W_inpf_res_hid, self.b_inpf_res,
        self.W_inpf_upd_in, self.W_inpf_upd_hid, self.b_inpf_upd,
        self.W_inpf_hid_in, self.W_inpf_hid_hid, self.b_inpf_hid,
        self.W_inp_emb_q, self.b_inp_emb_q,
        self.W_a, self.W_inp_emb,
        self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
        self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
        self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid,
    ]

    print "==> building loss layer and computing updates"
    loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl)
    self.loss_ce = (mask * loss_vec).sum() / mask.sum()
    #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl)

    if self.l2 > 0:
        self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
    else:
        self.loss_l2 = 0

    self.loss = self.loss_ce + self.loss_l2

    updates = lasagne.updates.adadelta(self.loss, self.params,
                                       learning_rate=self.learning_rate)
    #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001)

    if self.mode == 'train':
        print "==> compiling train_fn"
        self.train_fn = theano.function(
            inputs=[self.q_var, self.answer_var, self.answer_mask,
                    self.answer_inp_var],
            outputs=[self.pred, self.loss],
            updates=updates)

    print "==> compiling test_fn"
    self.test_fn = theano.function(
        inputs=[self.q_var, self.answer_var, self.answer_mask,
                self.answer_inp_var],
        outputs=[self.pred, self.loss])

    print "==> compiling pred_fn"
    self.pred_fn = theano.function(
        inputs=[self.q_var, self.answer_inp_var],
        outputs=[self.prediction])
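# NOTE: self.pred (used for the masked cross-entropy) drops the final time step so each
# predicted distribution lines up with answer_var / answer_mask, while self.prediction
# keeps every step so pred_fn can be used for generation. A minimal greedy decoding loop
# might look like the sketch below; the start-token convention (index 0 here) is an
# assumption, the real one lives in the data-processing / evaluation scripts:
#
#   inp = np.zeros((bs, 1, vocab_size + 1), dtype='float32')
#   inp[:, 0, 0] = 1.0                                    # assumed start token
#   for t in range(max_len):
#       probs = model.pred_fn(fc_feats, inp)[0]           # (bs, t + 1, vocab_size + 1)
#       nxt = probs[:, -1, :].argmax(axis=1)              # greedy pick for each story
#       step = np.zeros((bs, 1, vocab_size + 1), dtype='float32')
#       step[np.arange(bs), 0, nxt] = 1.0
#       inp = np.concatenate([inp, step], axis=1)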
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {None: 0} self.ivocab = {0: None} self.word2vec = word2vec self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in self.max_inp_sent_len = 0 self.max_q_len = 0 """ #To Use All Vocab self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 
'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #self.vocab = {'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 
88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} #self.ivocab = {1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 
'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #""" self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.imatrix('input_var') self.q_var = T.ivector('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.attentions = [] self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len) self.pe_matrix_q = self.pe_matrix(self.max_q_len) print "==> building input module" #positional encoder weights self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) #biGRU input fusion weights self.W_inp_res_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_res_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.inp_sent_reps, _ = theano.scan( fn=self.sum_pos_encodings_in, sequences=self.input_var) self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps) #self.inp_c = self.input_module_full(self.inp_sent_reps_stacked) self.inp_c = self.input_module_full(self.inp_sent_reps) self.q_q = self.sum_pos_encodings_q(self.q_var) print "==> creating 
parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,)) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, 4 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, 1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(self.memory_hops, 1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): self.mem_weight_num = int(iter - 1) current_episode = self.new_episode(memory[iter - 1]) memory.append(self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in[self.mem_weight_num], self.W_mem_res_hid[self.mem_weight_num], self.b_mem_res[self.mem_weight_num], self.W_mem_upd_in[self.mem_weight_num], self.W_mem_upd_hid[self.mem_weight_num], self.b_mem_upd[self.mem_weight_num], self.W_mem_hid_in[self.mem_weight_num], self.W_mem_hid_hid[self.mem_weight_num], self.b_mem_hid[self.mem_weight_num])) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = 
nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # add conditional ending? dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan(fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [self.W_pe, self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd, self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd, self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd, self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd, self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd, self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a] if self.answer_module == 'recurrent': self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params, learning_rate=0.0001, beta1=0.5) #from DCGAN paper #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) self.attentions = T.stack(self.attentions) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], outputs=[self.prediction, self.loss, self.attentions], updates=updates, on_unused_input='warn', allow_input_downcast=True) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], outputs=[self.prediction, self.loss, self.attentions], on_unused_input='warn', allow_input_downcast=True)
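# NOTE: pe_matrix / sum_pos_encodings_in are not shown in this snippet. Assuming they
# implement the DMN+ positional encoding (Xiong et al., 2016), a sentence of M word
# embeddings w_1..w_M (rows of W_pe) is reduced to a single sentence vector
#
#   f = sum_j l_j * w_j,  where  l[j, d] = (1 - j / M) - (d / D) * (1 - 2 * j / M)
#
# with j the word position and D = self.dim; these sentence vectors are then fused by
# the forward/backward GRUs (W_inp_*_fwd / W_inp_*_bwd) inside input_module_full.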
def __init__(self, stories, QAs, batch_size, story_v, learning_rate, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, story_source, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): #print "==> not used params in DMN class:", kwargs.keys() self.learning_rate = learning_rate self.rng = np.random self.rng.seed(1234) mqa = MovieQA.DataLoader() ### Load Word2Vec model w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') self.w2v = w2v_model self.d_w2v = len(w2v_model.get_vector(w2v_model.vocab[1])) self.word_thresh = 1 print "Loaded word2vec model: dim = %d | vocab-size = %d" % ( self.d_w2v, len(w2v_model.vocab)) ### Create vocabulary-to-index and index-to-vocabulary v2i = {'': 0, 'UNK': 1} # vocabulary to index QA_words, v2i = self.create_vocabulary( QAs, stories, v2i, w2v_vocab=w2v_model.vocab.tolist(), word_thresh=self.word_thresh) i2v = {v: k for k, v in v2i.iteritems()} self.vocab = v2i self.ivocab = i2v self.story_v = story_v self.word2vec = w2v_model self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.batch_size = batch_size self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in #self.max_inp_sent_len = 0 #self.max_q_len = 0 ### Convert QAs and stories into numpy matrices (like in the bAbI data set) # storyM - Dictionary - indexed by imdb_key. Values are [num-sentence X max-num-words] # questionM - NP array - [num-question X max-num-words] # answerM - NP array - [num-question X num-answer-options X max-num-words] storyM, questionM, answerM = self.data_in_matrix_form( stories, QA_words, v2i) qinfo = self.associate_additional_QA_info(QAs) ### Split everything into train, val, and test data #train_storyM = {k:v for k, v in storyM.iteritems() if k in mqa.data_split['train']} #val_storyM = {k:v for k, v in storyM.iteritems() if k in mqa.data_split['val']} #test_storyM = {k:v for k, v in storyM.iteritems() if k in mqa.data_split['test']} def split_train_test(long_list, QAs, trnkey='train', tstkey='val'): # Create train/val/test splits based on key train_split = [ item for k, item in enumerate(long_list) if QAs[k].qid.startswith('train') ] val_split = [ item for k, item in enumerate(long_list) if QAs[k].qid.startswith('val') ] test_split = [ item for k, item in enumerate(long_list) if QAs[k].qid.startswith('test') ] if type(long_list) == np.ndarray: return np.array(train_split), np.array(val_split), np.array( test_split) else: return train_split, val_split, test_split train_questionM, val_questionM, test_questionM = split_train_test( questionM, QAs) train_answerM, val_answerM, test_answerM, = split_train_test( answerM, QAs) train_qinfo, val_qinfo, test_qinfo = split_train_test(qinfo, QAs) QA_train = [qa for qa in QAs if qa.qid.startswith('train:')] QA_val = [qa for qa in QAs if qa.qid.startswith('val:')] QA_test = [qa for qa in QAs if qa.qid.startswith('test:')] #train_data = {'s':train_storyM, 'q':train_questionM, 'a':train_answerM, 'qinfo':train_qinfo} #val_data = {'s':val_storyM, 'q':val_questionM, 'a':val_answerM, 'qinfo':val_qinfo} #test_data = {'s':test_storyM, 'q':test_questionM, 'a':test_answerM, 'qinfo':test_qinfo} with open('train_split.json') as fid: trdev = json.load(fid) s_key = self.story_v.keys() self.train_range = [ k for k, qi in enumerate(qinfo) if 
(qi['movie'] in trdev['train'] and qi['qid'] in s_key) ] self.train_val_range = [ k for k, qi in enumerate(qinfo) if (qi['movie'] in trdev['dev'] and qi['qid'] in s_key) ] self.val_range = [ k for k, qi in enumerate(val_qinfo) if qi['qid'] in s_key ] self.max_sent_len = max( [sty.shape[0] for sty in self.story_v.values()]) self.train_input = self.story_v self.train_val_input = self.story_v self.test_input = self.story_v self.train_q = train_questionM self.train_answer = train_answerM self.train_qinfo = train_qinfo self.train_val_q = train_questionM self.train_val_answer = train_answerM self.train_val_qinfo = train_qinfo self.test_q = val_questionM self.test_answer = val_answerM self.test_qinfo = val_qinfo """Setup some configuration parts of the model. """ self.v2i = v2i self.vs = len(v2i) self.d_lproj = 300 # define Look-Up-Table mask np_mask = np.vstack( (np.zeros(self.d_w2v), np.ones((self.vs - 1, self.d_w2v)))) T_mask = theano.shared(np_mask.astype(theano.config.floatX), name='LUT_mask') # setup Look-Up-Table to be Word2Vec self.pca_mat = None print "Initialize LUTs as word2vec and use linear projection layer" self.LUT = np.zeros((self.vs, self.d_w2v), dtype='float32') found_words = 0 for w, v in self.v2i.iteritems(): if w in self.w2v.vocab: # all valid words are already in vocab or 'UNK' self.LUT[v] = self.w2v.get_vector(w) found_words += 1 else: # LUT[v] = np.zeros((self.d_w2v)) self.LUT[v] = self.rng.randn(self.d_w2v) self.LUT[v] = self.LUT[v] / (np.linalg.norm(self.LUT[v]) + 1e-6) print "Found %d / %d words" % (found_words, len(self.v2i)) # word 0 is blanked out, word 1 is 'UNK' self.LUT[0] = np.zeros((self.d_w2v)) # if linear projection layer is not the same shape as LUT, then initialize with PCA if self.d_lproj != self.LUT.shape[1]: pca = PCA(n_components=self.d_lproj, whiten=True) self.pca_mat = pca.fit_transform(self.LUT.T) # 300 x 100? # setup LUT! 
self.T_w2v = theano.shared(self.LUT.astype(theano.config.floatX)) self.train_input_mask = np_mask self.test_input_mask = np_mask #self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw) #self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.tensor3( 'input_var') # batch-size X sentences X 4096 self.q_var = T.matrix('question_var') # batch-size X 300 self.answer_var = T.tensor3( 'answer_var') # batch-size X multiple options X 300 self.input_mask_var = T.imatrix('input_mask_var') self.target = T.ivector( 'target' ) # batch-size ('single': word index, 'multi_choice': correct option) self.attentions = [] #self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len) #self.pe_matrix_q = self.pe_matrix(self.max_q_len) print "==> building input module" #positional encoder weights self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) #biGRU input fusion weights self.W_inp_res_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_res_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.inp_sent_reps = self.input_var self.ans_reps = self.answer_var self.inp_c = self.input_module_full(self.inp_sent_reps) self.q_q = self.q_var print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) 
self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 4 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) #last_mem_raw = memory[-1].dimshuffle(('x', 0)) last_mem_raw = memory[-1] net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(300, self.dim)) if self.answer_module == 'feedforward': self.temp = T.dot(self.ans_reps, self.W_a) self.prediction = nn_utils.softmax(T.dot(self.temp, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # add conditional ending? 
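# A few hedged notes on the memory and answer parameters above:
# * W_1/W_2 with b_1/b_2 form the two-layer attention MLP; the 4 * self.dim
#   input width suggests gate features of the form [c*q, c*m, |c-q|, |c-m|]
#   (the larger 7*dim + 2 variant with bilinear W_b terms is commented out).
# * Each memory hop calls GRU_update(memory, episode) with the same shared
#   W_mem_* / b_mem_* weights, so the memory is refined hop by hop.
# * In answer_step the GRU input is T.concatenate([prev_y, self.q_q]), i.e.
#   the previous softmax output concatenated with the question vector.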
dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_pe, self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd, self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd, self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd, self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd, self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd, self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" #tmp= self.prediction.dimshuffle(2,0,1) #res, _ =theano.scan(fn = lambda inp: inp, sequences=tmp) #self.prediction = res[-1] self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.target) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = T.mean(self.loss_ce) + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.adam(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params, learning_rate=self.learning_rate, beta1=0.5) #from DCGAN paper #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) self.attentions = T.stack(self.attentions) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.target ], outputs=[self.prediction, self.loss, self.attentions], updates=updates, on_unused_input='warn', allow_input_downcast=True) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[self.input_var, self.q_var, self.answer_var, self.target], outputs=[self.prediction, self.loss, self.attentions], on_unused_input='warn', allow_input_downcast=True)
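# All of these constructors rely on a shared GRU_update(h_prev, x, W_res_in,
# W_res_hid, b_res, W_upd_in, W_upd_hid, b_upd, W_hid_in, W_hid_hid, b_hid)
# helper. The snippet below is a minimal NumPy sketch of that update rule,
# assuming the standard GRU equations; the function name, the NumPy backend
# and the gating convention are illustrative only, not taken from the code.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_update(h_prev, x,
               W_res_in, W_res_hid, b_res,
               W_upd_in, W_upd_hid, b_upd,
               W_hid_in, W_hid_hid, b_hid):
    # One GRU step: x is the current input vector, h_prev the previous state.
    r = sigmoid(np.dot(W_res_in, x) + np.dot(W_res_hid, h_prev) + b_res)   # reset gate
    z = sigmoid(np.dot(W_upd_in, x) + np.dot(W_upd_hid, h_prev) + b_upd)   # update gate
    h_tilde = np.tanh(np.dot(W_hid_in, x) + np.dot(W_hid_hid, r * h_prev) + b_hid)
    return (1.0 - z) * h_prev + z * h_tilde

# Tiny usage example with random parameters (dim = 4, input size = 3).
rng = np.random.RandomState(0)
dim, in_dim = 4, 3
params = [rng.randn(dim, in_dim), rng.randn(dim, dim), np.zeros(dim),
          rng.randn(dim, in_dim), rng.randn(dim, dim), np.zeros(dim),
          rng.randn(dim, in_dim), rng.randn(dim, dim), np.zeros(dim)]
h_next = gru_update(np.zeros(dim), rng.randn(in_dim), *params)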
def __init__(self,word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout,h5file,json_dict_file ,num_answers,img_vector_size,img_seq_len, img_h5file_train,img_h5file_test,**kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.lr=0.001 self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.h5file=h5py.File(h5file,"r") self.img_h5file_train=h5py.File(img_h5file_train,"r") self.img_h5file_test=h5py.File(img_h5file_test,"r") self.img_seq_len=img_seq_len self.img_vector_size=img_vector_size with open (json_dict_file) as f: self.json_dict=json.load(f) #self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input(babi_train_raw) #self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input(babi_test_raw) #self.vocab_size = len(self.vocab) self.vocab_size=num_answers self.input_var = T.tensor3('input_var') # (batch_size, seq_len, glove_dim) self.img_input_var=T.tensor3('img_input_var') # (batch_size * img_seq_len , img_vector_size) self.q_var = T.tensor3('question_var') # as self.input_var self.answer_var = T.ivector('answer_var') # answer of example in minibatch self.fact_count_var = T.ivector('fact_count_var') # number of facts in the example of minibatch self.input_mask_var = T.imatrix('input_mask_var') # (batch_size, indices) print "==> building input module" self.W_inp_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) input_var_shuffled = self.input_var.dimshuffle(1, 2, 0) inp_dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=input_var_shuffled, outputs_info=T.zeros_like(inp_dummy)) inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1) inp_c_list = [] inp_c_mask_list = [] for batch_index in range(self.batch_size): taken = inp_c_history_shuffled[batch_index].take(self.input_mask_var[batch_index, :self.fact_count_var[batch_index]], axis=0) inp_c_list.append(T.concatenate([taken, T.zeros((self.input_mask_var.shape[1] - taken.shape[0], self.dim), floatX)])) inp_c_mask_list.append(T.concatenate([T.ones((taken.shape[0],), np.int32), T.zeros((self.input_mask_var.shape[1] - taken.shape[0],), np.int32)])) self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0) inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0) ###################### Adding the Image Input Module print "==> building image img_input module" ### Don't Really 
Need the GRU to reduce the sentences into vectors ### self.img_input_var=T.reshape(self.img_input_var , ( self.batch_size * self.img_seq_len , self.img_vector_size )) img_input_layer=layers.InputLayer( shape=(self.batch_size*self.img_seq_len, self.img_vector_size), input_var=self.img_input_var) ## Convert the img_vector_size to self.dim using a MLP ## img_input_layer=layers.DenseLayer( img_input_layer , num_units=self.dim ) img_input_var_dim=layers.get_output(img_input_layer) img_input_var_dim=T.reshape(img_input_var_dim ,(self.batch_size , self.img_seq_len , self.dim ) ) #self.img_inp_c = T.stack(img_input_var_dim).dimshuffle(1, 2, 0) self.img_inp_c = img_input_var_dim.dimshuffle(1,2,0) ################################################### q_var_shuffled = self.q_var.dimshuffle(1, 2, 0) q_dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX)) q_q_history, _ = theano.scan(fn=self.input_gru_step, sequences=q_var_shuffled, outputs_info=T.zeros_like(q_dummy)) self.q_q = q_q_history[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append(self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) ################################# Episodic Memory Module for Image print "==> creating parameters for image memory module" self.W_img_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_img_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_img_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_img_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_img_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_img_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_img_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_img_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_img_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_img_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_img_1 = nn_utils.normal_param(std=0.1, 
shape=(self.dim, 7 * self.dim + 0)) self.W_img_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_img_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_img_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic img_memory module (fixed number of steps: %d)" % self.memory_hops img_memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_img_episode(img_memory[iter - 1]) img_memory.append(self.GRU_update(img_memory[iter - 1], current_episode, self.W_img_mem_res_in, self.W_img_mem_res_hid, self.b_img_mem_res, self.W_img_mem_upd_in, self.W_img_mem_upd_hid, self.b_img_mem_upd, self.W_img_mem_hid_in, self.W_img_mem_hid_hid, self.b_img_mem_hid)) last_img_mem_raw = img_memory[-1].dimshuffle((1, 0)) ####################################################################### ### Concatenating The 2 Memory Modules Representations Assuming the representation as self.batch_size x self.dim ### combined_mem_raw=T.concatenate([last_mem_raw,last_img_mem_raw],axis=1) #net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) net = layers.InputLayer(shape=(self.batch_size, self.dim+self.dim), input_var=combined_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" #self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim+self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(2*self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(2*self.dim, 2*self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(2*self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(2*self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(2*self.dim,2*self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(2*self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(2*self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(2*self.dim, 2*self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(2*self.dim,)) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, self.batch_size), dtype=floatX)) results, updates = theano.scan(fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], #(last_mem, y) n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") self.prediction = self.prediction.dimshuffle(1, 0) self.params = [self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, 
self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, ## Add the parameters of the Image Input Module self.W_img_mem_res_in, self.W_img_mem_res_hid, self.b_img_mem_res, self.W_img_mem_upd_in, self.W_img_mem_upd_hid, self.b_img_mem_upd, self.W_img_mem_hid_in, self.W_img_mem_hid_hid, self.b_img_mem_hid, #self.W_img_b_img self.W_img_1, self.W_img_2, self.b_img_1, self.b_img_2] ## Add the parameters of the Image Input Module dim_transform_mlp_params=layers.get_all_params(img_input_layer ) self.params=self.params+ dim_transform_mlp_params if self.answer_module == 'recurrent': self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.answer_var).mean() if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 self.learning_rate=T.scalar(name="learning_rate") updates=lasagne.updates.adam(self.loss,self.params,learning_rate=self.learning_rate) #updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var,self.img_input_var,self.learning_rate], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var,self.img_input_var,self.learning_rate],on_unused_input='ignore', outputs=[self.prediction, self.loss])
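# The class above keeps two parallel episodic memories (text facts and image
# regions), concatenates their final states into a 2*dim vector and feeds it
# to a softmax answer layer. Below is a minimal NumPy sketch of that final
# step; the function names and shapes are illustrative, not repository code.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def dual_memory_answer(mem_text, mem_img, W_a):
    # mem_text, mem_img: (dim,) final memories; W_a: (num_answers, 2*dim).
    combined = np.concatenate([mem_text, mem_img])      # (2*dim,)
    return softmax(np.dot(W_a, combined))               # distribution over answers

rng = np.random.RandomState(0)
dim, num_answers = 8, 5
probs = dual_memory_answer(rng.randn(dim), rng.randn(dim),
                           rng.randn(num_answers, 2 * dim))
assert abs(probs.sum() - 1.0) < 1e-6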
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, memory_hops, dim, mode, input_mask_mode, l2, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.input_mask_mode = input_mask_mode self.l2 = l2 self.batch_norm = batch_norm self.dropout = dropout self.memory_hops = memory_hops self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_gates = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_gates = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) print "Train size: ", len(self.train_input) print "Test size: ", len(self.test_input) print "Vocab size: ", self.vocab_size self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.gates_var = T.ivector( 'gates_var') # attention gate (including end_reading) self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.end_reading = nn_utils.constant_param(value=0.0, shape=(1, self.dim)) inp_c_tag = T.concatenate([inp_c_history, self.end_reading], axis=0) self.inp_c = inp_c_tag.take(self.input_mask_var, axis=0) #(facts_len,dim) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] #(1,dim) print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed 
number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(0, self.memory_hops): current_episode, g = self.new_episode(memory[iter]) self.attentions.append(g) memory.append( self.GRU_update(memory[iter], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) #(memory_hops, fact_cnt) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss_gate = T.nnet.categorical_crossentropy( self.attentions, self.gates_var).mean() self.loss = self.loss_ce + self.loss_l2 + self.loss_gate updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var ], allow_input_downcast=True, outputs=gradient)
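# The class above supervises the attention gates directly: self.attentions is
# a (memory_hops, num_facts + 1) stack of per-hop softmax distributions (the
# extra slot comes from the appended end_reading vector), and gates_var holds
# the index of the fact that should be attended at each hop. A hedged NumPy
# sketch of that extra cross-entropy term, with illustrative names:
import numpy as np

def gate_supervision_loss(attentions, gold_gates, eps=1e-8):
    # attentions: (hops, n_facts) rows summing to 1; gold_gates: (hops,) int indices.
    picked = attentions[np.arange(len(gold_gates)), gold_gates]
    return float(np.mean(-np.log(picked + eps)))

attn = np.array([[0.7, 0.2, 0.1],
                 [0.1, 0.1, 0.8]])
loss = gate_supervision_loss(attn, np.array([0, 2]))   # small: gates match targets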
def __init__(self, train_raw, dev_raw, test_raw, word2vec, word_vector_size, dim, mode, input_mask_mode, memory_hops, l2, normalize_attention, dropout, **kwargs): print "generate one-word answer for mctest" print "==> not used params in DMN class:", kwargs.keys() self.word2vec = word2vec self.word_vector_size = word_vector_size self.vocab_size = len(word2vec) self.dim = dim # hidden state size self.mode = mode self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( train_raw) self.dev_input, self.dev_q, self.dev_answer, self.dev_input_mask = self._process_input( dev_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( test_raw) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, 
self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], allow_input_downcast=True, outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions])
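# Both this class and the gate-supervised one above size their first attention
# layer as (dim, 7*dim + 2), which is consistent with the original DMN gate
# features: the fact c, memory m and question q, their element-wise products,
# absolute differences, and two bilinear terms through W_b. The NumPy sketch
# below is an assumption about that feature vector (ordering and names are
# illustrative), not a copy of new_episode:
import numpy as np

def attention_gate(c, m, q, W_b, W_1, b_1, W_2, b_2):
    # Gate score for one fact c, given current memory m and question q.
    z = np.concatenate([c, m, q,
                        c * q, c * m,
                        np.abs(c - q), np.abs(c - m),
                        [np.dot(c, np.dot(W_b, q))],
                        [np.dot(c, np.dot(W_b, m))]])             # length 7*dim + 2
    return (np.dot(W_2, np.tanh(np.dot(W_1, z) + b_1)) + b_2)[0]  # scalar, pre-softmax

dim = 4
rng = np.random.RandomState(0)
g = attention_gate(rng.randn(dim), rng.randn(dim), rng.randn(dim),
                   rng.randn(dim, dim),
                   rng.randn(dim, 7 * dim + 2), np.zeros(dim),
                   rng.randn(1, dim), np.zeros(1))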
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_fact_count, self.train_input_mask = self._process_input(babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_fact_count, self.test_input_mask = self._process_input(babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.tensor3('input_var') # (batch_size, seq_len, glove_dim) self.q_var = T.tensor3('question_var') # as self.input_var self.answer_var = T.ivector('answer_var') # answer of example in minibatch self.fact_count_var = T.ivector('fact_count_var') # number of facts in the example of minibatch self.input_mask_var = T.imatrix('input_mask_var') # (batch_size, indices) print "==> building input module" self.W_inp_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) input_var_shuffled = self.input_var.dimshuffle(1, 2, 0) inp_dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=input_var_shuffled, outputs_info=T.zeros_like(inp_dummy)) inp_c_history_shuffled = inp_c_history.dimshuffle(2, 0, 1) inp_c_list = [] inp_c_mask_list = [] for batch_index in range(self.batch_size): taken = inp_c_history_shuffled[batch_index].take(self.input_mask_var[batch_index, :self.fact_count_var[batch_index]], axis=0) inp_c_list.append(T.concatenate([taken, T.zeros((self.input_mask_var.shape[1] - taken.shape[0], self.dim), floatX)])) inp_c_mask_list.append(T.concatenate([T.ones((taken.shape[0],), np.int32), T.zeros((self.input_mask_var.shape[1] - taken.shape[0],), np.int32)])) self.inp_c = T.stack(inp_c_list).dimshuffle(1, 2, 0) inp_c_mask = T.stack(inp_c_mask_list).dimshuffle(1, 0) q_var_shuffled = self.q_var.dimshuffle(1, 2, 0) q_dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX)) q_q_history, _ = theano.scan(fn=self.input_gru_step, sequences=q_var_shuffled, outputs_info=T.zeros_like(q_dummy)) self.q_q = q_q_history[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = 
nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append(self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, self.batch_size), dtype=floatX)) results, updates = theano.scan(fn=self.answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], #(last_mem, y) n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") self.prediction = self.prediction.dimshuffle(1, 0) self.params = [self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, 
self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a] if self.answer_module == 'recurrent': self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.answer_var).mean() if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.fact_count_var, self.input_mask_var], outputs=[self.prediction, self.loss])
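# The batched class above pads each example's selected fact vectors up to the
# longest fact count in the batch and builds a parallel 0/1 mask, so that
# attention can ignore the padding. A small NumPy sketch of that padding step
# (the function name is illustrative only):
import numpy as np

def pad_facts(fact_list, dim):
    # fact_list: list of (n_i, dim) arrays; returns (batch, max_n, dim) plus a 0/1 mask.
    max_n = max(f.shape[0] for f in fact_list)
    padded = np.zeros((len(fact_list), max_n, dim), dtype=np.float32)
    mask = np.zeros((len(fact_list), max_n), dtype=np.int32)
    for i, f in enumerate(fact_list):
        padded[i, :f.shape[0]] = f
        mask[i, :f.shape[0]] = 1
    return padded, mask

facts, mask = pad_facts([np.ones((2, 3)), np.ones((4, 3))], dim=3)
# facts.shape == (2, 4, 3); mask.sum(axis=1) == [2, 4]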
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, sent_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, batch_norm, dropout, dropout_in, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {None: 0} self.ivocab = {0: None} self.word2vec = word2vec self.word_vector_size = word_vector_size self.sent_vector_size = sent_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.dropout_in = dropout_in self.max_inp_sent_len = 0 self.max_q_len = 0 """ #To Use All Vocab self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 
'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #self.vocab = {'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 
88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0} #self.ivocab = {1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 
'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'} #""" self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.imatrix('input_var') self.q_var = T.ivector('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.attentions = [] self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len) self.pe_matrix_q = self.pe_matrix(self.max_q_len) print "==> building input module" #positional encoder weights self.W_pe = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) #biGRU input fusion weights self.W_inp_res_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_fwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_res_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in_bwd = nn_utils.normal_param( std=0.1, shape=(self.dim, self.sent_vector_size)) self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in, sequences=self.input_var) self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps) #self.inp_c = self.input_module_full(self.inp_sent_reps_stacked) self.inp_c = self.input_module_full(self.inp_sent_reps) self.q_q = self.sum_pos_encodings_q(self.q_var) print "==> 
creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, self.dim, 4 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(self.memory_hops, 1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=( self.memory_hops, 1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): self.mem_weight_num = int(iter - 1) current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in[self.mem_weight_num], self.W_mem_res_hid[self.mem_weight_num], self.b_mem_res[self.mem_weight_num], self.W_mem_upd_in[self.mem_weight_num], self.W_mem_upd_hid[self.mem_weight_num], self.b_mem_upd[self.mem_weight_num], self.W_mem_hid_in[self.mem_weight_num], self.W_mem_hid_hid[self.mem_weight_num], self.b_mem_hid[self.mem_weight_num])) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, 
self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # add conditional ending? dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_pe, self.W_inp_res_in_fwd, self.W_inp_res_hid_fwd, self.b_inp_res_fwd, self.W_inp_upd_in_fwd, self.W_inp_upd_hid_fwd, self.b_inp_upd_fwd, self.W_inp_hid_in_fwd, self.W_inp_hid_hid_fwd, self.b_inp_hid_fwd, self.W_inp_res_in_bwd, self.W_inp_res_hid_bwd, self.b_inp_res_bwd, self.W_inp_upd_in_bwd, self.W_inp_upd_hid_bwd, self.b_inp_upd_bwd, self.W_inp_hid_in_bwd, self.W_inp_hid_hid_bwd, self.b_inp_hid_bwd, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params) updates = lasagne.updates.adam(self.loss, self.params, learning_rate=0.0001, beta1=0.5) #from DCGAN paper #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) self.attentions = T.stack(self.attentions) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss, self.attentions], updates=updates, on_unused_input='warn', allow_input_downcast=True) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss, self.attentions], on_unused_input='warn', allow_input_downcast=True)
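# The class above builds sentence representations with a positional-encoding
# matrix (self.pe_matrix) before the bidirectional fusion layer. The exact
# weights live in pe_matrix(); the sketch below shows one common formulation
# (the encoding used by end-to-end memory networks and DMN+) and is only an
# assumption about what that helper computes:
import numpy as np

def positional_encoding(sent_len, dim):
    # pe[j, k] weights the k-th embedding dimension of the j-th word; the fact
    # vector is sum_j pe[j] * embedding[j].
    pe = np.zeros((sent_len, dim), dtype=np.float32)
    for j in range(1, sent_len + 1):
        for k in range(1, dim + 1):
            pe[j - 1, k - 1] = (1.0 - float(j) / sent_len) \
                - (float(k) / dim) * (1.0 - 2.0 * j / sent_len)
    return pe

pe = positional_encoding(sent_len=6, dim=8)
words = np.random.rand(6, 8)                 # one sentence of word embeddings
fact_vector = (pe * words).sum(axis=0)       # positional-encoded sentence vector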
def __init__(self, data_dir, word2vec, word_vector_size, dim, mode,
             answer_module, memory_hops, batch_size, l2,
             normalize_attention, batch_norm, dropout, **kwargs):
    print "==> not used params in DMN class:", kwargs.keys()
    self.data_dir = data_dir
    self.word2vec = word2vec
    self.word_vector_size = word_vector_size
    self.dim = dim
    self.mode = mode
    self.answer_module = answer_module
    self.memory_hops = memory_hops
    self.batch_size = batch_size
    self.l2 = l2
    self.normalize_attention = normalize_attention
    self.batch_norm = batch_norm
    self.dropout = dropout

    self.vocab, self.ivocab = self._load_vocab(self.data_dir)

    self.train_story = None
    self.test_story = None
    self.train_dict_story, self.train_features, self.train_fns_dict, self.train_num_imgs = self._process_input_sind(
        self.data_dir, 'train')
    self.test_dict_story, self.test_features, self.test_fns_dict, self.test_num_imgs = self._process_input_sind(
        self.data_dir, 'val')

    self.train_story = self.train_dict_story.keys()
    self.test_story = self.test_dict_story.keys()
    self.vocab_size = len(self.vocab)

    self.input_var = T.tensor3('input_var')  # (batch_size, seq_len, cnn_dim)
    self.q_var = T.matrix('q_var')  # Now, it's a batch * image_size.
    self.answer_var = T.imatrix('answer_var')  # answer of example in minibatch
    self.answer_mask = T.matrix('answer_mask')
    self.answer_inp_var = T.tensor3('answer_inp_var')  # answer of example in minibatch

    print "==> building input module"
    # It's very simple now: the input module just needs to map from cnn_dim to dim.
    # cnn_dim is not in this constructor's signature but is used below; assumed
    # fix: take it from kwargs, defaulting to the 4096-d fc features used in the
    # shape checks in this file.
    self.cnn_dim = kwargs.get('cnn_dim', 4096)
    logging.info('self.cnn_dim = %d', self.cnn_dim)
    self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim))
    self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

    #inp_c_hist = T.dot(self.W_inp_emb_in, self.input_var) + self.b_inp_emb_in
    inp_var_shuffled = self.input_var.dimshuffle(1, 2, 0)
    print inp_var_shuffled.shape.eval(
        {self.input_var: np.random.rand(10, 4, 4096).astype('float32')})

    def _dot(x, W, b):
        return T.dot(W, x) + b.dimshuffle(0, 'x')

    inp_c_hist, _ = theano.scan(
        fn=_dot,
        sequences=inp_var_shuffled,
        non_sequences=[self.W_inp_emb_in, self.b_inp_emb_in])
    #inp_c_hist,_ = theano.scan(fn = _dot, sequences=self.input_var, non_sequences = [self.W_inp_emb_in, self.b_inp_emb_in])

    #self.inp_c = inp_c_hist.dimshuffle(2,0,1) # b x len x fea
    self.inp_c = inp_c_hist

    print "==> building question module"
    # Now, share the parameter with the input module.
q_var_shuffled = self.q_var.dimshuffle(1, 0) q_hist = T.dot(self.W_inp_emb_in, q_var_shuffled) + self.b_inp_emb_in.dimshuffle(0, 'x') self.q_q = q_hist.dimshuffle(0, 1) # batch x dim print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) #current_episode = self.new_episode(m) #current_episode = printing.Print('current_episode')(current_episode) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) logging.info('last_mem size') print last_mem.shape.eval({ self.input_var: np.random.rand(10, 4, 4096).astype('float32'), self.q_var: np.random.rand(10, 4096).astype('float32') }) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0) # because we have the additional #start token. Thus, we need to add this +1 for all the parameters as well. 
dummy = theano.shared( np.zeros((self.vocab_size + 1, self.batch_size), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size + 1)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size + 1)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size + 1)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) logging.info('answer_inp_var_shuffled size') print answer_inp_var_shuffled.shape.eval({ self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32') }) #last_mem = printing.Print('prob_sm')(last_mem) results, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp_var_shuffled, outputs_info=[last_mem]) # Assume there is a start token print results.shape.eval({ self.input_var: np.random.rand(10, 4, 4096).astype('float32'), self.q_var: np.random.rand(10, 4096).astype('float32'), self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32') }) results = results[0:-1, :, :] # get rid of the last token. print results.shape.eval({ self.input_var: np.random.rand(10, 4, 4096).astype('float32'), self.q_var: np.random.rand(10, 4096).astype('float32'), self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32') }) # Now, we need to transform it to the probabilities. 
prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x), sequences=results, non_sequences=self.W_a) prob_shuffled = prob.dimshuffle(2, 0, 1) # b * len * vocab logging.info("prob shape.") print prob.shape.eval({ self.input_var: np.random.rand(10, 4, 4096).astype('float32'), self.q_var: np.random.rand(10, 4096).astype('float32'), self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32') }) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) prob_sm = nn_utils.softmax(prob_rhp) self.prediction = prob_sm mask = T.reshape(self.answer_mask, (n, )) lbl = T.reshape(self.answer_var, (n, )) self.params = [ self.W_inp_emb_in, self.b_inp_emb_in, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss])
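# --- Illustrative sketch (not part of the model above) ----------------------
# NumPy version of the masked sequence cross-entropy computed right above:
# probabilities are flattened to (batch * seq_len, vocab), labels and the
# answer mask to (batch * seq_len,), and padded positions (mask == 0) are
# excluded from the average.  Names and shapes here are assumptions.
import numpy as np

def masked_seq_xent_sketch(prob_sm, lbl, mask):
    """prob_sm: (n, vocab) rows summing to 1; lbl: (n,) int; mask: (n,) 0/1."""
    n = prob_sm.shape[0]
    nll = -np.log(prob_sm[np.arange(n), lbl] + 1e-12)  # per-token negative log-likelihood
    return (mask * nll).sum() / mask.sum()             # average over real (unmasked) tokens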
def __init__(self, train_raw, dev_raw, test_raw, word2vec, word_vector_size, answer_module, dim, mode, input_mask_mode, memory_hops, l2, normalize_attention, dropout, **kwargs): print "generate sentence answer for mctest" print "==> not used params in DMN class:", kwargs.keys() self.word2vec = word2vec self.word_vector_size = word_vector_size # add eng_of_sentence tag for answer generation #self.end_tag = len(word2vec) #self.vocab_size = self.end_tag+1 self.vocab_size = len(word2vec) self.dim = dim # hidden state size self.mode = mode self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.answer_module = answer_module self.l2 = l2 self.normalize_attention = normalize_attention self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_max_n = self._process_input( train_raw) self.dev_input, self.dev_q, self.dev_answer, self.dev_input_mask, self.dev_max_n = self._process_input( dev_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_max_n = self._process_input( test_raw) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.ivector('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.max_n = T.iscalar('max_n') self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % 
self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) self.prediction = self.prediction.dimshuffle('x', 0) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [ a, y ] #, theano.scan_module.until(n>=max_n)) # or argmax==self.end_tag) dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=self.max_n) self.prediction = results[1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': #feedforward': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction, self.answer_var).sum() if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( 
inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.max_n ], allow_input_downcast=True, outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.max_n ], allow_input_downcast=True, outputs=[self.prediction, self.loss, self.attentions])
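# --- Illustrative sketch (not part of the model above) ----------------------
# The attention weights W_1 of shape (dim, 7 * dim + 2) above suggest the
# standard DMN similarity features between a fact c, the current memory m and
# the question q.  A NumPy sketch of one attention gate under that assumption;
# the exact feature ordering inside new_episode / the attention step may differ.
import numpy as np

def attention_gate_sketch(c, m, q, W_b, W_1, b_1, W_2, b_2):
    z = np.concatenate([
        c, m, q,                        # 3 * dim
        c * q, c * m,                   # 2 * dim
        np.abs(c - q), np.abs(c - m),   # 2 * dim
        [c.dot(W_b).dot(q)],            # 1: bilinear c-q similarity
        [c.dot(W_b).dot(m)],            # 1: bilinear c-m similarity -> 7 * dim + 2 total
    ])
    # two-layer scoring MLP, matching the W_1 / b_1 / W_2 / b_2 shapes above
    return float(W_2.dot(np.tanh(W_1.dot(z) + b_1)) + b_2)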
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, memory_hops, dim, mode, input_mask_mode, l2, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.input_mask_mode = input_mask_mode self.l2 = l2 self.batch_norm = batch_norm self.dropout = dropout self.memory_hops = memory_hops self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_gates = self._process_input(babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_gates = self._process_input(babi_test_raw) self.vocab_size = len(self.vocab) print "Train size: ", len(self.train_input) print "Test size: ", len(self.test_input) print "Vocab size: ", self.vocab_size self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.gates_var = T.ivector('gates_var') # attention gate (including end_reading) self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.end_reading = nn_utils.constant_param(value=0.0,shape=(1,self.dim)) inp_c_tag = T.concatenate([inp_c_history,self.end_reading],axis=0) self.inp_c = inp_c_tag.take(self.input_mask_var, axis=0) #(facts_len,dim) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] #(1,dim) print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" 
% self.memory_hops memory = [self.q_q.copy()] for iter in range(0, self.memory_hops): current_episode, g = self.new_episode(memory[iter]) self.attentions.append(g) memory.append(self.GRU_update(memory[iter], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) #(memory_hops, fact_cnt) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss_gate = T.nnet.categorical_crossentropy(self.attentions, self.gates_var).mean() self.loss = self.loss_ce + self.loss_l2 + self.loss_gate updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var], allow_input_downcast = True, outputs=[self.prediction, self.loss, self.attentions], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var], allow_input_downcast = True, outputs=[self.prediction, self.loss, self.attentions]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var, self.gates_var], allow_input_downcast = True, outputs=gradient)
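# --- Illustrative sketch (not part of the model above) ----------------------
# The gate supervision used above: self.attentions stacks one softmax over the
# facts (plus the extra end_reading slot) per memory hop, and gates_var holds
# the index of the supporting fact for each hop.  NumPy version of that extra
# loss term; names are assumptions.
import numpy as np

def gate_supervision_loss_sketch(attentions, gates_var):
    """attentions: (hops, fact_cnt) rows summing to 1; gates_var: (hops,) int."""
    hops = attentions.shape[0]
    picked = attentions[np.arange(hops), gates_var]      # prob assigned to the true fact
    return float(np.mean(-np.log(picked + 1e-12)))       # per-hop cross-entropy, averaged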
def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim, story_len, patches,cnn_dim_fc,truncate_gradient, learning_rate, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.truncate_gradient = truncate_gradient self.learning_rate = learning_rate self.trng = RandomStreams(1234) self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.cnn_dim_fc = cnn_dim_fc self.story_len = story_len self.patches = patches self.mode = mode self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind_lmdb(self.data_dir, 'train') self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind_lmdb(self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) self.alpha_entropy_c = 0.02 # for hard attention. # This is the local patch of each image. self.input_var = T.tensor4('input_var') # (batch_size, seq_len, patches, cnn_dim) self.q_var = T.tensor3('q_var') # Now, it's a batch * story_len * image_sieze. self.answer_var = T.ivector('answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_idx = T.imatrix('answer_idx') # batch x seq self.answer_inp_var = T.tensor3('answer_inp_var') # answer of example in minibatch print "==> building input module" # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) logging.info('self.cnn_dim_fc = %d', self.cnn_dim_fc) logging.info('self.dim = %d', self.dim) self.W_q_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.b_q_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) logging.info('Building the glob attention model') self.W_glb_att_1 = nn_utils.normal_param(std = 0.1, shape = (self.dim, 2 * self.dim)) self.W_glb_att_2 = nn_utils.normal_param(std = 0.1, shape = (1, self.dim)) self.b_glb_att_1 = nn_utils.constant_param(value = 0.0, shape = (self.dim,)) self.b_glb_att_2 = nn_utils.constant_param(value = 0.0, shape = (1,)) q_var_shuffled = self.q_var.dimshuffle(1,2,0) # seq x cnn x batch. 
def _dot(x, W, b): return T.tanh( T.dot(W, x) + b.dimshuffle(0, 'x')) q_var_shuffled_emb,_ = theano.scan(fn = _dot, sequences= q_var_shuffled, non_sequences = [self.W_q_emb_in, self.b_q_emb_in]) print 'q_var_shuffled_emb', q_var_shuffled_emb.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_var_emb = q_var_shuffled_emb.dimshuffle(2,0,1) # batch x seq x emb_size q_var_emb_ext = q_var_emb.dimshuffle(0,'x',1,2) q_var_emb_ext = T.repeat(q_var_emb_ext, q_var_emb.shape[1],1) # batch x seq x seq x emb_size q_var_emb_rhp = T.reshape( q_var_emb, (q_var_emb.shape[0] * q_var_emb.shape[1], q_var_emb.shape[2])) q_var_emb_ext_rhp = T.reshape(q_var_emb_ext, (q_var_emb_ext.shape[0] * q_var_emb_ext.shape[1],q_var_emb_ext.shape[2], q_var_emb_ext.shape[3])) q_var_emb_ext_rhp = q_var_emb_ext_rhp.dimshuffle(0,2,1) q_idx = T.arange(self.story_len).dimshuffle('x',0) q_idx = T.repeat(q_idx,self.batch_size, axis = 0) q_idx = T.reshape(q_idx, (q_idx.shape[0]* q_idx.shape[1],)) print q_idx.eval() print 'q_var_emb_rhp.shape', q_var_emb_rhp.shape.eval({self.q_var:np.random.rand(3,5,4096).astype('float32')}) print 'q_var_emb_ext_rhp.shape', q_var_emb_ext_rhp.shape.eval({self.q_var:np.random.rand(3,5,4096).astype('float32')}) #att_alpha,_ = theano.scan(fn = self.new_attention_step_glob, sequences = [q_var_emb_rhp, q_var_emb_ext_rhp, q_idx] ) alpha,_ = theano.scan(fn = self.new_attention_step_glob, sequences = [q_var_emb_rhp, q_var_emb_ext_rhp, q_idx] ) att_alpha = alpha[1] att_alpha_a = alpha[0] #print results.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'), print att_alpha.shape.eval({self.q_var:np.random.rand(3,5,4096).astype('float32')}) # att_alpha: (batch x seq) x seq) self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) inp_rhp = T.reshape(self.input_var, (self.batch_size* self.story_len* self.patches, self.cnn_dim)) inp_rhp_dimshuffled = inp_rhp.dimshuffle(1,0) inp_rhp_emb = T.dot(self.W_inp_emb_in, inp_rhp_dimshuffled) + self.b_inp_emb_in.dimshuffle(0,'x') inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1,0) inp_emb_raw = T.reshape(inp_rhp_emb_dimshuffled, (self.batch_size, self.story_len, self.patches, self.cnn_dim)) inp_emb = T.tanh(inp_emb_raw) # Just follow the paper DMN for visual and textual QA. #inp_emb = inp_emb.dimshuffle(0,'x', 1,2) #inp_emb = T.repeat(inp_emb, self.story_len, 1) #print inp_emb.shape.eval({self.input_var:np.random.rand(3,5,196, 4096).astype('float32')}) att_alpha_sample = self.trng.multinomial(pvals = att_alpha, dtype=theano.config.floatX) att_mask = att_alpha_sample.argmax(1) print 'att_mask.shape', att_mask.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) print 'att_mask', att_mask.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) # No time to fix the hard attention, now we use the soft attention. idx_t = T.repeat(T.arange(self.input_var.shape[0]), self.input_var.shape[1]) print 'idx_t', idx_t.eval({self.input_var:np.random.rand(2,5,196,512).astype('float32')}) att_input = inp_emb[idx_t, att_mask,:,:] # (batch x seq) x batches x emb_size att_input = T.reshape(att_input, (self.batch_size, self.story_len, self.patches, self.dim)) print 'att_input', att_input.shape.eval({self.input_var:np.random.rand(2,5,196,512).astype('float32'),self.q_var:np.random.rand(2,5,4096).astype('float32')}) # Now, it's the same size with the input_var, but we have only one image for each one of input. 
# Now, we can use the rnn on these local imgs to learn the # Now, we use a bi-directional GRU to produce the input. # Forward GRU. self.inp_c = T.reshape(att_input, (att_input.shape[0] * att_input.shape[1], att_input.shape[2], att_input.shape[3])) self.inp_c = self.inp_c.dimshuffle(1,2,0) #print 'inp_c', self.inp_c.shape.eval({att_input:np.random.rand(2,5,196,512).astype('float32')}) print "==> building question module" self.W_qf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_qf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_qf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_qf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_qf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_qf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) inp_dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype = floatX)) #print 'q_var_shuffled_emb', q_var_shuffled_emb.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_glb,_ = theano.scan(fn = self.q_gru_step_forward, sequences = q_var_shuffled_emb, outputs_info = [T.zeros_like(inp_dummy)]) q_glb_shuffled = q_glb.dimshuffle(2,0,1) # batch_size * seq_len * dim #print 'q_glb_shuffled', q_glb_shuffled.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_glb_last = q_glb_shuffled[:,-1,:] # batch_size * dim #print 'q_glb_last', q_glb_last.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_net = layers.InputLayer(shape=(self.batch_size*self.story_len, self.dim), input_var=q_var_emb_rhp) if self.batch_norm: q_net = layers.BatchNormLayer(incoming=q_net) if self.dropout > 0 and self.mode == 'train': q_net = layers.DropoutLayer(q_net, p=self.dropout) self.q_q = layers.get_output(q_net).dimshuffle(1,0) print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_update1 = nn_utils.normal_param(std=0.1, shape=(self.dim , self.dim* 2)) self.b_mem_upd1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_update2 = nn_utils.normal_param(std=0.1, shape=(self.dim,self.dim*2)) self.b_mem_upd2 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_update3 = nn_utils.normal_param(std=0.1, shape=(self.dim , self.dim*2)) self.b_mem_upd3 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_update = [self.W_mem_update1,self.W_mem_update2,self.W_mem_update3] self.b_mem_update = [self.b_mem_upd1,self.b_mem_upd2, self.b_mem_upd3] self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, 
shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) # Replace GRU with ReLU activation + MLP. c = T.concatenate([memory[iter - 1], current_episode], axis = 0) cur_mem = T.dot(self.W_mem_update[iter-1], c) + self.b_mem_update[iter-1].dimshuffle(0,'x') memory.append(T.nnet.relu(cur_mem)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1,2,0) # Sounds good. Now, we need to map last_mem to a new space. self.W_mem_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.dim * 2)) self.b_mem_emb = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.vocab_size + 1)) self.b_inp_emb = nn_utils.constant_param(value=0.0, shape=(self.dim,)) def _dot2(x, W, b): #return T.tanh(T.dot(W, x) + b.dimshuffle(0,'x')) return T.dot(W, x) + b.dimshuffle(0,'x') answer_inp_var_shuffled_emb,_ = theano.scan(fn = _dot2, sequences = answer_inp_var_shuffled, non_sequences = [self.W_inp_emb,self.b_inp_emb] ) # seq x dim x batch #print 'answer_inp_var_shuffled_emb', answer_inp_var_shuffled_emb.shape.eval({self.answer_inp_var:np.random.rand(2,5,8900).astype('float32')}) # dim x batch_size * 5 q_glb_dim = q_glb_last.dimshuffle(0,'x', 1) # batch_size * 1 * dim #print 'q_glb_dim', q_glb_dim.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_glb_repmat = T.repeat(q_glb_dim, self.story_len, 1) # batch_size * len * dim #print 'q_glb_repmat', q_glb_repmat.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_glb_rhp = T.reshape(q_glb_repmat, (q_glb_repmat.shape[0] * q_glb_repmat.shape[1], q_glb_repmat.shape[2])) #print 'q_glb_rhp', q_glb_rhp.shape.eval({q_glb_last:np.random.rand(2,512).astype('float32')}) init_ans = T.concatenate([self.q_q, last_mem], axis = 0) #print 'init_ans', init_ans.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32'), self.input_var:np.random.rand(2,5,196, 512).astype('float32')}) mem_ans = T.dot(self.W_mem_emb, init_ans) + self.b_mem_emb.dimshuffle(0,'x') # dim x batchsize mem_ans_dim = mem_ans.dimshuffle('x',0,1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0) q_glb_rhp = q_glb_rhp.dimshuffle(1,0) q_glb_rhp = q_glb_rhp.dimshuffle('x', 0, 1) q_glb_step = T.repeat(q_glb_rhp, answer_inp.shape[0], 0) #mem_ans = T.tanh(T.dot(self.W_mem_emb, init_ans) + self.b_mem_emb.dimshuffle(0,'x')) # dim x batchsize. 
# seq + 1 x dim x batch answer_inp = T.concatenate([answer_inp, q_glb_step], axis = 1) dummy = theano.shared(np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) results, _ = theano.scan(fn = self.answer_gru_step, sequences = answer_inp, outputs_info = [ dummy ]) #results = None #r = None #for i in range(self.story_len): # answer_inp_i = answer_inp[i,:] # # if i == 0: # # results: seq + 1 x dim x batch_size # r, _ = theano.scan(fn = self.answer_gru_step, # sequences = answer_inp_i, # truncate_gradient = self.truncate_gradient, # outputs_info = [ dummy ]) # #print 'r', r.shape.eval({answer_inp_i:np.random.rand(23,512,2).astype('float32')}) # results = r.dimshuffle('x', 0, 1,2) # else: # prev_h = r[self.answer_idx[:,i],:,T.arange(self.batch_size)] # #print 'prev_h', prev_h.shape.eval({answer_inp_i:np.random.rand(23,512,2).astype('float32'), self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32')},on_unused_input='warn' ) # #print 'prev_h', prev_h.shape.eval({r:np.random.rand(23,512,2).astype('float32'), self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32')}) # r,_ = theano.scan(fn = self.answer_gru_step, # sequences = answer_inp_i, # truncate_gradient = self.truncate_gradient, # outputs_info = [ prev_h.dimshuffle(1,0) ]) # results = T.concatenate([results, r.dimshuffle('x', 0, 1, 2)]) ## results: story_len x seq+1 x dim x batch_size #results = results.dimshuffle(3,0,1,2) #results = T.reshape(results, (self.batch_size * self.story_len, results.shape[2], results.shape[3])) #results = results.dimshuffle(1,2,0) # seq_len x dim x (batch x seq) # Assume there is a start token #print 'results', results.shape.eval({self.input_var: np.random.rand(2,5,196,512).astype('float32'), # self.q_var: np.random.rand(2,5, 4096).astype('float32'), # self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32'), # self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')}) #results = results[1:-1,:,:] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'), # self.q_var: np.random.rand(3, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. 
prob,_ = theano.scan(fn = lambda x, w: T.dot(w, x), sequences = results, non_sequences = self.W_a ) #print 'prob', prob.shape.eval({self.input_var: np.random.rand(2,5,196,512).astype('float32'), # self.q_var: np.random.rand(2,5, 4096).astype('float32'), # self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32'), # self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')}) preds = prob[1:,:,:] prob = prob[1:-1,:,:] prob_shuffled = prob.dimshuffle(2,0,1) # b * len * vocab preds_shuffled = preds.dimshuffle(2,0,1) logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'), # self.q_var: np.random.rand(3, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) preds_sm = nn_utils.softmax_(preds_rhp) self.prediction = prob_sm # this one is for the training. #print 'prob_sm', prob_sm.shape.eval({prob: np.random.rand(19,8897,3).astype('float32')}) #print 'lbl', loss_vec.shape.eval({prob: np.random.rand(19,8897,3).astype('float32')}) # This one is for the beamsearch. self.pred = T.reshape(preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1], preds_shuffled.shape[2])) mask = T.reshape(self.answer_mask, (n,)) lbl = T.reshape(self.answer_var, (n,)) self.params = [self.W_inp_emb_in, self.b_inp_emb_in, self.W_q_emb_in, self.b_q_emb_in, self.W_glb_att_1, self.W_glb_att_2, self.b_glb_att_1, self.b_glb_att_2, self.W_qf_res_in, self.W_qf_res_hid, self.b_qf_res, self.W_qf_upd_in, self.W_qf_upd_hid, self.b_qf_upd, self.W_qf_hid_in, self.W_qf_hid_hid, self.b_qf_hid, self.W_mem_emb, self.W_inp_emb,self.b_mem_emb, self.b_inp_emb, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b #self.W_mem_emb, self.W_inp_emb,self.b_mem_emb, self.b_inp_emb, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, ] self.params += self.W_mem_update self.params += self.b_mem_update print "==> building loss layer and computing updates" reward_prob = prob_sm[T.arange(n), lbl] reward_prob = T.reshape(reward_prob, (prob_shuffled.shape[0], prob_shuffled.shape[1])) #reward_prob = printing.Print('mean_r')(reward_prob) loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) #loss_vec = T.nnet.categorical_crossentropy(prob_sm, T.flatten(self.answer_var)) #print 'loss_vec', loss_vec.shape.eval({prob_sm: np.random.rand(39,8900).astype('float32'), # lbl: np.random.rand(39,).astype('int32')}) self.loss_ce = (mask * loss_vec ).sum() / mask.sum() print 'loss_ce', self.loss_ce.eval({prob_sm: np.random.rand(39,8900).astype('float32'), lbl: np.random.rand(39,).astype('int32'), mask: np.random.rand(39,).astype('float32')}) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 self.baseline_time = theano.shared(np.float32(0.), name='baseline_time') alpha_entropy_c = theano.shared(np.float32(self.alpha_entropy_c), name='alpha_entropy_c') #mean_r = ( mask * reward_prob).sum() / mask.sum() # 
or just fixed it as 1. #mean_r = 1 mean_r = (self.answer_mask * reward_prob).sum(1) / self.answer_mask.sum(1) # or just fixed it as 1. mean_r = mean_r[0,None] grads = T.grad(self.loss, wrt=self.params, disconnected_inputs='raise', known_grads={att_alpha_a:(mean_r - self.baseline_time)* (att_alpha_sample/(att_alpha_a + 1e-10)) + alpha_entropy_c*(T.log(att_alpha_a + 1e-10) + 1)}) updates = lasagne.updates.adadelta(grads, self.params, learning_rate = self.learning_rate) updates[self.baseline_time] = self.baseline_time * 0.9 + 0.1 * mean_r.mean() #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': logging.info("compiling train_fn") self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], outputs=[self.prediction, self.loss], updates=updates) logging.info("compiling test_fn") self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], outputs=[self.prediction, self.loss]) logging.info("compiling pred_fn") self.pred_fn= theano.function(inputs=[self.input_var, self.q_var, self.answer_inp_var], outputs=[self.pred])
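# --- Illustrative sketch (not part of the model above) ----------------------
# The known_grads / baseline_time construction above is a REINFORCE-style
# estimator for the sampled (hard) attention: the gradient w.r.t. the
# attention probabilities is scaled by (reward - baseline) plus an entropy
# bonus, and the baseline is an exponential moving average of the reward.
# Toy NumPy version of that estimator (a sketch, not the exact update above).
import numpy as np

def reinforce_grad_sketch(att_alpha, att_sample, reward, baseline, entropy_c=0.02):
    """att_alpha: (k,) softmax probs; att_sample: (k,) one-hot sample; reward, baseline: scalars."""
    grad = (reward - baseline) * (att_sample / (att_alpha + 1e-10)) \
           + entropy_c * (np.log(att_alpha + 1e-10) + 1.0)
    new_baseline = 0.9 * baseline + 0.1 * reward   # mirrors the baseline_time update
    return grad, new_baseline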
def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim, story_len, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.story_len = story_len self.mode = mode self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc = self._process_input_sind_lmdb( self.data_dir, 'train') self.val_dict_story, self.val_lmdb_env_fc = self._process_input_sind_lmdb( self.data_dir, 'val') self.test_dict_story, self.test_lmdb_env_fc = self._process_input_sind_lmdb( self.data_dir, 'test') self.train_story = self.train_dict_story.keys() self.val_story = self.val_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) self.q_var = T.tensor3( 'q_var') # Now, it's a batch * story_len * image_sieze. self.answer_var = T.imatrix( 'answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_inp_var = T.tensor3( 'answer_inp_var') # answer of example in minibatch print "==> building input module" # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, )) q_seq = self.q_var.dimshuffle(0, 'x', 1, 2) q_seq_rpt = T.repeat(q_seq, self.story_len, 1) q_seq_rhp = T.reshape(q_seq_rpt, (q_seq_rpt.shape[0] * q_seq_rpt.shape[1], q_seq_rpt.shape[2], q_seq_rpt.shape[3])) inp_var_shuffled = q_seq_rhp.dimshuffle(1, 2, 0) #seq x cnn x batch def _dot(x, W, b): return T.dot(W, x) + b.dimshuffle(0, 'x') inp_c_hist, _ = theano.scan( fn=_dot, sequences=inp_var_shuffled, non_sequences=[self.W_inp_emb_in, self.b_inp_emb_in]) #inp_c_hist,_ = theano.scan(fn = _dot, sequences=self.input_var, non_sequences = [self.W_inp_emb_in, self.b_inp_emb_in]) self.inp_c = inp_c_hist # seq x emb x batch print "==> building question module" # Now, share the parameter with the input module. 
q_var_shuffled = self.q_var.dimshuffle( 1, 2, 0) # now: story_len * image_size * batch_size # This is the RNN used to produce the Global Glimpse self.W_inpf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.W_inpf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inpf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inpf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.W_inpf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inpf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inpf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.W_inpf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inpf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) q_glb, _ = theano.scan(fn=self.input_gru_step_forward, sequences=q_var_shuffled, outputs_info=[T.zeros_like(inp_dummy)]) q_glb_shuffled = q_glb.dimshuffle(2, 0, 1) # batch_size * seq_len * dim q_glb_last = q_glb_shuffled[:, -1, :] # batch_size * dim # Now, we also need to build the individual model. #q_var_shuffled = self.q_var.dimshuffle(1,0) q_single = T.reshape( self.q_var, (self.q_var.shape[0] * self.q_var.shape[1], self.q_var.shape[2])) q_single_shuffled = q_single.dimshuffle(1, 0) #cnn_dim x batch_size * 5 # batch_size * 5 x dim q_hist = T.dot(self.W_inp_emb_in, q_single_shuffled) + self.b_inp_emb_in.dimshuffle( 0, 'x') q_hist_shuffled = q_hist.dimshuffle(1, 0) # batch_size * 5 x dim if self.batch_norm: logging.info("Using batch normalization.") q_net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=q_hist_shuffled) if self.batch_norm: q_net = layers.BatchNormLayer(incoming=q_net) if self.dropout > 0 and self.mode == 'train': q_net = layers.DropoutLayer(q_net, p=self.dropout) #last_mem = layers.get_output(q_net).dimshuffle((1, 0)) self.q_q = layers.get_output(q_net).dimshuffle(1, 0) print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) #current_episode = self.new_episode(m) #current_episode = printing.Print('current_episode')(current_episode) memory.append( 
self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0) # Sounds good. Now, we need to map last_mem to a new space. self.W_mem_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 3)) self.W_inp_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.vocab_size + 1)) def _dot2(x, W): return T.dot(W, x) answer_inp_var_shuffled_emb, _ = theano.scan( fn=_dot2, sequences=answer_inp_var_shuffled, non_sequences=self.W_inp_emb) # seq x dim x batch # dim x batch_size * 5 q_glb_dim = q_glb_last.dimshuffle(0, 'x', 1) # batch_size * 1 * dim q_glb_repmat = T.repeat(q_glb_dim, self.story_len, 1) # batch_size * len * dim q_glb_rhp = T.reshape(q_glb_repmat, (q_glb_repmat.shape[0] * q_glb_repmat.shape[1], q_glb_repmat.shape[2])) init_ans = T.concatenate( [self.q_q, last_mem, q_glb_rhp.dimshuffle(1, 0)], axis=0) mem_ans = T.dot(self.W_mem_emb, init_ans) # dim x batchsize. mem_ans_dim = mem_ans.dimshuffle('x', 0, 1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis=0) dummy = theano.shared( np.zeros((self.dim, self.batch_size * self.story_len), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) results, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp, outputs_info=[dummy]) # Assume there is a start token #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') #results = results[1:-1,:,:] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. 
prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x), sequences=results, non_sequences=self.W_a) preds = prob[1:, :, :] prob = prob[1:-1, :, :] prob_shuffled = prob.dimshuffle(2, 0, 1) # b * len * vocab preds_shuffled = preds.dimshuffle(2, 0, 1) logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) preds_sm = nn_utils.softmax_(preds_rhp) self.prediction = prob_sm # this one is for the training. # This one is for the beamsearch. self.pred = T.reshape( preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1], preds_shuffled.shape[2])) mask = T.reshape(self.answer_mask, (n, )) lbl = T.reshape(self.answer_var, (n, )) self.params = [ self.W_inp_emb_in, self.b_inp_emb_in, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, self.W_mem_emb, self.W_inp_emb ] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var ], outputs=[self.prediction, self.loss]) print "==> compiling pred_fn" self.pred_fn = theano.function( inputs=[self.q_var, self.answer_inp_var], outputs=[self.pred])
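# --- Illustrative sketch (not part of the model above) ----------------------
# self.pred_fn above returns per-step vocabulary distributions given the
# teacher-forced answer_inp_var, so decoding happens outside the graph.  A
# hypothetical greedy decoder built on such a function; the <start> token
# index, maximum length and one-hot layout are assumptions, not taken from
# the original code.
import numpy as np

def greedy_decode_sketch(pred_fn, q_batch, vocab_size, start_idx=0, max_len=18):
    """pred_fn(q_var, answer_inp_var) -> [probs of shape (batch, t, vocab+1)]."""
    batch = q_batch.shape[0] * q_batch.shape[1]           # batch_size * story_len sentences
    answer_inp = np.zeros((batch, 1, vocab_size + 1), dtype='float32')
    answer_inp[:, 0, start_idx] = 1.0                     # feed only the <start> token
    tokens = []
    for _ in range(max_len):
        probs = pred_fn(q_batch, answer_inp)[0]           # (batch, t, vocab+1)
        nxt = probs[:, -1, :].argmax(axis=1)              # greedy pick for each sentence
        tokens.append(nxt)
        step = np.zeros((batch, 1, vocab_size + 1), dtype='float32')
        step[np.arange(batch), 0, nxt] = 1.0
        answer_inp = np.concatenate([answer_inp, step], axis=1)
    return np.stack(tokens, axis=1)                       # (batch, max_len) token ids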
def __init__(self, data_dir, word2vec, word_vector_size, dim, cnn_dim, story_len, patches, cnn_dim_fc, truncate_gradient, learning_rate, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.truncate_gradient = truncate_gradient self.learning_rate = learning_rate self.trng = RandomStreams(1234) self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.cnn_dim = cnn_dim self.cnn_dim_fc = cnn_dim_fc self.story_len = story_len self.patches = patches self.mode = mode self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.train_story = None self.test_story = None self.train_dict_story, self.train_lmdb_env_fc, self.train_lmdb_env_conv = self._process_input_sind_lmdb( self.data_dir, 'train') self.test_dict_story, self.test_lmdb_env_fc, self.test_lmdb_env_conv = self._process_input_sind_lmdb( self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) # This is the local patch of each image. self.input_var = T.tensor4( 'input_var') # (batch_size, seq_len, patches, cnn_dim) self.q_var = T.tensor3( 'q_var') # Now, it's a batch * story_len * image_sieze. self.answer_var = T.ivector( 'answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_idx = T.imatrix('answer_idx') # batch x seq self.answer_inp_var = T.tensor3( 'answer_inp_var') # answer of example in minibatch print "==> building input module" # It's very simple now, the input module just need to map from cnn_dim to dim. logging.info('self.cnn_dim = %d', self.cnn_dim) logging.info('self.cnn_dim_fc = %d', self.cnn_dim_fc) logging.info('self.dim = %d', self.dim) self.W_q_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim_fc)) self.b_q_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, )) q_var_shuffled = self.q_var.dimshuffle(1, 2, 0) # seq x cnn x batch. 
def _dot(x, W, b): return T.tanh(T.dot(W, x) + b.dimshuffle(0, 'x')) q_var_shuffled_emb, _ = theano.scan( fn=_dot, sequences=q_var_shuffled, non_sequences=[self.W_q_emb_in, self.b_q_emb_in]) #print 'q_var_shuffled_emb', q_var_shuffled_emb.shape.eval({self.q_var:np.random.rand(2,5,4096).astype('float32')}) q_var_emb = q_var_shuffled_emb.dimshuffle(2, 0, 1) # batch x seq x emb_size q_var_emb_ext = q_var_emb.dimshuffle(0, 'x', 1, 2) q_var_emb_ext = T.repeat(q_var_emb_ext, q_var_emb.shape[1], 1) # batch x seq x seq x emb_size q_var_emb_rhp = T.reshape( q_var_emb, (q_var_emb.shape[0] * q_var_emb.shape[1], q_var_emb.shape[2])) q_var_emb_ext_rhp = T.reshape( q_var_emb_ext, (q_var_emb_ext.shape[0] * q_var_emb_ext.shape[1], q_var_emb_ext.shape[2], q_var_emb_ext.shape[3])) q_var_emb_ext_rhp = q_var_emb_ext_rhp.dimshuffle(0, 2, 1) q_idx = T.arange(self.story_len).dimshuffle('x', 0) q_idx = T.repeat(q_idx, self.batch_size, axis=0) q_idx = T.reshape(q_idx, (q_idx.shape[0] * q_idx.shape[1], )) self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_rhp = T.reshape( self.input_var, (self.batch_size * self.story_len * self.patches, self.cnn_dim)) inp_rhp_dimshuffled = inp_rhp.dimshuffle(1, 0) inp_rhp_emb = T.dot( self.W_inp_emb_in, inp_rhp_dimshuffled) + self.b_inp_emb_in.dimshuffle(0, 'x') inp_rhp_emb_dimshuffled = inp_rhp_emb.dimshuffle(1, 0) inp_emb_raw = T.reshape( inp_rhp_emb_dimshuffled, (self.batch_size * self.story_len, self.patches, self.cnn_dim)) inp_emb = T.tanh( inp_emb_raw ) # Just follow the paper DMN for visual and textual QA. self.inp_c = inp_emb.dimshuffle(1, 2, 0) logging.info('building question module') self.W_qf_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_qf_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_qf_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_qf_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_qf_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_qf_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_qf_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) q_var_shuffled_emb_reversed = q_var_shuffled_emb[:: -1, :, :] # seq x emb_size x batch q_glb, _ = theano.scan(fn=self.q_gru_step_forward, sequences=q_var_shuffled_emb_reversed, outputs_info=[T.zeros_like(inp_dummy)]) q_glb_shuffled = q_glb.dimshuffle(2, 0, 1) # batch_size * seq_len * dim q_glb_last = q_glb_shuffled[:, -1, :] # batch_size * dim q_net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=q_var_emb_rhp) if self.batch_norm: q_net = layers.BatchNormLayer(incoming=q_net) if self.dropout > 0 and self.mode == 'train': q_net = layers.DropoutLayer(q_net, p=self.dropout) self.q_q = layers.get_output(q_net).dimshuffle(1, 0) #print "==> creating parameters for memory module" logging.info('creating parameters for memory module') self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = 
nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_update1 = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.b_mem_upd1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_update2 = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.b_mem_upd2 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_update3 = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.b_mem_upd3 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_update = [ self.W_mem_update1, self.W_mem_update2, self.W_mem_update3 ] self.b_mem_update = [self.b_mem_upd1, self.b_mem_upd2, self.b_mem_upd3] self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) logging.info( '==> building episodic memory module (fixed number of steps: %d)', self.memory_hops) memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): #m = printing.Print('mem')(memory[iter-1]) current_episode = self.new_episode(memory[iter - 1]) # Replace GRU with ReLU activation + MLP. c = T.concatenate([memory[iter - 1], current_episode], axis=0) cur_mem = T.dot(self.W_mem_update[iter - 1], c) + self.b_mem_update[iter - 1].dimshuffle( 0, 'x') memory.append(T.nnet.relu(cur_mem)) last_mem_raw = memory[-1].dimshuffle((1, 0)) net = layers.InputLayer(shape=(self.batch_size * self.story_len, self.dim), input_var=last_mem_raw) if self.batch_norm: net = layers.BatchNormLayer(incoming=net) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net).dimshuffle((1, 0)) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1, 2, 0) # Sounds good. Now, we need to map last_mem to a new space. 
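# Note on the hop update above: unlike the bAbI-style variants further down,
# each hop here computes m_i = relu(W_mem_update[i] . [m_{i-1}; e_i] + b_i),
# a per-hop ReLU MLP over the concatenated previous memory and current episode
# rather than a GRU update. After the optional batch-norm/dropout, last_mem is
# dim x (batch_size * story_len): one memory column per image of every story.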
self.W_mem_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.b_mem_emb = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_emb = nn_utils.normal_param(std=0.1, shape=(self.dim, self.vocab_size + 1)) self.b_inp_emb = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def _dot2(x, W, b): #return T.tanh(T.dot(W, x) + b.dimshuffle(0,'x')) return T.dot(W, x) + b.dimshuffle(0, 'x') answer_inp_var_shuffled_emb, _ = theano.scan( fn=_dot2, sequences=answer_inp_var_shuffled, non_sequences=[self.W_inp_emb, self.b_inp_emb]) # seq x dim x batch init_ans = T.concatenate([self.q_q, last_mem], axis=0) # dim x (batch x self.story_len) mem_ans = T.dot(self.W_mem_emb, init_ans) + self.b_mem_emb.dimshuffle( 0, 'x') # dim x (batchsize x self.story_len) #mem_ans_dim = mem_ans.dimshuffle('x',0,1) mem_ans_rhp = T.reshape(mem_ans.dimshuffle( 1, 0), (self.batch_size, self.story_len, mem_ans.shape[0])) mem_ans_dim = mem_ans_rhp.dimshuffle(1, 2, 0) answer_inp = answer_inp_var_shuffled_emb #answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0) #seq + 1 x dim x (batch-size x self.story+len) # Now, each answer got its input, our next step is to obtain the sequences. answer_inp_shu = answer_inp.dimshuffle(2, 0, 1) answer_inp_shu_rhp = T.reshape(answer_inp_shu, (self.batch_size, self.story_len, answer_inp_shu.shape[1],\ answer_inp_shu.shape[2])) answer_inp = answer_inp_shu_rhp.dimshuffle( 1, 2, 3, 0) # story_len x seq + 1 x dim x batch_size self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_map = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim * 2)) self.b_ans_map = nn_utils.constant_param(value=0.0, shape=(self.dim, )) results = None r = None dummy = theano.shared( np.zeros((self.dim, self.batch_size), dtype=floatX)) for i in range(self.story_len): answer_inp_i = answer_inp[i, :] # seq + 1 x dim x batch_size mem_ans_dim_i = mem_ans_dim[i, :] # dim x batch_size if i == 0: q_glb_inp = q_glb_last.dimshuffle('x', 1, 0) #1 x dim x batch_size answer_inp_i = T.concatenate([q_glb_inp, answer_inp_i], axis=0) init_h = T.concatenate([dummy, mem_ans_dim_i], axis=0) init_h = T.dot(self.W_ans_map, init_h) + self.b_ans_map.dimshuffle(0, 'x') init_h = T.tanh(init_h) r, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp_i, truncate_gradient=self.truncate_gradient, outputs_info=[init_h]) r = r[1:, :] # get rid of the first glob one. 
results = r.dimshuffle('x', 0, 1, 2) else: prev_h = r[self.answer_idx[:, i], :, T.arange(self.batch_size)] h_ = T.concatenate([prev_h.dimshuffle(1, 0), mem_ans_dim_i], axis=0) h_ = T.dot(self.W_ans_map, h_) + self.b_ans_map.dimshuffle( 0, 'x') h_ = T.tanh(h_) r, _ = theano.scan(fn=self.answer_gru_step, sequences=answer_inp_i, truncate_gradient=self.truncate_gradient, outputs_info=[h_]) results = T.concatenate([results, r.dimshuffle('x', 0, 1, 2)]) ## results: story_len x seq+1 x dim x batch_size results = results.dimshuffle(3, 0, 1, 2) results = T.reshape(results, (self.batch_size * self.story_len, results.shape[2], results.shape[3])) results = results.dimshuffle(1, 2, 0) # seq_len x dim x (batch x seq) # Assume there is a start token #print 'results', results.shape.eval({self.input_var: np.random.rand(2,5,196,512).astype('float32'), # self.q_var: np.random.rand(2,5, 4096).astype('float32'), # self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32'), # self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')}) #results = results[1:-1,:,:] # get rid of the last token as well as the first one (image) #print results.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'), # self.q_var: np.random.rand(3, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')}, on_unused_input='ignore') # Now, we need to transform it to the probabilities. prob, _ = theano.scan(fn=lambda x, w: T.dot(w, x), sequences=results, non_sequences=self.W_a) #print 'prob', prob.shape.eval({self.input_var: np.random.rand(2,5,196,512).astype('float32'), # self.q_var: np.random.rand(2,5, 4096).astype('float32'), # self.answer_idx: np.asarray([[1,1,1,1,1],[2,2,2,2,2]]).astype('int32'), # self.answer_inp_var: np.random.rand(5, 18, 8001).astype('float32')}) #preds = prob[1:,:,:] #prob = prob[1:-1,:,:] preds = prob prob = prob[:-1, :, :] prob_shuffled = prob.dimshuffle(2, 0, 1) # b * len * vocab preds_shuffled = preds.dimshuffle(2, 0, 1) logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(3,4,4096).astype('float32'), # self.q_var: np.random.rand(3, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(3, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) preds_sm = nn_utils.softmax_(preds_rhp) self.prediction = prob_sm # this one is for the training. #print 'prob_sm', prob_sm.shape.eval({prob: np.random.rand(19,8897,3).astype('float32')}) #print 'lbl', loss_vec.shape.eval({prob: np.random.rand(19,8897,3).astype('float32')}) # This one is for the beamsearch. 
self.pred = T.reshape( preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1], preds_shuffled.shape[2])) mask = T.reshape(self.answer_mask, (n, )) lbl = T.reshape(self.answer_var, (n, )) self.params = [ self.W_inp_emb_in, self.b_inp_emb_in, self.W_q_emb_in, self.b_q_emb_in, #self.W_glb_att_1, self.W_glb_att_2, self.b_glb_att_1, self.b_glb_att_2, self.W_qf_res_in, self.W_qf_res_hid, self.b_qf_res, self.W_qf_upd_in, self.W_qf_upd_hid, self.b_qf_upd, self.W_qf_hid_in, self.W_qf_hid_hid, self.b_qf_hid, self.W_mem_emb, self.W_inp_emb, self.b_mem_emb, self.b_inp_emb, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, #self.W_b #self.W_mem_emb, self.W_inp_emb,self.b_mem_emb, self.b_inp_emb, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a, self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, self.W_ans_map, self.b_ans_map, ] self.params += self.W_mem_update self.params += self.b_mem_update print "==> building loss layer and computing updates" reward_prob = prob_sm[T.arange(n), lbl] reward_prob = T.reshape( reward_prob, (prob_shuffled.shape[0], prob_shuffled.shape[1])) #reward_prob = printing.Print('mean_r')(reward_prob) loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) #loss_vec = T.nnet.categorical_crossentropy(prob_sm, T.flatten(self.answer_var)) #print 'loss_vec', loss_vec.shape.eval({prob_sm: np.random.rand(39,8900).astype('float32'), # lbl: np.random.rand(39,).astype('int32')}) self.loss_ce = (mask * loss_vec).sum() / mask.sum() print 'loss_ce', self.loss_ce.eval({ prob_sm: np.random.rand(39, 8900).astype('float32'), lbl: np.random.rand(39, ).astype('int32'), mask: np.random.rand(39, ).astype('float32') }) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 grads = T.grad(self.loss, wrt=self.params, disconnected_inputs='raise') updates = lasagne.updates.adadelta(grads, self.params, learning_rate=self.learning_rate) if self.mode == 'train': logging.info("compiling train_fn") self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var, self.answer_idx ], outputs=[self.prediction, self.loss], updates=updates) logging.info("compiling test_fn") self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var, self.answer_idx ], outputs=[self.prediction, self.loss]) logging.info("compiling pred_fn") self.pred_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_inp_var, self.answer_idx ], outputs=[self.pred])
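# --- Illustration (not part of the model) -----------------------------------
# A hedged numpy sketch of the sentence-to-sentence hand-off used in the
# decoding loop above: for image i > 0 the answer GRU is re-initialised from
# the previous sentence's hidden state (gathered at answer_idx) concatenated
# with the memory for image i, mapped through W_ans_map and a tanh. Argument
# names are illustrative.
import numpy as np

def _example_next_sentence_init(prev_h, mem_i, W_ans_map, b_ans_map):
    # prev_h, mem_i: (dim, batch); W_ans_map: (dim, 2*dim); b_ans_map: (dim,).
    h0 = np.tanh(W_ans_map.dot(np.concatenate([prev_h, mem_i], axis=0))
                 + b_ans_map[:, None])
    return h0  # (dim, batch): initial decoder state for sentence i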
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input( babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input( babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print "==> building input module" self.W_inp_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_inp_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like( self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, )) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append( self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, 
self.b_mem_hid)) last_mem = memory[-1] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_upd_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, )) self.W_ans_hid_in = nn_utils.normal_param( std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, )) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan( fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [ self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a ] if self.answer_module == 'recurrent': self.params = self.params + [ self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid ] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy( self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=[ self.prediction, self.loss, self.inp_c, self.q_q, last_mem ]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[ self.input_var, self.q_var, self.answer_var, self.input_mask_var ], outputs=gradient)
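# --- Illustration (not part of the model) -----------------------------------
# The reset/update/hidden weight triplets above suggest a standard GRU cell;
# the actual step lives in GRU_update / input_gru_step, which are defined
# elsewhere in the class, so treat this numpy sketch (and its gating
# convention) as an assumption rather than the repo's exact implementation.
import numpy as np

def _example_gru_update(h_prev, x, W_res_in, W_res_hid, b_res,
                        W_upd_in, W_upd_hid, b_upd,
                        W_hid_in, W_hid_hid, b_hid):
    # h_prev: (dim,); x: (in_dim,); weights shaped like the parameters above.
    sig = lambda v: 1.0 / (1.0 + np.exp(-v))
    r = sig(W_res_in.dot(x) + W_res_hid.dot(h_prev) + b_res)       # reset gate
    z = sig(W_upd_in.dot(x) + W_upd_hid.dot(h_prev) + b_upd)       # update gate
    h_tilde = np.tanh(W_hid_in.dot(x) + W_hid_hid.dot(r * h_prev) + b_hid)
    return z * h_tilde + (1.0 - z) * h_prev  # some variants swap z and (1 - z)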
def __init__(self, train_raw, dev_raw, test_raw, word2vec, word_vector_size, dim, mode, input_mask_mode, memory_hops, l2, normalize_attention, dropout, **kwargs): print "generate one-word answer for mctest" print "==> not used params in DMN class:", kwargs.keys() self.word2vec = word2vec self.word_vector_size = word_vector_size self.vocab_size = len(word2vec) self.dim = dim # hidden state size self.mode = mode self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.dropout = dropout self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(train_raw) self.dev_input, self.dev_q, self.dev_answer, self.dev_input_mask = self._process_input(dev_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(test_raw) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') self.attentions = [] print "==> building input module" self.W_inp_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append(self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, 
self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) last_mem_raw = memory[-1].dimshuffle(('x', 0)) net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw) if self.dropout > 0 and self.mode == 'train': net = layers.DropoutLayer(net, p=self.dropout) last_mem = layers.get_output(net)[0] self.attentions = T.stack(self.attentions) print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) print "==> collecting all parameters" self.params = [self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction.dimshuffle('x',0),T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adam(self.loss, self.params) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], allow_input_downcast = True, outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], allow_input_downcast = True, outputs=[self.prediction, self.loss, self.attentions])
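# --- Illustration (not part of the model) -----------------------------------
# A hedged numpy sketch of the attention-gate features the DMN papers use,
# consistent with the (dim, 7*dim + 2) shape of W_1 above; the exact feature
# order is decided inside new_episode, which is not shown here.
import numpy as np

def _example_attention_gate(c, m, q, W_b, W_1, b_1, W_2, b_2):
    # c (fact), m (memory), q (question): (dim,); W_b: (dim, dim);
    # W_1: (dim, 7*dim + 2); b_1: (dim,); W_2: (1, dim); b_2: (1,).
    z = np.concatenate([c, m, q, c * q, c * m, np.abs(c - q), np.abs(c - m),
                        [c.dot(W_b).dot(q)], [c.dot(W_b).dot(m)]])
    return W_2.dot(np.tanh(W_1.dot(z) + b_1)) + b_2  # unnormalised gate score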
def __init__(self, babi_train_raw, babi_test_raw, word2vec, word_vector_size, dim, mode, answer_module, input_mask_mode, memory_hops, l2, normalize_attention, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.vocab = {} self.ivocab = {} self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.input_mask_mode = input_mask_mode self.memory_hops = memory_hops self.l2 = l2 self.normalize_attention = normalize_attention self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw) self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw) self.vocab_size = len(self.vocab) self.input_var = T.matrix('input_var') self.q_var = T.matrix('question_var') self.answer_var = T.iscalar('answer_var') self.input_mask_var = T.ivector('input_mask_var') print "==> building input module" self.W_inp_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_inp_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.word_vector_size)) self.W_inp_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) inp_c_history, _ = theano.scan(fn=self.input_gru_step, sequences=self.input_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.inp_c = inp_c_history.take(self.input_mask_var, axis=0) self.q_q, _ = theano.scan(fn=self.input_gru_step, sequences=self.q_var, outputs_info=T.zeros_like(self.b_inp_hid)) self.q_q = self.q_q[-1] print "==> creating parameters for memory module" self.W_mem_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_mem_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_mem_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 2)) self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim)) self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.b_2 = nn_utils.constant_param(value=0.0, shape=(1,)) print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops memory = [self.q_q.copy()] for iter in range(1, self.memory_hops + 1): current_episode = self.new_episode(memory[iter - 1]) memory.append(self.GRU_update(memory[iter - 1], current_episode, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid)) 
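# Each hop above builds an episode from the fact vectors in self.inp_c,
# conditioned on the previous memory and the question (inside new_episode),
# and then folds that episode into the memory with GRU_update; memory[k] is
# therefore the state after k hops and last_mem below is the final one.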
last_mem = memory[-1] print "==> building answer module" self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size, self.dim)) if self.answer_module == 'feedforward': self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem)) elif self.answer_module == 'recurrent': self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim + self.vocab_size)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = nn_utils.constant_param(value=0.0, shape=(self.dim,)) def answer_step(prev_a, prev_y): a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]), self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid) y = nn_utils.softmax(T.dot(self.W_a, a)) return [a, y] # TODO: add conditional ending dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX)) results, updates = theano.scan(fn=answer_step, outputs_info=[last_mem, T.zeros_like(dummy)], n_steps=1) self.prediction = results[1][-1] else: raise Exception("invalid answer_module") print "==> collecting all parameters" self.params = [self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res, self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd, self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid, self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res, self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd, self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b, self.W_1, self.W_2, self.b_1, self.b_2, self.W_a] if self.answer_module == 'recurrent': self.params = self.params + [self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid] print "==> building loss layer and computing updates" self.loss_ce = T.nnet.categorical_crossentropy(self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0] if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 updates = lasagne.updates.adadelta(self.loss, self.params) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], outputs=[self.prediction, self.loss, self.inp_c, self.q_q, last_mem]) if self.mode == 'train': print "==> computing gradients (for debugging)" gradient = T.grad(self.loss, self.params) self.get_gradient_fn = theano.function(inputs=[self.input_var, self.q_var, self.answer_var, self.input_mask_var], outputs=gradient)
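# --- Illustration (not part of the model) -----------------------------------
# A hedged sketch of how the compiled functions above are typically driven:
# one (input, question, answer, mask) example per call, matching the scalar
# answer_var. The epoch loop and container names are illustrative only.
def _example_epoch(dmn, inputs, questions, answers, input_masks):
    total_loss = 0.0
    for inp, q, ans, mask in zip(inputs, questions, answers, input_masks):
        prediction, loss = dmn.train_fn(inp, q, ans, mask)  # one gradient step
        total_loss += loss
    return total_loss / len(inputs)  # mean training loss for the epoch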
def __init__(self, data_dir, word2vec, word_vector_size, dim, mode, answer_module, memory_hops, batch_size, l2, normalize_attention, batch_norm, dropout, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.data_dir = data_dir self.word2vec = word2vec self.word_vector_size = word_vector_size self.dim = dim self.mode = mode self.answer_module = answer_module self.memory_hops = memory_hops self.batch_size = batch_size self.l2 = l2 self.normalize_attention = normalize_attention self.batch_norm = batch_norm self.dropout = dropout self.vocab, self.ivocab = self._load_vocab(self.data_dir) self.train_story = None self.test_story = None self.train_dict_story, self.train_features, self.train_fns_dict, self.train_num_imgs = self._process_input_sind(self.data_dir, 'train') self.test_dict_story, self.test_features, self.test_fns_dict, self.test_num_imgs = self._process_input_sind(self.data_dir, 'val') self.train_story = self.train_dict_story.keys() self.test_story = self.test_dict_story.keys() self.vocab_size = len(self.vocab) self.q_var = T.matrix('q_var') # Now, it's a batch * image_sieze. self.answer_var = T.imatrix('answer_var') # answer of example in minibatch self.answer_mask = T.matrix('answer_mask') self.answer_inp_var = T.tensor3('answer_inp_var') # answer of example in minibatch print "==> building question module" # Now, share the parameter with the input module. q_var_shuffled = self.q_var.dimshuffle(1,0) self.W_inp_emb_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.cnn_dim)) self.b_inp_emb_in = nn_utils.constant_param(value=0.0, shape=(self.dim,)) q_hist = T.dot(self.W_inp_emb_in, q_var_shuffled) + self.b_inp_emb_in.dimshuffle(0,'x') q_hist_shuffled = q_hist.dimshuffle(1,0) if self.batch_norm: logging.info("Using batch normalization.") q_net = layers.InputLayer(shape=(self.batch_size, self.dim), input_var=q_hist_shuffled) if self.batch_norm: q_net = layers.BatchNormLayer(incoming=q_net) if self.dropout > 0 and self.mode == 'train': q_net = layers.DropoutLayer(q_net, p=self.dropout) #last_mem = layers.get_output(q_net).dimshuffle((1, 0)) self.q_q = layers.get_output(q_net).dimshuffle(1,0) print "==> building answer module" answer_inp_var_shuffled = self.answer_inp_var.dimshuffle(1,2,0) #self.W_mem_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.dim)) self.W_inp_emb = nn_utils.normal_param(std = 0.1, shape = (self.dim, self.vocab_size + 1)) def _dot2(x, W): return T.dot(W, x) answer_inp_var_shuffled_emb,_ = theano.scan(fn = _dot2, sequences = answer_inp_var_shuffled, non_sequences = self.W_inp_emb ) # seq x dim x batch mem_ans = self.q_q mem_ans_dim = mem_ans.dimshuffle('x',0,1) answer_inp = T.concatenate([mem_ans_dim, answer_inp_var_shuffled_emb], axis = 0) dummy = theano.shared(np.zeros((self.dim, self.batch_size), dtype=floatX)) self.W_a = nn_utils.normal_param(std=0.1, shape=(self.vocab_size + 1, self.dim)) self.W_ans_res_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_res_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_res = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_upd_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_upd_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_upd = nn_utils.constant_param(value=0.0, shape=(self.dim,)) self.W_ans_hid_in = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.W_ans_hid_hid = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim)) self.b_ans_hid = 
nn_utils.constant_param(value=0.0, shape=(self.dim,)) logging.info('answer_inp size') #last_mem = printing.Print('prob_sm')(last_mem) results, _ = theano.scan(fn = self.answer_gru_step, sequences = answer_inp, outputs_info = [ dummy ]) prob,_ = theano.scan(fn = lambda x, w: T.dot(w, x), sequences = results, non_sequences = self.W_a ) preds = prob[1:,:,:] prob = prob[1:-1,:,:] prob_shuffled = prob.dimshuffle(2,0,1) # b * len * vocab preds_shuffled = preds.dimshuffle(2,0,1) logging.info("prob shape.") #print prob.shape.eval({self.input_var: np.random.rand(10,4,4096).astype('float32'), # self.q_var: np.random.rand(10, 4096).astype('float32'), # self.answer_inp_var: np.random.rand(10, 18, 8001).astype('float32')}) n = prob_shuffled.shape[0] * prob_shuffled.shape[1] n_preds = preds_shuffled.shape[0] * preds_shuffled.shape[1] prob_rhp = T.reshape(prob_shuffled, (n, prob_shuffled.shape[2])) preds_rhp = T.reshape(preds_shuffled, (n_preds, preds_shuffled.shape[2])) prob_sm = nn_utils.softmax_(prob_rhp) preds_sm = nn_utils.softmax_(preds_rhp) self.prediction = prob_sm # this one is for the training. # This one is for the beamsearch. self.pred = T.reshape(preds_sm, (preds_shuffled.shape[0], preds_shuffled.shape[1], preds_shuffled.shape[2])) mask = T.reshape(self.answer_mask, (n,)) lbl = T.reshape(self.answer_var, (n,)) self.params = [self.W_a,self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res, self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd, self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid, self.W_inp_emb_in, self.b_inp_emb_in, self.W_inp_emb] print "==> building loss layer and computing updates" loss_vec = T.nnet.categorical_crossentropy(prob_sm, lbl) self.loss_ce = (mask * loss_vec ).sum() / mask.sum() #self.loss_ce = T.nnet.categorical_crossentropy(results_rhp, lbl) if self.l2 > 0: self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 grad = T.grad(self.loss, self.params) #scaled_grad = lasagne.updates.norm_constraint(grad, max_norm = 1e4) updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate = 0.01) #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.001) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function(inputs=[self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function(inputs=[self.q_var, self.answer_var, self.answer_mask, self.answer_inp_var], outputs=[self.prediction, self.loss]) print "==> compiling pred_fn" self.pred_fn= theano.function(inputs=[self.q_var, self.answer_inp_var], outputs=[self.pred])
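# --- Illustration (not part of the model) -----------------------------------
# A hedged greedy-decoding sketch built on pred_fn; the repo's own decoding is
# a beam search (see the comments above), so this only shows how the one-hot
# answer_inp_var grows step by step. Names, the start-token id and max_len are
# illustrative; the batch must match self.batch_size used at graph build time.
import numpy as np

def _example_greedy_decode(pred_fn, q_feats, vocab_size, max_len=18, start_id=0):
    # q_feats: (batch_size, fc feature dim) matrix fed to q_var.
    batch = q_feats.shape[0]
    answer_inp = np.zeros((batch, 1, vocab_size + 1), dtype='float32')
    answer_inp[:, 0, start_id] = 1.0                     # one-hot start token
    tokens = []
    for _ in range(max_len):
        probs = pred_fn(q_feats, answer_inp)[0]          # (batch, cur_len, vocab + 1)
        nxt = probs[:, -1, :].argmax(axis=1)             # greedy pick of next word
        tokens.append(nxt)
        step = np.zeros((batch, 1, vocab_size + 1), dtype='float32')
        step[np.arange(batch), 0, nxt] = 1.0
        answer_inp = np.concatenate([answer_inp, step], axis=1)
    return np.stack(tokens, axis=1)                      # (batch, max_len) word ids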