def attend(self, contexts, output):
    alpha_list = []
    contexts = nn.dropout(contexts, self.dropout_rate, self.is_training, name='drop_c')
    output = nn.dropout(output, self.dropout_rate, self.is_training, name='drop_o')

    for i in range(self.batch_size):
        context = contexts[i]                                        # shape = [196, 512]
        logits_context = nn.dense(context, units=196, activation=None,
                                  use_bias=False, name='fc_lc')      # shape = [196, 196]

        output_i = tf.reshape(output[i], shape=[512, 1])
        ones = tf.ones([1, 196], tf.float32)
        logits_temp = tf.matmul(output_i, ones)                      # shape = [512, 196]
        logits_temp = tf.transpose(logits_temp)                      # shape = [196, 512]
        logits_output = nn.dense(logits_temp, units=196, activation=None,
                                 use_bias=False, name='fc_lo')       # shape = [196, 196]

        logit_tanh = tf.tanh(logits_context + logits_output)
        alpha = nn.dense(logit_tanh, units=1, activation=None,
                         use_bias=False, name='fc_alpha')            # shape = [196, 1]
        alpha = tf.reshape(alpha, shape=[196])
        alpha_list.append(alpha)

    alpha_batch = tf.stack(alpha_list, axis=0)                       # shape = [batch_size, 196]
    alpha = tf.nn.softmax(alpha_batch)                               # shape = [batch_size, 196]
    return alpha
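# The per-sample Python loop above can also be written without the loop, since
# tf.layers.dense projects the last axis of a rank-3 tensor. A minimal loop-free
# sketch of the same additive attention, assuming the nn.dense/nn.dropout helpers
# are thin wrappers around tf.layers (layer names fc_lc/fc_lo/fc_alpha kept from
# the original; dropout omitted for brevity). This is an illustration, not the
# author's implementation.
import tensorflow as tf

def attend_batched(contexts, output):
    # contexts: [N, 196, 512] spatial features, output: [N, 512] previous hidden state
    ctx_proj = tf.layers.dense(contexts, 196, use_bias=False, name='fc_lc')     # [N, 196, 196]
    out_proj = tf.layers.dense(output, 196, use_bias=False, name='fc_lo')       # [N, 196]
    scores = tf.layers.dense(tf.tanh(ctx_proj + tf.expand_dims(out_proj, 1)),
                             1, use_bias=False, name='fc_alpha')                # [N, 196, 1]
    return tf.nn.softmax(tf.squeeze(scores, axis=-1))                           # [N, 196]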
def build(self):
    params = self.params
    N, L, Q, F = params.batch_size, params.max_sent_size, params.max_ques_size, params.max_fact_count
    V, d, A = params.glove_size, params.hidden_size, self.words.vocab_size

    # initialize self
    # placeholders
    input = tf.placeholder(tf.float32, shape=[N, L, V], name='x')       # [num_batch, sentence_len, glove_dim]
    question = tf.placeholder(tf.float32, shape=[N, Q, V], name='q')    # [num_batch, question_len, glove_dim]
    answer = tf.placeholder(tf.int64, shape=[N], name='y')              # [num_batch] - one-word answer
    input_mask = tf.placeholder(tf.bool, shape=[N, L], name='x_mask')   # [num_batch, sentence_len]
    is_training = tf.placeholder(tf.bool)

    # Prepare parameters
    gru = rnn_cell.GRUCell(d)

    # Input module
    with tf.variable_scope('input') as scope:
        input_list = self.make_decoder_batch_input(input)
        input_states, _ = seq2seq.rnn_decoder(input_list, gru.zero_state(N, tf.float32), gru)

        # Question module
        scope.reuse_variables()
        ques_list = self.make_decoder_batch_input(question)
        questions, _ = seq2seq.rnn_decoder(ques_list, gru.zero_state(N, tf.float32), gru)
        question_vec = questions[-1]  # use final state

    # Masking: extract fact vectors at the end of each sentence (details in the paper)
    input_states = tf.transpose(tf.pack(input_states), [1, 0, 2])  # [N, L, D]
    facts = []
    for n in range(N):
        filtered = tf.boolean_mask(input_states[n, :, :], input_mask[n, :])  # [?, D]
        padding = tf.zeros(tf.pack([F - tf.shape(filtered)[0], d]))
        facts.append(tf.concat(0, [filtered, padding]))  # [F, D]

    facked = tf.pack(facts)  # pack for transpose
    facts = tf.unpack(tf.transpose(facked, [1, 0, 2]), num=F)  # F x [N, D]

    # Episodic Memory
    with tf.variable_scope('episodic') as scope:
        episode = EpisodeModule(d, question_vec, facts)
        memory = tf.identity(question_vec)
        for t in range(params.memory_step):
            memory = gru(episode.new(memory), memory)[0]
            scope.reuse_variables()

    # Regularizations
    if params.batch_norm:
        memory = batch_norm(memory, is_training=is_training)
    memory = dropout(memory, params.keep_prob, is_training)

    with tf.name_scope('Answer'):
        # Answer module: feed-forward version (it is a one-word answer)
        w_a = weight('w_a', [d, A])
        logits = tf.matmul(memory, w_a)  # [N, A]

    with tf.name_scope('Loss'):
        # Cross-entropy loss
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, answer)
        loss = tf.reduce_mean(cross_entropy)
        total_loss = loss + params.weight_decay * tf.add_n(tf.get_collection('l2'))

    with tf.variable_scope('Accuracy'):
        # Accuracy
        predicts = tf.argmax(logits, 1)  # int64, same dtype as the answer placeholder
        corrects = tf.equal(predicts, answer)
        num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
        accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

    # Training
    optimizer = tf.train.AdadeltaOptimizer(params.learning_rate)
    opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

    # placeholders
    self.x = input
    self.q = question
    self.y = answer
    self.mask = input_mask
    self.is_training = is_training

    # tensors
    self.total_loss = total_loss
    self.num_corrects = num_corrects
    self.accuracy = accuracy
    self.opt_op = opt_op
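# make_decoder_batch_input is called above but is not part of this excerpt. A minimal
# sketch, assuming it only converts the [N, L, V] batch into the time-major list of
# [N, V] tensors that the legacy seq2seq.rnn_decoder expects (an assumption, not the
# original helper):
def make_decoder_batch_input(self, inputs):
    # [N, L, V] -> [L, N, V] -> L x [N, V] (time-major list)
    inputs = tf.transpose(inputs, [1, 0, 2])
    return tf.unpack(inputs)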
def build_rnn(self):
    with tf.variable_scope("word_embedding"):
        word_embedding_matrix = tf.get_variable(
            name='weights',
            shape=[self.vocabulary_size, self.embedding_size],
            initializer=nn.kernel_initializer(),
            regularizer=nn.kernel_regularizer(),
            trainable=True)

    # 1. build Word LSTM
    WordLSTM = tf.nn.rnn_cell.LSTMCell(self.lstm_units, initializer=nn.kernel_initializer())
    if self.is_training:
        WordLSTM = tf.nn.rnn_cell.DropoutWrapper(
            WordLSTM,
            input_keep_prob=1.0 - self.lstm_drop_rate,
            output_keep_prob=1.0 - self.lstm_drop_rate,
            state_keep_prob=1.0 - self.lstm_drop_rate)

    # 2. initialize Word LSTM
    with tf.variable_scope("word_lstm_initialize"):
        context = tf.reduce_mean(self.visual_feats, axis=1)
        context_dropout = nn.dropout(context, self.dropout_rate, self.is_training, name='drop_c')
        initial_memory = nn.dense(context_dropout, units=self.lstm_units, activation=None, name='fc_m')
        initial_output = nn.dense(context_dropout, self.lstm_units, activation=None, name='fc_o')

    WordLSTM_last_state = initial_memory, initial_output
    WordLSTM_last_output = initial_output
    WordLSTM_last_word = tf.zeros([self.batch_size], tf.int32)  # tf.zeros() stands for the '<S>' token

    predictions = []           # store predicted words
    prediction_corrects = []   # store correct predictions to compute accuracy
    cross_entropies = []       # store cross-entropy losses
    alphas = []

    # 3. generate words step by step
    for id in range(self.max_caption_length):
        with tf.variable_scope("word_embedding"):
            word_embedding = tf.nn.embedding_lookup(word_embedding_matrix, WordLSTM_last_word)

        with tf.variable_scope("attend", reuse=tf.AUTO_REUSE):
            alpha = self.attend(self.visual_feats, WordLSTM_last_output)
            context = tf.reduce_sum(self.visual_feats * tf.expand_dims(alpha, axis=2), axis=1)
            if self.is_training:
                tiled_masks = tf.tile(tf.expand_dims(self.masks[:, id], axis=1), [1, 196])
                masked_alpha = alpha * tiled_masks
                alphas.append(tf.reshape(masked_alpha, [-1]))

        with tf.variable_scope('WordLSTM'):
            inputs = tf.concat([context, word_embedding], axis=1)
            WordLSTM_current_output, WordLSTM_current_state = WordLSTM(inputs, WordLSTM_last_state)

        with tf.variable_scope('decode'):
            expanded_output = tf.concat([context, word_embedding, WordLSTM_current_output], axis=1)
            expanded_output_dropout = nn.dropout(expanded_output, self.dropout_rate, self.is_training, name='drop')
            logits = nn.dense(expanded_output_dropout, units=self.vocabulary_size, activation=None, name='fc')
            prediction = tf.argmax(logits, 1)
            predictions.append(prediction)

        tf.get_variable_scope().reuse_variables()
        WordLSTM_last_state = WordLSTM_current_state

        # teacher forcing during training
        if self.is_training:
            WordLSTM_last_word = self.sentences[:, id]
        else:
            WordLSTM_last_word = prediction

        # compute loss
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.sentences[:, id], logits=logits)
        masked_cross_entropy = cross_entropy * self.masks[:, id]
        cross_entropies.append(masked_cross_entropy)

        # compute accuracy
        ground_truth = tf.cast(self.sentences[:, id], tf.int64)
        prediction_correct = tf.where(
            tf.equal(prediction, ground_truth),
            tf.cast(self.masks[:, id], tf.float32),
            tf.cast(tf.zeros_like(prediction), tf.float32))
        prediction_corrects.append(prediction_correct)

    # 4. compute accuracy
    prediction_corrects = tf.stack(prediction_corrects, axis=1)
    accuracy = tf.reduce_sum(prediction_corrects) / tf.reduce_sum(self.masks)

    self.predictions = predictions
    self.cross_entropies = cross_entropies
    self.alphas = alphas
    self.accuracy = accuracy
    print('rnn built.')
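# The nn.dense / nn.dropout / nn.kernel_initializer / nn.kernel_regularizer helpers
# used in attend() and build_rnn() above are not part of this excerpt. A minimal
# sketch of an `nn` module consistent with how they are called (an assumption, not
# the original implementation):
import tensorflow as tf

def kernel_initializer():
    return tf.contrib.layers.xavier_initializer()

def kernel_regularizer(scale=1e-4):
    return tf.contrib.layers.l2_regularizer(scale)

def dense(inputs, units, activation=None, use_bias=True, name=None):
    # thin wrapper over tf.layers.dense with the shared initializer/regularizer
    return tf.layers.dense(inputs, units, activation=activation, use_bias=use_bias,
                           kernel_initializer=kernel_initializer(),
                           kernel_regularizer=kernel_regularizer(), name=name)

def dropout(inputs, rate, is_training, name=None):
    # tf.layers.dropout accepts a bool tensor for `training`
    return tf.layers.dropout(inputs, rate=rate, training=is_training, name=name)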
def build(self):
    params = self.params
    N, L, Q, F = params.batch_size, params.max_sent_size, params.max_ques_size, params.max_fact_count
    V, d, A = params.embed_size, params.hidden_size, self.words.vocab_size

    # initialize self
    # placeholders
    input = tf.placeholder('int32', shape=[N, L], name='x')            # [num_batch, sentence_len]
    question = tf.placeholder('int32', shape=[N, Q], name='q')         # [num_batch, question_len]
    answer = tf.placeholder('int32', shape=[N], name='y')              # [num_batch] - one-word answer
    input_mask = tf.placeholder(tf.bool, shape=[N, L], name='x_mask')  # [num_batch, sentence_len]
    is_training = tf.placeholder(tf.bool)

    # Prepare parameters
    gru = rnn_cell.GRUCell(d)

    # Input module
    with tf.variable_scope('input') as scope:
        input_list = tf.unpack(tf.transpose(input))
        input_states, _ = seq2seq.embedding_rnn_decoder(input_list, gru.zero_state(N, tf.float32), gru, A, V)

        # Question module
        scope.reuse_variables()
        ques_list = tf.unpack(tf.transpose(question))
        questions, _ = seq2seq.embedding_rnn_decoder(ques_list, gru.zero_state(N, tf.float32), gru, A, V)
        question_vec = questions[-1]  # use final state

    # Masking: extract fact vectors at the end of each sentence (details in the paper)
    input_states = tf.transpose(tf.pack(input_states), [1, 0, 2])  # [N, L, D]
    facts = []
    for n in range(N):
        filtered = tf.boolean_mask(input_states[n, :, :], input_mask[n, :])  # [?, D]
        padding = tf.zeros(tf.pack([F - tf.shape(filtered)[0], d]))
        facts.append(tf.concat(0, [filtered, padding]))  # [F, D]

    facked = tf.pack(facts)  # pack for transpose
    facts = tf.unpack(tf.transpose(facked, [1, 0, 2]), num=F)  # F x [N, D]

    # Episodic Memory
    with tf.variable_scope('episodic') as scope:
        episode = EpisodeModule(d, question_vec, facts)
        memory = tf.identity(question_vec)
        for t in range(params.memory_step):
            memory = gru(episode.new(memory), memory)[0]
            scope.reuse_variables()

    # Regularizations
    if params.batch_norm:
        memory = batch_norm(memory, is_training=is_training)
    memory = dropout(memory, params.keep_prob, is_training)

    with tf.name_scope('Answer'):
        # Answer module: feed-forward version (it is a one-word answer)
        w_a = weight('w_a', [d, A])
        logits = tf.matmul(memory, w_a)  # [N, A]

    with tf.name_scope('Loss'):
        # Cross-entropy loss
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, answer)
        loss = tf.reduce_mean(cross_entropy)
        total_loss = loss + params.weight_decay * tf.add_n(tf.get_collection('l2'))

    with tf.variable_scope('Accuracy'):
        # Accuracy
        predicts = tf.cast(tf.argmax(logits, 1), 'int32')
        corrects = tf.equal(predicts, answer)
        num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
        accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

    # Training
    optimizer = tf.train.AdadeltaOptimizer(params.learning_rate)
    opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

    # placeholders
    self.x = input
    self.q = question
    self.y = answer
    self.mask = input_mask
    self.is_training = is_training

    # tensors
    self.total_loss = total_loss
    self.num_corrects = num_corrects
    self.accuracy = accuracy
    self.opt_op = opt_op
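# EpisodeModule is referenced above but is not part of this excerpt. A minimal sketch
# of a DMN-style episode step matching the EpisodeModule(d, question_vec, facts)
# call above: gate each fact with a two-layer network over the usual interaction
# features and return the softly attended sum of facts. This is a soft-attention
# variant rather than the attention-GRU of the paper; weight()/bias() are the
# helpers used elsewhere in this code.
class EpisodeModule(object):
    """Sketch only: soft attention over a fact list F x [N, d]."""

    def __init__(self, d, question, facts):
        self.question = question                 # [N, d]
        self.facts = facts                        # F x [N, d]
        self.w1 = weight('ep_w1', [4 * d, d])
        self.b1 = bias('ep_b1', d)
        self.w2 = weight('ep_w2', [d, 1])
        self.b2 = bias('ep_b2', 1)

    def new(self, memory):
        q = self.question
        scores = []
        for c in self.facts:
            # interaction features between fact, question and current memory
            z = tf.concat(1, [c * q, c * memory, tf.abs(c - q), tf.abs(c - memory)])
            h = tf.tanh(tf.matmul(z, self.w1) + self.b1)
            scores.append(tf.matmul(h, self.w2) + self.b2)   # [N, 1]
        gates = tf.nn.softmax(tf.concat(1, scores))          # [N, F]
        # episode = attention-weighted sum of the facts
        return tf.add_n([tf.expand_dims(gates[:, i], 1) * c
                         for i, c in enumerate(self.facts)])  # [N, d]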
def build(self):
    params = self.params
    N, L, Q, F = params.batch_size, params.max_sent_size, params.max_ques_size, params.max_fact_count
    V, d, A = params.embed_size, params.hidden_size, self.words.vocab_size

    # initialize self
    # placeholders
    input = tf.placeholder('int32', shape=[N, F, L], name='x')        # [num_batch, fact_count, sentence_len]
    question = tf.placeholder('int32', shape=[N, Q], name='q')        # [num_batch, question_len]
    answer = tf.placeholder('int32', shape=[N], name='y')             # [num_batch] - one-word answer
    fact_counts = tf.placeholder('int64', shape=[N], name='fc')
    input_mask = tf.placeholder('float32', shape=[N, F, L, V], name='xm')
    is_training = tf.placeholder(tf.bool)
    self.att = tf.constant(0.)

    # Prepare parameters
    gru = rnn_cell.GRUCell(d)
    l = self.positional_encoding()
    embedding = weight('embedding', [A, V], init='uniform', range=3**(1 / 2))

    with tf.name_scope('SentenceReader'):
        input_list = tf.unpack(tf.transpose(input))  # L x [F, N]
        input_embed = []
        for facts in input_list:
            facts = tf.unpack(facts)
            embed = tf.pack([tf.nn.embedding_lookup(embedding, w) for w in facts])  # [F, N, V]
            input_embed.append(embed)

        # apply positional encoding
        input_embed = tf.transpose(tf.pack(input_embed), [2, 1, 0, 3])  # [N, F, L, V]
        encoded = l * input_embed * input_mask
        facts = tf.reduce_sum(encoded, 2)  # [N, F, V]

    # dropout time
    facts = dropout(facts, params.keep_prob, is_training)

    with tf.name_scope('InputFusion'):
        # Bidirectional RNN
        with tf.variable_scope('Forward'):
            forward_states, _ = tf.nn.dynamic_rnn(gru, facts, fact_counts, dtype=tf.float32)

        with tf.variable_scope('Backward'):
            facts_reverse = tf.reverse_sequence(facts, fact_counts, 1)
            backward_states, _ = tf.nn.dynamic_rnn(gru, facts_reverse, fact_counts, dtype=tf.float32)

        # Use both forward and backward states
        facts = forward_states + backward_states  # [N, F, d]

    with tf.variable_scope('Question'):
        ques_list = tf.unpack(tf.transpose(question))
        ques_embed = [tf.nn.embedding_lookup(embedding, w) for w in ques_list]
        _, question_vec = tf.nn.rnn(gru, ques_embed, dtype=tf.float32)

    # Episodic Memory
    with tf.variable_scope('Episodic'):
        episode = EpisodeModule(d, question_vec, facts, is_training, params.batch_norm)
        memory = tf.identity(question_vec)

        for t in range(params.memory_step):
            with tf.variable_scope('Layer%d' % t) as scope:
                if params.memory_update == 'gru':
                    memory = gru(episode.new(memory), memory)[0]
                else:
                    # ReLU update
                    c = episode.new(memory)
                    concated = tf.concat(1, [memory, c, question_vec])

                    w_t = weight('w_t', [3 * d, d])
                    z = tf.matmul(concated, w_t)
                    if params.batch_norm:
                        z = batch_norm(z, is_training)
                    else:
                        b_t = bias('b_t', d)
                        z = z + b_t
                    memory = tf.nn.relu(z)  # [N, d]

                scope.reuse_variables()

    # Regularizations
    if params.batch_norm:
        memory = batch_norm(memory, is_training=is_training)
    memory = dropout(memory, params.keep_prob, is_training)

    with tf.name_scope('Answer'):
        # Answer module: feed-forward version (it is a one-word answer)
        w_a = weight('w_a', [d, A], init='xavier')
        logits = tf.matmul(memory, w_a)  # [N, A]

    with tf.name_scope('Loss'):
        # Cross-entropy loss
        cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, answer)
        loss = tf.reduce_mean(cross_entropy)
        total_loss = loss + params.weight_decay * tf.add_n(tf.get_collection('l2'))

    with tf.variable_scope('Accuracy'):
        # Accuracy
        predicts = tf.cast(tf.argmax(logits, 1), 'int32')
        corrects = tf.equal(predicts, answer)
        num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
        accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

    # Training
    optimizer = tf.train.AdamOptimizer(params.learning_rate)
    opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

    # placeholders
    self.x = input
    self.xm = input_mask
    self.q = question
    self.y = answer
    self.fc = fact_counts
    self.is_training = is_training

    # tensors
    self.total_loss = total_loss
    self.num_corrects = num_corrects
    self.accuracy = accuracy
    self.opt_op = opt_op
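# positional_encoding() is called above but not included in this excerpt. A minimal
# sketch, assuming it returns the DMN+ position-encoding weights
# l[j, k] = (1 - j/L) - (k/V) * (1 - 2*j/L) with 1-based word position j and
# embedding index k; the [L, V] result broadcasts against the [N, F, L, V]
# embedded input above.
import numpy as np

def positional_encoding(self):
    L, V = self.params.max_sent_size, self.params.embed_size
    encoding = np.zeros([L, V], dtype=np.float32)
    for j in range(1, L + 1):
        for k in range(1, V + 1):
            encoding[j - 1, k - 1] = (1.0 - float(j) / L) - (float(k) / V) * (1.0 - 2.0 * float(j) / L)
    return encoding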
def build(self):
    params = self.params
    N, L, Q, F = params.batch_size, params.max_sent_size, params.max_ques_size, params.max_fact_count
    V, d, A = params.embed_size, params.hidden_size, self.words.vocab_size

    # initialize self
    # placeholders
    input = tf.placeholder('int32', shape=[N, F, L], name='x')      # [num_batch, fact_count, sentence_len]
    question = tf.placeholder('int32', shape=[N, Q], name='q')      # [num_batch, question_len]
    answer = tf.placeholder('int32', shape=[N], name='y')           # [num_batch] - one-word answer
    fact_counts = tf.placeholder('int64', shape=[N], name='fc')     # how many facts each question has
    input_mask = tf.placeholder('float32', shape=[N, F, L, V], name='xm')  # [num_batch, fact_count, sentence_len, embed_size]
    is_training = tf.placeholder(tf.bool)
    self.att = tf.constant(0.)

    # Prepare parameters
    gru = rnn_cell.GRUCell(d)  # a GRU cell with d hidden dimensions
    l = self.positional_encoding()  # positional-encoding matrix; the input words of each sentence are weighted by it
    embedding = weight('embedding', [A, V], init='uniform', range=3**(1 / 2))  # embedding matrix [vocab_size, embed_size]

    with tf.name_scope('SentenceReader'):
        input_list = tf.unpack(tf.transpose(input))  # L x [F, N]: sentence length x [fact_count, batch_size]
        input_embed = []
        for facts in input_list:  # iterate up to the maximum sentence length
            facts = tf.unpack(facts)  # at each word position there are F x N word ids
            embed = tf.pack([tf.nn.embedding_lookup(embedding, w) for w in facts])  # [F, N, V]: look the words up in the embedding matrix
            input_embed.append(embed)  # collect the embeddings for each word position

        # apply positional encoding
        input_embed = tf.transpose(tf.pack(input_embed), [2, 1, 0, 3])  # [N, F, L, V]: embeddings for all words
        encoded = l * input_embed * input_mask  # weight the embeddings by position and mask out padding
        facts = tf.reduce_sum(encoded, 2)  # [N, F, V]: sum the word vectors of each sentence into one fact vector

    # ---- up to here, all the embedding has been done ----

    # dropout time
    facts = dropout(facts, params.keep_prob, is_training)  # apply dropout

    with tf.name_scope('InputFusion'):
        # Bidirectional RNN
        with tf.variable_scope('Forward'):
            forward_states, _ = tf.nn.dynamic_rnn(gru, facts, fact_counts, dtype=tf.float32)  # dynamic RNN over the facts
            # This Forward/Backward pair could be replaced with a single bidirectional
            # dynamic RNN (see the sketch after this function).

        with tf.variable_scope('Backward'):
            facts_reverse = tf.reverse_sequence(facts, fact_counts, 1)  # reverse the facts
            backward_states, _ = tf.nn.dynamic_rnn(gru, facts_reverse, fact_counts, dtype=tf.float32)  # fact_counts tells the RNN where to stop

        # Use both forward and backward states
        facts = forward_states + backward_states  # [N, F, d]: sum them up

    with tf.variable_scope('Question'):
        ques_list = tf.unpack(tf.transpose(question))  # unpack the placeholder into a list
        ques_embed = [tf.nn.embedding_lookup(embedding, w) for w in ques_list]  # embed the question words via embedding lookup
        _, question_vec = tf.nn.rnn(gru, ques_embed, dtype=tf.float32)  # run them through an RNN and take the final state as the question vector
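# As noted in the comment above, the Forward/Backward pair could be replaced with
# tf.nn.bidirectional_dynamic_rnn. A minimal sketch under that assumption, reusing
# the `facts`, `fact_counts`, and `d` defined above; two separate GRU cells are
# created so the two directions do not share weights:
with tf.name_scope('InputFusionBidirectional'):
    (fw_states, bw_states), _ = tf.nn.bidirectional_dynamic_rnn(
        rnn_cell.GRUCell(d), rnn_cell.GRUCell(d), facts,
        sequence_length=fact_counts, dtype=tf.float32)
    facts = fw_states + bw_states  # [N, F, d], same shape as the sum above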