def build_simple_ic_model(sentence_input,
                          img_features_input,
                          dropout_input,
                          num_tokens,
                          num_labels,
                          embeddings,
                          embeddings_size,
                          train_embeddings,
                          rnn_hidden_size,
                          multimodal_fusion_hidden_size,
                          classification_hidden_size):
    # Sequence lengths from the padding mask (token id 0 is padding).
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    # Word embeddings: pre-trained GloVe if provided, random normal otherwise.
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)

    # LSTM sentence encoder.
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)

    # Multimodal fusion: element-wise product of gated projections.
    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=1)
    gated_sentence_hidden_layer = tf.nn.dropout(
        gated_tanh(sentence_final_states.h, multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    gated_img_hidden_layer = tf.nn.dropout(
        gated_tanh(normalized_img_features, multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    sentence_img_multimodal_fusion = tf.multiply(gated_sentence_hidden_layer,
                                                 gated_img_hidden_layer)

    # Classifier: three gated-tanh layers followed by a linear output layer.
    gated_first_layer = tf.nn.dropout(
        gated_tanh(sentence_img_multimodal_fusion, classification_hidden_size),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
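# `gated_tanh` and `glove_embeddings_initializer` are helpers defined elsewhere
# in the repository; the builders in this file also assume `import tensorflow
# as tf` and that the bare `DropoutWrapper` name refers to TF 1.x's
# tf.nn.rnn_cell.DropoutWrapper. As a point of reference only, a minimal sketch
# of a gated tanh unit compatible with the call sites in this file (both the
# `(x, size)` form and the `W_plus_b=...` form) could look like the following;
# this is an assumption, not the repository's actual implementation:
import tensorflow as tf


def gated_tanh(x, output_size=None, W_plus_b=None, W_plus_b_prime=None):
    """Gated tanh unit: tanh(W x + b) * sigmoid(W' x + b')."""
    if W_plus_b is None:
        W_plus_b = lambda v: tf.contrib.layers.fully_connected(
            v, output_size, activation_fn=None)
    if W_plus_b_prime is None:
        W_plus_b_prime = lambda v: tf.contrib.layers.fully_connected(
            v, output_size, activation_fn=None)
    y_tilde = tf.tanh(W_plus_b(x))
    gate = tf.sigmoid(W_plus_b_prime(x))
    return tf.multiply(y_tilde, gate)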
def build_simple_te_model_h(premise_input,
                            hypothesis_input,
                            dropout_input,
                            num_tokens,
                            num_labels,
                            embeddings,
                            embeddings_size,
                            train_embeddings,
                            rnn_hidden_size,
                            classification_hidden_size):
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                   hypothesis_input)

    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)

    gated_first_layer = tf.nn.dropout(
        gated_tanh(hypothesis_final_states.h, classification_hidden_size),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
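# `glove_embeddings_initializer(embeddings)` is expected to return a TF
# initializer built from a pre-loaded (num_tokens, embeddings_size) GloVe
# matrix. A minimal sketch consistent with that usage; this is an assumption,
# the real helper may differ (e.g. by adding rows for special tokens):
import numpy as np


def glove_embeddings_initializer(embeddings):
    """Wrap a pre-loaded GloVe matrix as a constant initializer."""
    return tf.constant_initializer(np.asarray(embeddings, dtype=np.float32))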
def build_tl_mt_model(sentence_input,
                      premise_input,
                      hypothesis_input,
                      img_features_input,
                      dropout_input,
                      num_tokens,
                      num_ic_labels,
                      num_vte_labels,
                      embeddings,
                      embeddings_size,
                      num_img_features,
                      img_features_size,
                      train_embeddings,
                      rnn_hidden_size,
                      multimodal_fusion_hidden_size,
                      classification_hidden_size):
    # Sequence lengths from the padding masks (token id 0 is padding).
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    premise_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(premise_input,
                             tf.zeros_like(premise_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    # Word embeddings shared by the sentence (IC) and premise/hypothesis (VTE) towers.
    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)
    premise_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                premise_input)
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                   hypothesis_input)

    # A single LSTM cell encodes all three inputs, so the encoder is shared.
    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)
    premise_outputs, premise_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=premise_embeddings,
        sequence_length=premise_length,
        dtype=tf.float32)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)

    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    # --- Image classification (IC) branch: the sentence attends over image regions ---
    reshaped_sentence = tf.reshape(
        tf.tile(sentence_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_sentence_concatenation = tf.concat(
        [normalized_img_features, reshaped_sentence], -1)
    gated_img_sentence_concatenation = tf.nn.dropout(
        gated_tanh(img_sentence_concatenation, rnn_hidden_size),
        keep_prob=dropout_input)
    att_wa_sentence = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_sentence = att_wa_sentence(gated_img_sentence_concatenation)
    a_sentence = tf.nn.softmax(tf.squeeze(a_sentence))
    v_head_sentence = tf.squeeze(
        tf.matmul(tf.expand_dims(a_sentence, 1), normalized_img_features))

    # Gated projections of the sentence and of the attended image features; the
    # scopes are created here and reused below for the premise and hypothesis.
    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_sentence_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_sentence_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime)
    gated_sentence = tf.nn.dropout(
        gated_tanh(sentence_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_sentence_W_plus_b,
                   W_plus_b_prime=gated_sentence_W_plus_b_prime),
        keep_prob=dropout_input)

    v_head_sentence.set_shape(
        (sentence_embeddings.get_shape()[0], img_features_size))

    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_sentence_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_sentence_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime)
    gated_img_features_sentence = tf.nn.dropout(
        gated_tanh(v_head_sentence,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_img_features_sentence_W_plus_b,
                   W_plus_b_prime=gated_img_features_sentence_W_plus_b_prime),
        keep_prob=dropout_input)

    # Note: this name is reused below for the VTE branch; here it fuses the
    # sentence with the attended image features.
    h_premise_img = tf.multiply(gated_sentence, gated_img_features_sentence)

    # First classification layer of the IC head; its scopes are reused by the VTE head.
    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_first_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_first_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime)
    gated_first_layer = tf.nn.dropout(
        gated_tanh(h_premise_img,
                   W_plus_b=gated_first_layer_W_plus_b,
                   W_plus_b_prime=gated_first_layer_W_plus_b_prime),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input)
    ic_classification = tf.nn.dropout(
        tf.contrib.layers.fully_connected(gated_third_layer,
                                          num_ic_labels,
                                          activation_fn=None),
        keep_prob=dropout_input)

    # --- Visual-textual entailment (VTE) branch: premise and hypothesis attend
    # over image regions with their own attention weights ---
    reshaped_premise = tf.reshape(
        tf.tile(premise_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_premise_concatenation = tf.concat(
        [normalized_img_features, reshaped_premise], -1)
    gated_img_premise_concatenation = tf.nn.dropout(
        gated_tanh(img_premise_concatenation, rnn_hidden_size),
        keep_prob=dropout_input)
    att_wa_premise = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_premise = att_wa_premise(gated_img_premise_concatenation)
    a_premise = tf.nn.softmax(tf.squeeze(a_premise))
    v_head_premise = tf.squeeze(
        tf.matmul(tf.expand_dims(a_premise, 1), normalized_img_features))

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(
        gated_tanh(img_hypothesis_concatenation, rnn_hidden_size),
        keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis))
    v_head_hypothesis = tf.squeeze(
        tf.matmul(tf.expand_dims(a_hypothesis, 1), normalized_img_features))

    # Premise and hypothesis reuse the multimodal fusion weights created for
    # the sentence above (reuse=True on the same scopes).
    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_premise_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_premise_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_premise = tf.nn.dropout(
        gated_tanh(premise_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_premise_W_plus_b,
                   W_plus_b_prime=gated_premise_W_plus_b_prime),
        keep_prob=dropout_input)

    with tf.variable_scope(
            "gated_sentence_scope_W_plus_b") as gated_sentence_scope_W_plus_b:
        gated_hypothesis_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_sentence_scope_W_plus_b_prime"
                           ) as gated_sentence_scope_W_plus_b_prime:
        gated_hypothesis_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_hypothesis = tf.nn.dropout(
        gated_tanh(hypothesis_final_states.h,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_hypothesis_W_plus_b,
                   W_plus_b_prime=gated_hypothesis_W_plus_b_prime),
        keep_prob=dropout_input)

    v_head_premise.set_shape(
        (premise_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_premise_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_premise_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_img_features_premise = tf.nn.dropout(
        gated_tanh(v_head_premise,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_img_features_premise_W_plus_b,
                   W_plus_b_prime=gated_img_features_premise_W_plus_b_prime),
        keep_prob=dropout_input)

    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], img_features_size))
    with tf.variable_scope("gated_img_features_sentence_scope_W_plus_b"
                           ) as gated_img_features_sentence_scope_W_plus_b:
        gated_img_features_hypothesis_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope(
            "gated_img_features_sentence_scope_W_plus_b_prime"
    ) as gated_img_features_sentence_scope_W_plus_b_prime:
        gated_img_features_hypothesis_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            multimodal_fusion_hidden_size,
            activation_fn=None,
            scope=gated_img_features_sentence_scope_W_plus_b_prime,
            reuse=True)
    gated_img_features_hypothesis = tf.nn.dropout(
        gated_tanh(v_head_hypothesis,
                   multimodal_fusion_hidden_size,
                   W_plus_b=gated_img_features_hypothesis_W_plus_b,
                   W_plus_b_prime=gated_img_features_hypothesis_W_plus_b_prime),
        keep_prob=dropout_input)

    h_premise_img = tf.multiply(gated_premise, gated_img_features_premise)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    # The first classification layer is shared with the IC head (reuse=True).
    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_h_premise_img_hidden_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_h_premise_hidden_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime,
            reuse=True)
    gated_h_premise_img_hidden_layer = tf.nn.dropout(
        gated_tanh(h_premise_img,
                   W_plus_b=gated_h_premise_img_hidden_layer_W_plus_b,
                   W_plus_b_prime=gated_h_premise_hidden_layer_W_plus_b_prime),
        keep_prob=dropout_input)

    with tf.variable_scope("gated_first_layer_scope_W_plus_b"
                           ) as gated_first_layer_scope_W_plus_b:
        gated_h_hypothesis_img_hidden_layer_W_plus_b = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b,
            reuse=True)
    with tf.variable_scope("gated_first_layer_scope_W_plus_b_prime"
                           ) as gated_first_layer_scope_W_plus_b_prime:
        gated_h_hypothesis_hidden_layer_W_plus_b_prime = lambda x: tf.contrib.layers.fully_connected(
            x,
            classification_hidden_size,
            activation_fn=None,
            scope=gated_first_layer_scope_W_plus_b_prime,
            reuse=True)
    gated_h_hypothesis_img_hidden_layer = tf.nn.dropout(
        gated_tanh(
            h_hypothesis_img,
            W_plus_b=gated_h_hypothesis_img_hidden_layer_W_plus_b,
            W_plus_b_prime=gated_h_hypothesis_hidden_layer_W_plus_b_prime),
        keep_prob=dropout_input)

    final_concatenation = tf.concat(
        [gated_h_premise_img_hidden_layer, gated_h_hypothesis_img_hidden_layer],
        1)
    gated_first_layer = tf.nn.dropout(
        gated_tanh(final_concatenation, classification_hidden_size),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input)
    vte_classification = tf.nn.dropout(
        tf.contrib.layers.fully_connected(gated_second_layer,
                                          num_vte_labels,
                                          activation_fn=None),
        keep_prob=dropout_input)

    return ic_classification, vte_classification
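# The transfer-learning / multi-task builder above shares the multimodal fusion
# and first classification layers across the sentence, premise, and hypothesis
# towers by re-entering the same variable scopes with reuse=True. The same
# pattern, reduced to a toy example with hypothetical names:
def _shared_projection_demo():
    input_a = tf.placeholder(tf.float32, (None, 300), name="input_a")
    input_b = tf.placeholder(tf.float32, (None, 300), name="input_b")

    with tf.variable_scope("shared_fc") as shared_fc_scope:
        # First call creates shared_fc/weights and shared_fc/biases.
        proj_a = tf.contrib.layers.fully_connected(
            input_a, 128, activation_fn=None, scope=shared_fc_scope)

    # reuse=True makes the second call read the same variables instead of
    # creating new ones, so both towers are projected by identical weights.
    proj_b = tf.contrib.layers.fully_connected(
        input_b, 128, activation_fn=None, scope=shared_fc_scope, reuse=True)
    return proj_a, proj_b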
def build_bottom_up_top_down_ic_model(sentence_input,
                                      img_features_input,
                                      dropout_input,
                                      num_tokens,
                                      num_labels,
                                      embeddings,
                                      embeddings_size,
                                      num_img_features,
                                      img_features_size,
                                      train_embeddings,
                                      rnn_hidden_size,
                                      multimodal_fusion_hidden_size,
                                      classification_hidden_size):
    sentence_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(sentence_input,
                             tf.zeros_like(sentence_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    if embeddings is not None:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=glove_embeddings_initializer(embeddings),
            trainable=train_embeddings)
        print("Loaded GloVe embeddings!")
    else:
        embedding_matrix = tf.get_variable(
            "embedding_matrix",
            shape=(num_tokens, embeddings_size),
            initializer=tf.random_normal_initializer(stddev=0.05),
            trainable=train_embeddings)
    sentence_embeddings = tf.nn.embedding_lookup(embedding_matrix,
                                                 sentence_input)

    lstm_cell = DropoutWrapper(tf.nn.rnn_cell.LSTMCell(rnn_hidden_size),
                               input_keep_prob=dropout_input,
                               output_keep_prob=dropout_input)
    sentence_outputs, sentence_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=sentence_embeddings,
        sequence_length=sentence_length,
        dtype=tf.float32)

    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_sentence = tf.reshape(
        tf.tile(sentence_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_sentence_concatenation = tf.concat(
        [normalized_img_features, reshaped_sentence], -1)
    gated_img_sentence_concatenation = gated_tanh(img_sentence_concatenation,
                                                  rnn_hidden_size)
    att_wa_sentence = lambda x: tf.contrib.layers.fully_connected(
        x, 1, activation_fn=None, biases_initializer=None)
    a_sentence = att_wa_sentence(gated_img_sentence_concatenation)
    a_sentence = tf.nn.softmax(tf.squeeze(a_sentence))
    v_head_sentence = tf.squeeze(
        tf.matmul(tf.expand_dims(a_sentence, 1), normalized_img_features))
    v_head_sentence.set_shape(
        (sentence_embeddings.get_shape()[0], img_features_size))

    gated_sentence = tf.nn.dropout(
        gated_tanh(sentence_final_states.h, multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    gated_img_features_sentence = tf.nn.dropout(
        gated_tanh(v_head_sentence, multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    h_sentence_img = tf.multiply(gated_sentence, gated_img_features_sentence)

    gated_first_layer = tf.nn.dropout(
        gated_tanh(h_sentence_img, classification_hidden_size),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input)

    return tf.contrib.layers.fully_connected(gated_third_layer,
                                             num_labels,
                                             activation_fn=None)
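# A minimal usage sketch for the bottom-up top-down IC builder above. The
# placeholder shapes and hyperparameter values (36 regions of 2048-d features,
# 512-d hidden sizes, a 20k-token vocabulary) are illustrative assumptions, not
# values taken from this repository's training scripts:
def _example_ic_graph():
    sentence_input = tf.placeholder(tf.int32, (None, None),
                                    name="sentence_input")
    img_features_input = tf.placeholder(tf.float32, (None, 36, 2048),
                                        name="img_features_input")
    dropout_input = tf.placeholder(tf.float32, name="dropout_input")

    logits = build_bottom_up_top_down_ic_model(
        sentence_input,
        img_features_input,
        dropout_input,
        num_tokens=20000,
        num_labels=1000,
        embeddings=None,  # pass a pre-loaded GloVe matrix to use it instead
        embeddings_size=300,
        num_img_features=36,
        img_features_size=2048,
        train_embeddings=True,
        rnn_hidden_size=512,
        multimodal_fusion_hidden_size=512,
        classification_hidden_size=512)
    return logits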
def build_model(config, embeddings, mode, ilabel2itoken=None,
                inference_batch=None):
    """Builds the joint classification / explanation-generation graph.

    Args:
        config: object containing configuration parameters.
        embeddings: optional pre-trained GloVe matrix of shape
            [config.vocab_size, config.embedding_size]; if None, the word
            embeddings are randomly initialized.
        mode: "train" or "inference".
        ilabel2itoken: optional mapping from label ids to token ids, used to
            prepend the gold label to the explanation input sequence at
            training time.
        inference_batch: batch size of the input data; required when mode is
            "inference", otherwise leave it as None.
    """
    assert mode in ["train", "inference"]
    if mode == 'inference' and inference_batch is None:
        raise ValueError(
            "When inference mode, inference_batch must be provided!")

    # To match the "Show and Tell" paper we initialize all variables with a
    # random uniform initializer.
    initializer = tf.random_uniform_initializer(
        minval=-config.initializer_scale, maxval=config.initializer_scale)

    ### Inputs for VQA model ###
    hypothesis_input = tf.placeholder(tf.int32, (None, None),
                                      name="hypothesis_input")
    img_features_input = tf.placeholder(
        tf.float32, (None, config.num_img_features, config.img_features_size),
        name="img_features_input")
    label_input = tf.placeholder(tf.int32, (None,), name="label_input")
    dropout_input = tf.placeholder(tf.float32, name="dropout_input")

    ### Inputs for explanation generation ###
    # An int32 Tensor with shape [batch_size, padded_length].
    input_seqs = tf.placeholder(tf.int32, [None, None], name='input_seqs')
    # An int32 Tensor with shape [batch_size, padded_length].
    target_seqs = tf.placeholder(tf.int32, [None, None], name='target_seqs')
    # A float32 Tensor with shape [1].
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    # An int32 0/1 Tensor with shape [batch_size, padded_length].
    input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask')
    # A float32 Tensor with shape [batch_size, image_feature_size].
    image_feature = tf.placeholder(tf.float32,
                                   [None, config.image_feature_size],
                                   name='image_feature')

    # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
    seq_embedding = None
    # A float32 scalar Tensor; the total loss for the trainer to optimize.
    total_loss = None
    # A float32 Tensor with shape [batch_size * padded_length].
    target_cross_entropy_losses = None
    # A float32 Tensor with shape [batch_size * padded_length].
    target_cross_entropy_loss_weights = None
    # Collection of variables from the inception submodel.
    inception_variables = []

    # Sets up the global step Tensor.
    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    # Dynamic batch size.
    batch_size = tf.shape(hypothesis_input)[0]

    # Table to map label_id to token_id.
    if ilabel2itoken:
        keys = list(ilabel2itoken.keys())
        values = [ilabel2itoken[k] for k in keys]
        ilabel2itoken_table = tf.contrib.lookup.HashTable(
            tf.contrib.lookup.KeyValueTensorInitializer(keys,
                                                        values,
                                                        key_dtype=tf.int32,
                                                        value_dtype=tf.int32),
            -1)

    ### Builds the input sequence embeddings ###
    # Inputs:
    #   self.input_seqs
    # Outputs:
    #   self.seq_embeddings
    ############################################
    # with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
    #     if glove_vocab is None:
    #         embedding_map = tf.get_variable(
    #             name="map",
    #             shape=[config.vocab_size, config.embedding_size],
    #             initializer=initializer)
    #     else:
    #         init = tf.constant(glove_vocab.astype('float32'))
    #         embedding_map = tf.get_variable(
    #             name="map",
    #             initializer=init)
    #     seq_embedding = tf.nn.embedding_lookup(embedding_map, input_seqs)
    with tf.variable_scope("hypothesis_embeddings"), tf.device("/cpu:0"):
        if embeddings is not None:
            embedding_map = tf.get_variable(
                "map",
                shape=[config.vocab_size, config.embedding_size],
                initializer=glove_embeddings_initializer(embeddings),
                trainable=config.train_embeddings)
            print("Loaded GloVe embeddings!")
        else:
            embedding_map = tf.get_variable(
                "map",
                shape=[config.vocab_size, config.embedding_size],
                initializer=tf.random_normal_initializer(stddev=0.05),
                trainable=config.train_embeddings  # TODO
            )
    hypothesis_embeddings = tf.nn.embedding_lookup(embedding_map,
                                                   hypothesis_input)

    ############ Builds the model ##############
    # Inputs:
    #   self.image_feature
    #   self.seq_embeddings
    #   self.target_seqs (training and eval only)
    #   self.input_mask (training and eval only)
    # Outputs:
    #   self.total_loss (training and eval only)
    #   self.target_cross_entropy_losses (training and eval only)
    #   self.target_cross_entropy_loss_weights (training and eval only)
    ############################################

    ############ VQA part ######################
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    lstm_cell = tf.nn.rnn_cell.DropoutWrapper(
        tf.nn.rnn_cell.LSTMCell(config.num_lstm_units),
        input_keep_prob=dropout_input,
        output_keep_prob=dropout_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)

    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, config.num_img_features]),
        [-1, config.num_img_features, config.num_lstm_units])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(
        gated_tanh(img_hypothesis_concatenation, config.num_lstm_units),
        keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis, axis=-1))
    v_head_hypothesis = tf.squeeze(
        tf.matmul(tf.expand_dims(a_hypothesis, 1), normalized_img_features),
        axis=1)

    gated_hypothesis = tf.nn.dropout(
        gated_tanh(hypothesis_final_states.h,
                   config.multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], config.img_features_size))
    gated_img_features_hypothesis = tf.nn.dropout(
        gated_tanh(v_head_hypothesis, config.multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    final_concatenation = tf.concat([h_hypothesis_img], 1)

    gated_first_layer = tf.nn.dropout(
        gated_tanh(final_concatenation, config.classification_hidden_size),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, config.classification_hidden_size),
        keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, config.classification_hidden_size),
        keep_prob=dropout_input)

    label_logits = tf.contrib.layers.fully_connected(gated_third_layer,
                                                     config.num_labels,
                                                     activation_fn=None)

    ############## Explanation generation part ######################
    multimodal_feature = final_concatenation

    if mode == 'train' and ilabel2itoken:
        # Prepend the gold label to the input sequence; this is done outside
        # of the build function in inference mode.
        pre_labels = ilabel2itoken_table.lookup(label_input)
        input_seqs = tf.concat([tf.expand_dims(pre_labels, 1), input_seqs],
                               axis=1)

    with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
        seq_embedding = tf.nn.embedding_lookup(embedding_map, input_seqs)

    lstm_cell_expl = tf.nn.rnn_cell.LSTMCell(num_units=config.num_lstm_units,
                                             state_is_tuple=True)
    lstm_cell_expl = tf.nn.rnn_cell.DropoutWrapper(lstm_cell_expl,
                                                   input_keep_prob=keep_prob,
                                                   output_keep_prob=keep_prob)

    # TODO: attention?
    # attn_meca = tf.contrib.seq2seq.BahdanauAttention(config.num_lstm_units, multimodal_feature)
    # attn_cell = tf.contrib.seq2seq.AttentionWrapper(lstm_cell_expl, attn_meca, output_attention=False)

    with tf.variable_scope("lstm", initializer=initializer) as lstm_scope:
        # Feed the image embeddings to set the initial LSTM state.
        if mode == 'train':
            zero_state = lstm_cell_expl.zero_state(batch_size=batch_size,
                                                   dtype=tf.float32)
            # zero_state = attn_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
        elif mode == 'inference':
            zero_state = lstm_cell_expl.zero_state(batch_size=inference_batch,
                                                   dtype=tf.float32)
            # zero_state = attn_cell.zero_state(batch_size=inference_batch, dtype=tf.float32)

        with tf.variable_scope('multimodal_embeddings'):
            multimodal_embeddings = tf.contrib.layers.fully_connected(
                inputs=multimodal_feature,
                num_outputs=config.embedding_size,
                activation_fn=None,
                weights_initializer=initializer,
                biases_initializer=None)

        _, initial_state = lstm_cell_expl(multimodal_embeddings, zero_state)
        # _, initial_state = attn_cell(multimodal_embeddings, zero_state)

        # Allow the LSTM variables to be reused.
        lstm_scope.reuse_variables()

        # Run the batch of sequence embeddings through the LSTM.
        sequence_length = tf.reduce_sum(input_mask, 1)
        lstm_outputs, final_state = tf.nn.dynamic_rnn(
            cell=lstm_cell_expl,
            inputs=seq_embedding,
            sequence_length=sequence_length,
            initial_state=initial_state,
            dtype=tf.float32,
            scope=lstm_scope)
        # lstm_outputs, final_state = tf.nn.dynamic_rnn(cell=attn_cell,
        #                                               inputs=seq_embedding,
        #                                               sequence_length=sequence_length,
        #                                               initial_state=initial_state,
        #                                               dtype=tf.float32,
        #                                               scope=lstm_scope)

    # Stack batches vertically.
    lstm_outputs = tf.reshape(
        lstm_outputs, [-1, lstm_cell_expl.output_size])  # output_size == 256
    # lstm_outputs = tf.reshape(lstm_outputs, [-1, attn_cell.output_size])  # output_size == 256

    with tf.variable_scope('logits'):
        W = tf.get_variable('W',
                            [lstm_cell_expl.output_size, config.vocab_size],
                            initializer=initializer)
        # W = tf.get_variable('W', [attn_cell.output_size, config.vocab_size], initializer=initializer)
        b = tf.get_variable('b', [config.vocab_size],
                            initializer=tf.constant_initializer(0.0))

    # logits: [batch_size * padded_length, config.vocab_size]
    logits = tf.matmul(lstm_outputs, W) + b

    ###### for inference & validation only #######
    softmax = tf.nn.softmax(logits)
    preds = tf.argmax(softmax, 1)
    ##############################################

    # For training only below.
    targets = tf.reshape(target_seqs, [-1])
    weights = tf.to_float(tf.reshape(input_mask, [-1]))

    # Compute losses.
    label_loss = tf.losses.sparse_softmax_cross_entropy(label_input,
                                                        label_logits)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
                                                            logits=logits)
    explanation_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
                              tf.reduce_sum(weights),
                              name="explanation_loss")
    batch_loss = ((1 - config.alpha) * explanation_loss
                  + config.alpha * label_loss)
    tf.contrib.losses.add_loss(batch_loss)
    total_loss = tf.contrib.losses.get_total_loss()

    # target_cross_entropy_losses = losses  # Used in evaluation.
    # target_cross_entropy_loss_weights = weights  # Used in evaluation.

    # TODO: what else should I return?
    return dict(total_loss=total_loss,
                global_step=global_step,
                image_feature=image_feature,
                input_mask=input_mask,
                target_seqs=target_seqs,
                input_seqs=input_seqs,
                final_state=final_state,
                initial_state=initial_state,
                softmax=softmax,
                preds=preds,
                keep_prob=keep_prob,
                saver=tf.train.Saver(),
                hypothesis_input=hypothesis_input,
                img_features_input=img_features_input,
                label_input=label_input,
                dropout_input=dropout_input,
                label_logits=label_logits,
                explanation_loss=explanation_loss,
                attention_output=a_hypothesis)
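# A rough training-step sketch around the dict returned by build_model. The
# `config` object, vocabulary, and real data batches come from the rest of the
# repository; the optimizer, learning rate, batch size, and dummy feed values
# below are assumptions for illustration only:
def _train_step_demo(config, embeddings):
    model = build_model(config, embeddings, mode="train")
    train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(
        model["total_loss"], global_step=model["global_step"])

    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        feed = {
            model["hypothesis_input"]: np.zeros((8, 12), dtype=np.int32),
            model["img_features_input"]: np.zeros(
                (8, config.num_img_features, config.img_features_size),
                dtype=np.float32),
            model["label_input"]: np.zeros((8,), dtype=np.int32),
            model["input_seqs"]: np.zeros((8, 15), dtype=np.int32),
            model["target_seqs"]: np.zeros((8, 15), dtype=np.int32),
            model["input_mask"]: np.ones((8, 15), dtype=np.int32),
            model["dropout_input"]: 0.5,   # keep probability for the VQA part
            model["keep_prob"]: 0.7,       # keep probability for the decoder LSTM
        }
        _, loss = sess.run([train_op, model["total_loss"]], feed_dict=feed)
        return loss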
def call(self, premise_input, hypothesis_input, img_features_input,
         label_input, target_expl, target_length, dropout_input, num_labels,
         num_img_features, img_features_size, rnn_hidden_size,
         multimodal_fusion_hidden_size, classification_hidden_size,
         max_length):
    hypothesis_length = tf.cast(
        tf.reduce_sum(
            tf.cast(
                tf.not_equal(hypothesis_input,
                             tf.zeros_like(hypothesis_input, dtype=tf.int32)),
                tf.int64), 1), tf.int32)

    hypothesis_embeddings = tf.nn.embedding_lookup(self.embedding_matrix,
                                                   hypothesis_input)
    hypothesis_outputs, hypothesis_final_states = tf.nn.dynamic_rnn(
        cell=self.lstm_cell,
        inputs=hypothesis_embeddings,
        sequence_length=hypothesis_length,
        dtype=tf.float32)

    normalized_img_features = tf.nn.l2_normalize(img_features_input, dim=2)

    reshaped_hypothesis = tf.reshape(
        tf.tile(hypothesis_final_states.h, [1, num_img_features]),
        [-1, num_img_features, rnn_hidden_size])
    img_hypothesis_concatenation = tf.concat(
        [normalized_img_features, reshaped_hypothesis], -1)
    gated_img_hypothesis_concatenation = tf.nn.dropout(
        gated_tanh(img_hypothesis_concatenation, rnn_hidden_size),
        keep_prob=dropout_input)
    att_wa_hypothesis = lambda x: tf.nn.dropout(
        tf.contrib.layers.fully_connected(
            x, 1, activation_fn=None, biases_initializer=None),
        keep_prob=dropout_input)
    a_hypothesis = att_wa_hypothesis(gated_img_hypothesis_concatenation)
    a_hypothesis = tf.nn.softmax(tf.squeeze(a_hypothesis))
    v_head_hypothesis = tf.squeeze(
        tf.matmul(tf.expand_dims(a_hypothesis, 1), normalized_img_features))

    gated_hypothesis = tf.nn.dropout(
        gated_tanh(hypothesis_final_states.h, multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    v_head_hypothesis.set_shape(
        (hypothesis_embeddings.get_shape()[0], img_features_size))
    gated_img_features_hypothesis = tf.nn.dropout(
        gated_tanh(v_head_hypothesis, multimodal_fusion_hidden_size),
        keep_prob=dropout_input)
    h_hypothesis_img = tf.multiply(gated_hypothesis,
                                   gated_img_features_hypothesis)

    # Features used to classify the label and to generate the explanation.
    final_concatenation = tf.concat([h_hypothesis_img], 1)

    # Classifier.
    gated_first_layer = tf.nn.dropout(
        gated_tanh(final_concatenation, classification_hidden_size),
        keep_prob=dropout_input)
    gated_second_layer = tf.nn.dropout(
        gated_tanh(gated_first_layer, classification_hidden_size),
        keep_prob=dropout_input)
    gated_third_layer = tf.nn.dropout(
        gated_tanh(gated_second_layer, classification_hidden_size),
        keep_prob=dropout_input)
    pred_label = tf.contrib.layers.fully_connected(gated_third_layer,
                                                   num_labels,
                                                   activation_fn=None)

    # GRU decoder generates the explanation from the multimodal features.
    # expl = (bs, T, 300)
    start_token = tf.constant('<start>', dtype=tf.string)
    end_token = tf.constant('<end>', dtype=tf.string)
    batch_size = tf.shape(hypothesis_input)[0]

    # if tf.reduce_all(tf.math.equal(mode, tf.constant('teacher', dtype=tf.string))):
    if self.mode == 'teacher':
        # Teacher forcing.
        print("teacher")
        hidden_t = self.decoder.reset_state(batch_size=batch_size)
        batch_start_token = tf.fill([batch_size], '<start>')
        batch_end_token = tf.fill([batch_size], '<end>')
        dec_input_t = tf.expand_dims(
            self.token2id_table.lookup(batch_start_token), 1)
        all_predictions = []

        # TODO: why does target_expl.shape[1] give None?
        # Replacing it with max_length for now, which is not ideal.
        # for t in range(1, tf.shape(target_expl)[1]):
        # for t in tf.range(self.explanation_length_input):
        for t in range(1, max_length + 1):
            # Pass the features through the decoder.
            predictions, hidden_t, attention_weights = self.decoder(
                dec_input_t, tf.expand_dims(final_concatenation, 1), hidden_t)

            # Prepend the label at the first step.
            if t == 1 and label_input is not None:
                labels = self.id2label_table.lookup(label_input)  # (bs,)
                dec_input_t = self.token2id_table.lookup(labels)  # (bs,)
                dec_input_t = tf.expand_dims(dec_input_t, 1)  # (bs, 1)
            # Otherwise use teacher forcing.
            # if t < max_length:
            elif t < max_length:
                dec_input_t = tf.expand_dims(target_expl[:, t], 1)
            else:
                dec_input_t = tf.expand_dims(
                    self.token2id_table.lookup(batch_end_token), 1)

            # predictions: (bs, 1, n_vocab)
            all_predictions.append(predictions)

        # all_predictions: (bs, T, n_vocab)
        all_predictions = tf.stack(all_predictions, axis=1)
        return pred_label, all_predictions
    else:
        print("forloop")
        # all_predictions = []
        # TODO: attention shape
        # attention_features_shape = 36
        # all_attention_plots = []
        # pred_expls is a list of strings of size batch_size
        # pred_expls = [""] * batch_size
        # finished = [False] * batch_size
        # pred_expls = tf.fill([batch_size], "")
        # finished = tf.fill([batch_size], False)  # TODO
        pred_expls = []
        pred_expls_words = []
        # finished = tf.zeros((batch_size))
        t = 0
        hidden_t = self.decoder.reset_state(batch_size=batch_size)
        batch_start_token = tf.fill([batch_size], '<start>')
        dec_input_t = tf.expand_dims(
            self.token2id_table.lookup(batch_start_token), 1)

        # TODO:
        # while t < max_length and tf.reduce_sum(finished) != batch_size:
        while t < max_length:
            t += 1
            # dec_output_t: (bs, max_vocab)
            # dec_output_t: (bs * max_length, max_vocab)
            dec_output_t, hidden_t, attention_weights = self.decoder(
                dec_input_t, tf.expand_dims(final_concatenation, 1), hidden_t)
            # predicted_id: (bs * max_length) or (bs * max_length, 1)
            predicted_id = tf.argmax(dec_output_t, axis=1)
            pred_expls.append(predicted_id)
            pred_expls_words.append(self.id2token_table.lookup(predicted_id))
            # TODO
            # completed = tf.where(predicted_id == self.token2id_table.lookup(end_token))
            # finished[completed] = 1
            if t > 1:
                # if True:
                # dec_input_t = tf.expand_dims(predicted_id, 1)
                dec_input_t = tf.reshape(predicted_id, [batch_size, 1])
            else:
                out_labels = tf.argmax(pred_label, axis=1)
                # pred_label ids --> label words --> token ids
                labels = self.id2label_table.lookup(out_labels)  # (bs,)
                dec_input_t = self.token2id_table.lookup(labels)  # (bs,)
                dec_input_t = tf.expand_dims(dec_input_t, 1)  # (bs, 1)

        # all_predictions.append(dec_output_t)
        # all_predictions = tf.stack(all_predictions, axis=1)
        pred_expls = tf.stack(pred_expls, axis=1)
        pred_expls_words = tf.stack(pred_expls_words, axis=1)
        return pred_label, pred_expls_words
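# The decoding loops in `call` rely on string/id lookup tables built elsewhere
# in the class (self.token2id_table, self.id2token_table, self.id2label_table).
# A minimal sketch of how such tables can be built with tf.contrib.lookup; the
# toy vocabulary and default values here are assumptions:
def build_token_tables(tokens):
    ids = tf.range(len(tokens), dtype=tf.int64)
    token2id_table = tf.contrib.lookup.HashTable(
        tf.contrib.lookup.KeyValueTensorInitializer(tf.constant(tokens), ids),
        default_value=0)
    id2token_table = tf.contrib.lookup.HashTable(
        tf.contrib.lookup.KeyValueTensorInitializer(ids, tf.constant(tokens)),
        default_value="<unk>")
    return token2id_table, id2token_table


if __name__ == "__main__":
    token2id, id2token = build_token_tables(
        ["<pad>", "<start>", "<end>", "a", "person", "outdoors"])
    lookup = token2id.lookup(tf.constant(["<start>", "person"]))
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        print(sess.run(lookup))  # e.g. [1 4]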