def step(prev_state_c, prev_state_h, tokens, seq_length, stop_indicator):
    input = tf.gather(tokens, tf.shape(tokens)[0] - 1)

    # Detect newly finished dialogues
    is_stop_token = tf.equal(input, stop_token)
    is_stop_dialogue_token = tf.equal(input, stop_dialogue_token)
    is_stop = tf.logical_or(is_stop_token, is_stop_dialogue_token)
    stop_indicator = tf.logical_or(stop_indicator, is_stop)  # flag newly finished dialogues

    # Increment seq_length only while the dialogue is not over
    seq_length = tf.where(stop_indicator, seq_length, tf.add(seq_length, 1))

    # Compute the next words. TODO: factorize with qgen.. but how?!
    with tf.variable_scope(self.scope_name, reuse=True):
        word_emb = utils.get_embedding(
            input,
            n_words=tokenizer.no_words,
            n_dim=config['word_embedding_size'],
            scope="word_embedding",
            reuse=True)

        inp_emb = tf.concat([word_emb, self.image_emb], axis=1)

        with tf.variable_scope("word_decoder"):
            lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                config['num_lstm_units'],
                layer_norm=False,
                dropout_keep_prob=1.0,
                reuse=True)

            state = tf.contrib.rnn.LSTMStateTuple(c=prev_state_c, h=prev_state_h)
            out, state = lstm_cell(inp_emb, state)

            # Store/update the state only while the dialogue is not finished
            # (i.e. after sampling the <?> token)
            cond = tf.greater_equal(seq_length, tf.subtract(tf.reduce_max(seq_length), 1))
            state_c = tf.where(cond, state.c, prev_state_c)
            state_h = tf.where(cond, state.h, prev_state_h)

        with tf.variable_scope('decoder_output'):
            output = utils.fully_connected(state_h, tokenizer.no_words, reuse=True)

            sampled_tokens = tf.cond(
                self.greedy,
                lambda: tf.argmax(output, 1),
                lambda: tf.reshape(tf.multinomial(output, 1), [-1]))
            sampled_tokens = tf.cast(sampled_tokens, tf.int32)

        tokens = tf.concat([tokens, tf.expand_dims(sampled_tokens, 0)], axis=0)  # check axis!

    return state_c, state_h, tokens, seq_length, stop_indicator
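# --- Illustrative only: a hypothetical driver that could wrap `step` in a tf.while_loop
# to sample a full dialogue. `initial_state_c`, `initial_state_h`, `initial_tokens`,
# `initial_seq_length`, `initial_stop` and `max_length` are assumed names, not part of
# the original file; only the signature of `step` above is taken from the source.
#
# def not_all_stopped(state_c, state_h, tokens, seq_length, stop_indicator):
#     # keep sampling while at least one dialogue is running and the transcript
#     # is shorter than the assumed max_length budget
#     return tf.logical_and(
#         tf.logical_not(tf.reduce_all(stop_indicator)),
#         tf.less(tf.shape(tokens)[0], max_length))
#
# _, _, sampled_tokens, final_length, _ = tf.while_loop(
#     not_all_stopped, step,
#     loop_vars=[initial_state_c, initial_state_h,
#                initial_tokens, initial_seq_length, initial_stop],
#     shape_invariants=[initial_state_c.get_shape(),
#                       initial_state_h.get_shape(),
#                       tf.TensorShape([None, None]),   # tokens grow along the time axis
#                       initial_seq_length.get_shape(),
#                       initial_stop.get_shape()])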
def __init__(self, config, no_words, no_answers, reuse=False, device=''):
    ResnetModel.__init__(self, "vqa", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
        self.batch_size = None

        #####################
        #   QUESTION
        #####################
        self._question = tf.placeholder(tf.int32, [self.batch_size, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [self.batch_size], name='seq_length')
        self._answer_count = tf.placeholder(tf.float32, [self.batch_size, no_answers], name='answer_count')
        self._is_training = tf.placeholder(tf.bool, name="is_training")

        dropout_keep = float(config.get("dropout_keep_prob", 1.0))
        dropout_keep = tf.cond(self._is_training,
                               lambda: tf.constant(dropout_keep),
                               lambda: tf.constant(1.0))

        word_emb = utils.get_embedding(self._question,
                                       n_words=no_words,
                                       n_dim=int(config["word_embedding_dim"]),
                                       scope="word_embedding",
                                       reuse=reuse)

        if config['glove']:
            self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
            word_emb = tf.concat([word_emb, self._glove], axis=2)

        self.question_lstm, self.all_lstm_states = rnn.variable_length_LSTM(
            word_emb,
            num_hidden=int(config["no_hidden_LSTM"]),
            dropout_keep_prob=dropout_keep,
            seq_length=self._seq_length,
            depth=int(config["no_LSTM_cell"]),
            scope="question_lstm",
            reuse=reuse)

        #####################
        #   IMAGES
        #####################
        self._image = tf.placeholder(tf.float32, [self.batch_size] + config['image']["dim"], name='image')

        self.image_out = get_image_features(image=self._image,
                                            question=self.question_lstm,
                                            is_training=self._is_training,
                                            scope_name=scope.name,
                                            config=config['image'],
                                            dropout_keep=dropout_keep)

        #####################
        #   COMBINE
        #####################
        activation_name = config["activation"]
        with tf.variable_scope('final_mlp'):
            self.question_embedding = utils.fully_connected(self.question_lstm,
                                                            config["no_question_mlp"],
                                                            activation=activation_name,
                                                            scope='question_mlp')
            self.image_embedding = utils.fully_connected(self.image_out,
                                                         config["no_image_mlp"],
                                                         activation=activation_name,
                                                         scope='image_mlp')

            full_embedding = self.image_embedding * self.question_embedding
            full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

            out = utils.fully_connected(full_embedding,
                                        config["no_hidden_final_mlp"],
                                        scope='layer1',
                                        activation=activation_name)
            out = tf.nn.dropout(out, dropout_keep)
            out = utils.fully_connected(out, no_answers, activation='linear', scope='layer2')

        # Soft loss: turn the clipped answer counts into a target distribution
        answer_count = tf.minimum(self._answer_count, 3)
        normalizing_sum = tf.maximum(1.0, tf.reduce_sum(answer_count, 1, keep_dims=True))
        self.answer_prob = answer_count / normalizing_sum

        self.soft_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
            logits=out, labels=self.answer_prob, name='soft_cross_entropy')
        self.soft_loss = self.soft_cross_entropy

        self.target_answer = tf.argmax(self._answer_count, axis=1)
        # unnorm_log_prob = tf.log(self._answer_count)
        # self.target_answer = tf.multinomial(unnorm_log_prob, num_samples=1)
        # self.target_answer = tf.reshape(self.target_answer, shape=[-1])

        self.hard_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=out, labels=self.target_answer, name='hard_cross_entropy')
        self.hard_loss = self.hard_cross_entropy

        if config['loss'] == 'soft':
            self.loss = self.soft_loss
        else:
            self.loss = self.hard_loss

        self.loss = tf.reduce_mean(self.loss)

        self.softmax = tf.nn.softmax(out, name='answer_prob')
        self.prediction = tf.argmax(out, axis=1, name='predicted_answer')  # no need to compute the softmax

        with tf.variable_scope('accuracy'):
            ind = tf.range(tf.shape(self.prediction)[0]) * no_answers + tf.cast(self.prediction, tf.int32)
            pred_count = tf.gather(tf.reshape(self._answer_count, [-1]), ind)

            self.extended_accuracy = tf.minimum(pred_count / 3.0, 1.0, name="extended_accuracy")
            self.accuracy = tf.reduce_mean(self.extended_accuracy)

        # Scalar summaries require scalar tensors, so the per-example losses are averaged first
        tf.summary.scalar('soft_loss', tf.reduce_mean(self.soft_loss))
        tf.summary.scalar('hard_loss', tf.reduce_mean(self.hard_loss))
        tf.summary.scalar('accuracy', self.accuracy)

        print('Model... build!')
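# --- Illustrative only: a small NumPy check of the soft-target / extended-accuracy rule
# used above (min(count, 3) caps any single answer, counts are then normalized; the
# extended accuracy is min(count_of_predicted_answer / 3, 1)). The values are made up.
#
# import numpy as np
#
# answer_count = np.array([[0., 5., 1., 0.]])                         # annotator counts for 4 answers
# clipped = np.minimum(answer_count, 3)                               # [[0., 3., 1., 0.]]
# target = clipped / np.maximum(1.0, clipped.sum(1, keepdims=True))   # [[0., 0.75, 0.25, 0.]]
#
# prediction = 1                                                      # predicted answer index
# extended_accuracy = min(answer_count[0, prediction] / 3.0, 1.0)     # 1.0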
def __init__(self, config, num_words_question, num_words_description=None, device='', reuse=False):
    ResnetModel.__init__(self, "oracle", device=device)

    with open("data/dict_word_embedding_{}_{}.pickle".format(
            "fasttext", config["model"]["question"]["embedding_type"]), "rb") as f:
        dict_all_embedding = pickle.load(f)

    with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
        embeddings = []
        co_attention = [None, None, None, None]
        self.batch_size = None
        max_seq_length = 12

        # QUESTION
        if config['inputs']['question']:
            self._is_training = tf.placeholder(tf.bool, name="is_training")
            # self._question_word = tf.placeholder(tf.int32, [self.batch_size], name='question_word')
            # Restored: self._question is used below to build the word embedding
            self._question = tf.placeholder(tf.int32, [self.batch_size, 14], name='question')
            self.seq_length_question = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question')

            if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                print("****** WITH EMBEDDING ******")
                word_emb = utils.get_embedding(self._question,
                                               n_words=num_words_question,
                                               n_dim=int(config["model"]["word_embedding_dim"]),
                                               scope="word_embedding",
                                               dict_all_embedding=dict_all_embedding)
            else:
                print("****** NO EMBEDDING ******")
                word_emb = utils.get_embedding(self._question,
                                               n_words=num_words_question,
                                               n_dim=int(config["model"]["word_embedding_dim"]),
                                               scope="word_embedding",
                                               dict_all_embedding=[])

            print(".... word_emb 1 = {}".format(word_emb))
            self.out_question = None

            if config['model']['question']['lstm']:
                self.lstm_states_question, self.lstm_all_state_ques = rnn.variable_length_LSTM(
                    word_emb,
                    num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                    seq_length=self.seq_length_question)
                self.out_question = self.lstm_all_state_ques
                # self.out_question = tf.reshape(self.out_question,
                #     [-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2]])
            else:
                self.out_question = word_emb

            if config["model"]["attention"]["co-attention"]:
                # Tensor("oracle/lstm/lstmcell0/concat:0", shape=(?, 14, 1024), dtype=float32)
                co_attention[0] = self.out_question
                embeddings.append(self.lstm_states_question)
            else:
                embeddings.append(self.lstm_states_question)

        # QUESTION-POS
        if config['model']['question']['pos']:
            print("----------------------------------------")
            print("**** Oracle_network | input = question-pos")
            self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='question_pos')
            self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_ques_pos')

            word_emb = utils.get_embedding(self._question_pos,
                                           n_words=num_words_question,
                                           n_dim=100,
                                           scope="word_embedding_pos")

            if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                self._glove = tf.placeholder(tf.float32,
                                             [None, None, int(config["model"]["word_embedding_dim"])],
                                             name="embedding_vector_ques_pos")
                word_emb = tf.concat([word_emb, self._glove], axis=2)
            else:
                print("None ****************")

            lstm_states, _ = rnn.variable_length_LSTM(
                word_emb,
                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                seq_length=self.seq_length_pos, scope="lstm2")
            # embeddings.append(lstm_states)

        # DESCRIPTION
        if config['inputs']['description']:
            print("**** Oracle_network | input = description")
            self._description = tf.placeholder(tf.int32, [self.batch_size, None], name='description')
            self.seq_length_description = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_description')

            word_emb = utils.get_embedding(self._description,
                                           n_words=num_words_question,
                                           n_dim=100,
                                           reuse=True,
                                           scope="word_embedding")

            if config['model']['question']['lstm']:
                self.lstm_states_des, self.lstm_all_state_des = rnn.variable_length_LSTM(
                    word_emb,
                    num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                    seq_length=self.seq_length_description, scope="lstm3")
                self.out_question = self.lstm_states_des
                # self.out_question = tf.reshape(self.out_question,
                #     [-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2]])
            else:
                self.out_question = word_emb

            if config["model"]["attention"]["co-attention"]:
                # co_attention[1] = self.out_question
                # embeddings.append(self.lstm_all_state_ques)
                embeddings.append(self.lstm_states_des)
            else:
                embeddings.append(self.lstm_states_des)

        # HISTORY QUESTIONS
        if config['inputs']['history_question']:
            placeholders_lstmQuestion = []
            placeholders_lstmLength = []
            for i in range(6):
                self._embWord = tf.placeholder(tf.int32, [self.batch_size, 14],
                                               name="ques_hist_H{}".format(i))
                self.seq_length_question_history = tf.placeholder(
                    tf.int32, [self.batch_size],
                    name='seq_length_question_history_H{}'.format(i))

                self.word_emb = utils.get_embedding(self._embWord,
                                                    n_words=num_words_question,
                                                    n_dim=100,
                                                    reuse=True,
                                                    scope="word_embedding")
                placeholders_lstmQuestion.append(self.word_emb)
                placeholders_lstmLength.append(self.seq_length_question_history)

            self.lstm_states, self.lstm_all_state_ques_hist = rnn.variable_length_LSTM(
                placeholders_lstmQuestion,
                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                seq_length=placeholders_lstmLength, scope="lstm4", dim_4=True)

            if config["model"]["attention"]["co-attention"]:
                co_attention[2] = self.lstm_states
            else:
                embeddings.append(self.lstm_states)

        # DESCRIPTION-POS
        if config['model']['description']['pos']:
            print("----------------------------------------")
            print("**** Oracle_network | input = description-pos")
            self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='des_pos')
            self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_des_pos')

            word_emb = utils.get_embedding(self._question_pos,
                                           n_words=num_words_question,
                                           n_dim=300,
                                           scope="word_embedding_pos")

            if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                self._glove = tf.placeholder(tf.float32,
                                             [None, None, int(config["model"]["word_embedding_dim"])],
                                             name="embedding_vector_des_pos")
                word_emb = tf.concat([word_emb, self._glove], axis=2)
            else:
                print("None ****************")

            lstm_states, _ = rnn.variable_length_LSTM(
                word_emb,
                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                seq_length=self.seq_length_pos, scope="lstm5")
            # embeddings.append(lstm_states)

        # CATEGORY
        if config['inputs']['category']:
            print("**** Oracle_network | input = category")
            if config["model"]["category"]["use_embedding"]:
                self._category = tf.placeholder(tf.float32,
                                                [self.batch_size, int(config["model"]["word_embedding_dim"])],
                                                name='category')
                cat_emb = self._category
                # cat_emb = utils.get_embedding(self._category,
                #                               int(config['model']['category']["n_categories"]) + 1,
                #                               n_dim=int(config["model"]["word_embedding_dim"]),
                #                               scope="cat_embedding",
                #                               dict_all_embedding=dict_all_embedding)
            else:
                self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')
                cat_emb = utils.get_embedding(
                    self._category,
                    int(config['model']['category']["n_categories"]) + 1,  # we add the unknown category
                    int(config["model"]["word_embedding_dim"]),
                    scope="cat_embedding")

            # cat_emb = tf.expand_dims(cat_emb, 1)
            embeddings.append(cat_emb)
            print("Input: Category")

        # ALL-CATEGORY
        if config['inputs']['allcategory']:
            print("**** Oracle_network | input = allcategory")
            self._allcategory = tf.placeholder(tf.float32, [self.batch_size, 90], name='allcategory')
            # self.seq_length_allcategory = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_allcategory')
            # word_emb = utils.get_embedding(self._allcategory,
            #                                n_words=int(config['model']['category']["n_categories"]) + 1,
            #                                n_dim=int(config['model']['description']["embedding_dim"]),
            #                                scope="word_embedding_allcategory")
            # lstm_states, _ = rnn.variable_length_LSTM(word_emb,
            #                                           num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
            #                                           seq_length=self.seq_length_allcategory, scope="lstm3")
            # print("Oracle_network | embedding all_cate =", word_emb)
            # embeddings.append(self._allcategory)
            print("Input: allcategory")

        # SPATIAL
        if config['inputs']['spatial']:
            print("**** Oracle_network | input = spatial")
            self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')
            embeddings.append(self._spatial)
            print("Input: Spatial")

        # IMAGE
        if config['inputs']['image']:
            print("**** Oracle_network | input = image")
            self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
            self._image = tf.placeholder(tf.float32,
                                         [self.batch_size] + config['model']['image']["dim"],
                                         name='image')
            # self.image_out = tf.reshape(self._image, shape=[224 * 224 * 3])

            self.image_out = get_image_features(
                image=self._image,
                question=self.lstm_states_question,
                is_training=self._is_training,
                scope_name=scope.name,
                scope_feature="Image/",
                config=config['model']['image'])
            # embeddings.append(self.image_out)
            print("Input: Image")

            co_attention[3] = self.image_out
            print(" -- image_in = {}".format(self._image))

            # Flatten the feature map: (?, 7, 7, 2048) -> (?, 49 * 2048)
            image_feature = tf.reshape(self.image_out, shape=[-1, (7 * 7) * 2048])
            embeddings.append(image_feature)

        # CROP
        if config['inputs']['crop']:
            print("**** Oracle_network | input = crop")
            self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
            # self._crop_id = tf.placeholder(tf.float32, [self.batch_size], name='crop_id')
            self._crop = tf.placeholder(tf.float32,
                                        [self.batch_size] + config['model']['crop']["dim"],
                                        name='crop')

            if config["model"]["attention"]["co-attention"]:
                self.crop_out = get_image_features(
                    image=self._crop,
                    question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    config=config["model"]['crop'])
                co_attention[3] = self.crop_out
            else:
                self.crop_out = get_image_features(
                    image=self._crop,
                    question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    co_attention=False,
                    config=config["model"]['crop'])
                embeddings.append(self.crop_out)

            if config["model"]["crop"]["segment_crop"]["use"]:
                all_segment_crop = []
                # for i in range(10):
                self._segment_crop = tf.placeholder(tf.float32,
                                                    [self.batch_size] + config['model']['crop']["dim"],
                                                    name='crop_segment'.format(0))
                self.crop_out = get_image_features(
                    image=self._segment_crop,
                    question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name="test",
                    scope_feature="Segment/",
                    config=config["model"]['crop'])
                print("self.crop_out = {}".format(self.crop_out))
                # all_segment_crop.append(self.crop_out)

        # CO-ATTENTION over question, history and image features
        if config["model"]["attention"]["co-attention"]:
            question_feature, history_feature, image_feature = compute_all_attention(
                question_states=co_attention[0],
                caption=co_attention[1],
                history_states=co_attention[2],
                image_feature=co_attention[3],
                no_mlp_units=config['model']['attention']['no_attention_mlp'],
                config=config)
            embeddings.append(history_feature)
            embeddings.append(question_feature)
            embeddings.append(image_feature)

        print("*** All embeddings =", embeddings)

        # Compute the final embedding
        self.emb = tf.concat(embeddings, axis=1)
        print("*** self.emb =", self.emb)

        # OUTPUT
        num_classes = 3
        self._answer = tf.placeholder(tf.float32, [self.batch_size, num_classes], name='answer')

        with tf.variable_scope('mlp'):
            num_hiddens = config['model']['MLP']['num_hiddens']
            l1 = utils.fully_connected(self.emb, num_hiddens, activation='relu', scope='l1')
            self.pred = utils.fully_connected(l1, num_classes, activation='softmax', scope='softmax')
            self.best_pred = tf.argmax(self.pred, axis=1)

        print("--- predict = {}, answer = {}".format(self.pred, self._answer))

        self.loss = tf.reduce_mean(utils.cross_entropy(self.pred, self._answer))
        self.error = tf.reduce_mean(utils.error(self.pred, self._answer))
        print("loss = {}, error = {}".format(self.loss, self.error))

        print('Model... Oracle build!')
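# --- Illustrative only: a minimal soft-attention sketch over the flattened 7x7 image grid
# conditioned on the final question state. This is NOT the repository's compute_all_attention;
# the function name and layer sizes are assumptions, only the tensor shapes mirror the ones
# used above ((batch, 49, 2048) image features, (batch, q_dim) question state).
#
# def simple_image_attention(image_feature_map, question_state, no_mlp_units=256):
#     n_regions = tf.shape(image_feature_map)[1]
#     q = tf.expand_dims(question_state, axis=1)                   # (batch, 1, q_dim)
#     q = tf.tile(q, [1, n_regions, 1])                            # (batch, 49, q_dim)
#     joint = tf.concat([image_feature_map, q], axis=2)            # (batch, 49, 2048 + q_dim)
#     hidden = tf.layers.dense(joint, no_mlp_units, activation=tf.nn.tanh)
#     scores = tf.squeeze(tf.layers.dense(hidden, 1), axis=2)      # (batch, 49)
#     alpha = tf.nn.softmax(scores)                                # attention weight per region
#     alpha = tf.expand_dims(alpha, axis=2)                        # (batch, 49, 1)
#     return tf.reduce_sum(alpha * image_feature_map, axis=1)      # (batch, 2048)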
def __init__(self, config, num_words, device='', reuse=False):
    ResnetModel.__init__(self, "oracle", device=device)

    with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
        embeddings = []
        self.batch_size = None

        # QUESTION
        self._is_training = tf.placeholder(tf.bool, name="is_training")
        self._question = tf.placeholder(tf.int32, [self.batch_size, None], name='question')
        self._seq_length = tf.placeholder(tf.int32, [self.batch_size], name='seq_length')

        word_emb = utils.get_embedding(
            self._question,
            n_words=num_words,
            n_dim=int(config['model']['question']["embedding_dim"]),
            scope="word_embedding")

        lstm_states, _ = rnn.variable_length_LSTM(
            word_emb,
            num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
            seq_length=self._seq_length)
        embeddings.append(lstm_states)

        # CATEGORY
        if config['inputs']['category']:
            self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')

            cat_emb = utils.get_embedding(
                self._category,
                int(config['model']['category']["n_categories"]) + 1,  # we add the unknown category
                int(config['model']['category']["embedding_dim"]),
                scope="cat_embedding")
            embeddings.append(cat_emb)
            print("Input: Category")

        # SPATIAL
        if config['inputs']['spatial']:
            self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')
            embeddings.append(self._spatial)
            print("Input: Spatial")

        # IMAGE
        if config['inputs']['image']:
            self._image = tf.placeholder(tf.float32,
                                         [self.batch_size] + config['model']['image']["dim"],
                                         name='image')
            self.image_out = get_image_features(
                image=self._image,
                question=lstm_states,
                is_training=self._is_training,
                scope_name=scope.name,
                config=config['model']['image'])
            embeddings.append(self.image_out)
            print("Input: Image")

        # CROP
        if config['inputs']['crop']:
            self._crop = tf.placeholder(tf.float32,
                                        [self.batch_size] + config['model']['crop']["dim"],
                                        name='crop')
            self.crop_out = get_image_features(
                image=self._crop,
                question=lstm_states,
                is_training=self._is_training,
                scope_name=scope.name,
                config=config["model"]['crop'])
            embeddings.append(self.crop_out)
            print("Input: Crop")

        # Compute the final embedding
        emb = tf.concat(embeddings, axis=1)

        # OUTPUT
        num_classes = 3
        self._answer = tf.placeholder(tf.float32, [self.batch_size, num_classes], name='answer')

        with tf.variable_scope('mlp'):
            num_hiddens = config['model']['MLP']['num_hiddens']
            l1 = utils.fully_connected(emb, num_hiddens, activation='relu', scope='l1')
            self.pred = utils.fully_connected(l1, num_classes, activation='softmax', scope='softmax')
            self.best_pred = tf.argmax(self.pred, axis=1)

        self.loss = tf.reduce_mean(utils.cross_entropy(self.pred, self._answer))
        self.error = tf.reduce_mean(utils.error(self.pred, self._answer))

        print('Model... Oracle build!')
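# --- Illustrative only: a hypothetical training-step sketch for the oracle above.
# The `oracle`, `sess`, `optimizer` names and the batch dictionary keys are assumptions,
# not taken from the repository; only the placeholder attributes come from the class.
#
# feed_dict = {
#     oracle._question:    batch["question"],      # (batch, max_len) int32 token ids
#     oracle._seq_length:  batch["seq_length"],    # (batch,) true question lengths
#     oracle._category:    batch["category"],      # (batch,) object category ids
#     oracle._spatial:     batch["spatial"],       # (batch, 8) normalized box features
#     oracle._answer:      batch["answer"],        # (batch, 3) one-hot over yes/no/n.a.
#     oracle._is_training: True,
# }
# _, loss, err = sess.run([optimizer, oracle.loss, oracle.error], feed_dict=feed_dict)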
def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
    # AbstractNetwork.__init__(self, "qgen_guesser", device=device)
    ResnetModel.__init__(self, "qgen_guesser", device=device)

    # Create the scope for this graph
    with tf.variable_scope(self.scope_name, reuse=reuse):

        # Batch size is left as None because the training and validation batch sizes differ
        # mini_batch_size = config['batch_size']
        mini_batch_size = None

        self.guesser_loss_weight = tf.constant(config["guesser_loss_weight"],
                                               dtype=tf.float32, name="guesser_loss_weight")
        self.qgen_loss_weight = tf.constant(config["qgen_loss_weight"],
                                            dtype=tf.float32, name="qgen_loss_weight")
        self.loss = 0

        # *********************************************************
        # Placeholders specific to the guesser and their processing
        # *********************************************************

        # Objects
        self.obj_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='obj_mask')
        self.obj_cats = tf.placeholder(tf.int32, [mini_batch_size, None], name='obj_cats')
        self.obj_spats = tf.placeholder(tf.float32, [mini_batch_size, None, config['spat_dim']], name='obj_spats')

        # Targets
        self.targets = tf.placeholder(tf.int32, [mini_batch_size], name="targets_index")

        self.object_cats_emb = utils.get_embedding(
            self.obj_cats,
            config['no_categories'] + 1,
            config['cat_emb_dim'],
            scope='cat_embedding')

        self.objects_input = tf.concat([self.object_cats_emb, self.obj_spats], axis=2)
        self.flat_objects_inp = tf.reshape(self.objects_input,
                                           [-1, config['cat_emb_dim'] + config['spat_dim']])

        with tf.variable_scope('obj_mlp'):
            h1 = utils.fully_connected(self.flat_objects_inp,
                                       n_out=config['obj_mlp_units'],
                                       activation='relu', scope='l1')
            h2 = utils.fully_connected(h1,
                                       n_out=config['no_hidden_final_mlp'],
                                       activation='relu', scope='l2')

        # TODO: object embeddings do not include image features yet
        obj_embs = tf.reshape(h2, [-1, tf.shape(self.obj_cats)[1], config['no_hidden_final_mlp']])

        # *********************************************************
        # Placeholders for the QGen plus the ones shared with the guesser
        # *********************************************************

        # Image
        self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

        # Question
        self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
        self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None],
                                          name='answer_mask')  # 1 if keep and (1 q/a 1) for (START q/a STOP)
        self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
        self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

        # Rewards
        self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')

        # DECODER hidden state (for beam search)
        zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero vector
        zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to build a dynamically-sized zero tensor
        self.decoder_zero_state_c = tf.placeholder_with_default(
            zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
        self.decoder_zero_state_h = tf.placeholder_with_default(
            zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
        decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c,
                                                              h=self.decoder_zero_state_h)

        # Misc
        self.is_training = tf.placeholder(tf.bool, name='is_training')
        self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy")  # used by the sampling graph
        self.samples = None

        # For each position of the dialogue, predict the next token:
        # remove the last token from the input ...
        input_dialogues = self.dialogues[:, :-1]
        input_seq_length = self.seq_length - 1

        # ... and the first token (= start token) from the targets
        rewards = self.cum_rewards[:, 1:]
        target_words = self.dialogues[:, 1:]

        # To understand the padding:
        # input:  <start> is it a blue <?> <yes> is it a car <?> <no> <stop_dialogue>
        # target: is it a blue <?> -     is it a car <?> -    <stop_dialogue> -

        # TODO:
        # 1. Include FiLM in the guesser (check whether FiLM or CBN is used)
        #    - add fine-tuning to the training (see the CLEVR training and config files)
        #    - check how fine-tuning is used (should a pretrained model be given as input?), normalization, etc.
        #    - check in the config file whether the attention belongs inside the image block
        #    - currently the image embedding is just a flattened feature map; consider RCNN or
        #      another region-based method to get image features
        #    - include attention on the image given the dialogue embedding in the guesser part
        #    - include dropout on the LSTM (option for inside and outside) and on the image
        #    - include attention on the words given the image features
        # 2. Use tf.gather to select the LSTM states at <yes>/<no> answers and at the stop dialogue
        # 3. Make the code run; check how the is_training flag behaves

        # Image processing
        with tf.variable_scope('image_feature') as img_scope:
            if len(config["image"]["dim"]) == 1:
                self.image_out = self.images
            else:
                # TODO: create a dedicated config for this attention
                tf.summary.image("image", self.images)

                self.image_out = get_image_features(
                    image=self.images,
                    question=None,
                    is_training=self.is_training,
                    scope_name=img_scope.name,
                    config=config['image'],
                    att=False)

                image_pooling_size = [int(self.image_out.get_shape()[1]),
                                      int(self.image_out.get_shape()[2])]
                image_feature_depth = int(self.image_out.get_shape()[3])

                self.image_out = tf.layers.max_pooling2d(self.image_out,
                                                         image_pooling_size,
                                                         1,
                                                         padding='valid',
                                                         data_format='channels_last',
                                                         name='max_pooling_image_out')
                self.image_out = tf.reshape(self.image_out, [-1, image_feature_depth])

                # self.image_out = get_attention(self.images, None, config["image"]["attention"])
                # TODO: improve by using the previous lstm state?
                # self.image_out = tf.contrib.layers.flatten(self.image_out)

        print(self.image_out)

        # Reduce the embedding size of the image
        with tf.variable_scope('image_embedding'):
            self.image_emb = utils.fully_connected(self.image_out, config['image_embedding_size'])
            image_emb = tf.expand_dims(self.image_emb, 1)
            image_emb = tf.tile(image_emb, [1, tf.shape(input_dialogues)[1], 1])

        # Compute the question embedding
        input_words = utils.get_embedding(
            input_dialogues,
            n_words=num_words,
            n_dim=config['word_embedding_size'],
            scope="word_embedding")

        # Concatenate word and image embeddings
        # TODO: check the size (see whether input_seq_length has to be increased)
        decoder_input = tf.concat([input_words, image_emb], axis=2, name="concat_full_embedding")

        # Encode one word + image per step
        decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
            config['num_lstm_units'],
            layer_norm=False,
            dropout_keep_prob=1.0,
            reuse=reuse)

        # TODO: since the image is concatenated, check whether input_seq_length should be increased by one
        # Decode the states to generate questions
        self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
            cell=decoder_lstm_cell,
            inputs=decoder_input,
            dtype=tf.float32,
            initial_state=decoder_initial_state,
            sequence_length=input_seq_length,
            scope="word_decoder")  # TODO: use multi-layer RNN

        max_sequence = tf.reduce_max(self.seq_length)

        # For the guesser:
        # TODO: optionally add extra LSTM layers (the commented extension below has several
        # default parameters); for now only the hidden state is used, the cell state is ignored
        last_states = self.decoder_state.h
        # last_states, _ = rnn.variable_length_LSTM_extension(
        #     self.decoder_output,
        #     self.decoder_state,
        #     num_hidden=config['num_lstm_units'],
        #     seq_length=input_seq_length)
        last_states = tf.reshape(last_states, [-1, config['num_lstm_units']])

        # TODO: can be moved to utils
        def masked_softmax(scores, mask):
            # subtract max for numerical stability
            scores = scores - tf.tile(tf.reduce_max(scores, axis=(1,), keepdims=True),
                                      [1, tf.shape(scores)[1]])
            # compute padded softmax
            exp_scores = tf.exp(scores)
            exp_scores *= mask
            exp_sum_scores = tf.reduce_sum(exp_scores, axis=1, keepdims=True)
            return exp_scores / tf.tile(exp_sum_scores, [1, tf.shape(exp_scores)[1]])

        # Compute the softmax for evaluation (over all the words of the dialogue)
        with tf.variable_scope('decoder_output'):
            flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
            flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

            # retrieve the batch/dialogue format (ignore the STOP token)
            mlp_output = tf.reshape(flat_mlp_output,
                                    [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])

            self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
            self.argmax_output = tf.argmax(mlp_output, axis=2)

            self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=mlp_output, labels=target_words)

        # Compute the maximum-likelihood loss over the dialogues (valid words only)
        with tf.variable_scope('ml_loss'):
            ml_loss = tf.identity(self.cross_entropy_loss)
            ml_loss *= self.answer_mask[:, 1:]   # remove answers (ignore the <stop> token)
            ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

            # Count the number of unmasked elements
            count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # no_unpad - no_qa - START token

            ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
            ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
            self.ml_loss = ml_loss / count            # normalize

            self.qgen_loss = self.ml_loss
            self.loss += self.qgen_loss_weight * self.qgen_loss
            tf.summary.scalar("qgen_loss", self.qgen_loss)

        # NOTE: in the config file, under the image section, cbn must be set to true
        with tf.variable_scope('guesser_input') as scope:
            # Get the CBN image features
            self.CBN_picture_out = get_image_features(
                image=self.images,
                question=last_states,
                is_training=self.is_training,
                scope_name=scope.name,
                config=config['image'])

            # FiLM the features
            # self.filmed_picture_out = film_layer(ft=self.CBN_picture_out, context=last_states)
            # TODO: make n a hyper-parameter and add it to the network parameters
            self.filmed_picture_out = self.CBN_picture_out
            n = 1
            for i in range(n):
                with tf.variable_scope('film_layer_' + str(i)):
                    self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out,
                                                           context=last_states,
                                                           is_training=self.is_training).get()

            # TODO: add a convolution over the feature maps (before the classifier)
            # Pool over the feature maps
            final_pooling_size = [int(self.filmed_picture_out.get_shape()[1]),
                                  int(self.filmed_picture_out.get_shape()[2])]
            final_feature_depth = int(self.filmed_picture_out.get_shape()[3])

            if str(config["pooling"]).lower() == 'max':
                self.filmed_picture_out = tf.layers.max_pooling2d(
                    self.filmed_picture_out,
                    final_pooling_size,
                    1,
                    padding='valid',
                    data_format='channels_last',
                    name='max_pooling_filmed_picture_out')
            elif str(config["pooling"]).lower() == 'avg':
                self.filmed_picture_out = tf.layers.average_pooling2d(
                    self.filmed_picture_out,
                    final_pooling_size,
                    1,
                    padding='valid',
                    data_format='channels_last',
                    name='average_pooling_filmed_picture_out')
            else:
                print("No pooling defined")
                sys.exit()

            self.filmed_picture_out = tf.reshape(self.filmed_picture_out, [-1, final_feature_depth])

            # Combine the FiLMed image and dialogue features
            activation_name = config["activation"]
            self.question_embedding = utils.fully_connected(last_states,
                                                            config["no_question_mlp"],
                                                            activation=activation_name,
                                                            scope='question_mlp')
            self.picture_embedding = utils.fully_connected(self.filmed_picture_out,
                                                           config["no_picture_mlp"],
                                                           activation=activation_name,
                                                           scope='picture_mlp')
            self.full_embedding = self.picture_embedding * self.question_embedding
            # self.full_embedding = tf.nn.dropout(full_embedding, dropout_keep)
            # self.guesser_out_0 = utils.fully_connected(self.full_embedding, config["no_hidden_prefinal_mlp"],
            #                                            scope='hidden_prefinal', activation=activation_name)
            self.guesser_out_0 = self.full_embedding

            self.guesser_out = utils.fully_connected(self.guesser_out_0,
                                                     config["no_hidden_final_mlp"],
                                                     scope='hidden_final',
                                                     activation=activation_name)
            self.guesser_out = tf.reshape(self.guesser_out, [-1, config["no_hidden_final_mlp"], 1])

        with tf.variable_scope('guesser_output'):
            # TODO: the paper describes a dot product, the reference code uses matmul
            scores = tf.matmul(obj_embs, self.guesser_out)
            scores = tf.reshape(scores, [-1, tf.shape(self.obj_cats)[1]])

            self.softmax = masked_softmax(scores, self.obj_mask)
            self.selected_object = tf.argmax(self.softmax, axis=1)

            self.guesser_error = tf.reduce_mean(utils.error(self.softmax, self.targets))
            self.guesser_loss = tf.reduce_mean(utils.cross_entropy(self.softmax, self.targets))
            self.loss += self.guesser_loss_weight * self.guesser_loss
            tf.summary.scalar("guesser_loss", self.guesser_loss)

        # Compute the policy gradient
        if policy_gradient:
            with tf.variable_scope('rl_baseline'):
                decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)
                flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])
                # Restored: flat_h1 is required by the baseline output layer below
                flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100,
                                                activation='relu', scope='baseline_hidden')
                flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

                self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence - 1])
                self.baseline *= self.answer_mask[:, 1:]
                self.baseline *= self.padding_mask[:, 1:]

            with tf.variable_scope('policy_gradient_loss'):
                # Compute log_prob
                self.log_of_policy = tf.identity(self.cross_entropy_loss)
                self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward; ignore the START token in the mask)
                # No need to apply the padding mask: the discounted reward is already zero once the episode has terminated

                # Policy gradient loss
                rewards *= self.answer_mask[:, 1:]
                self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

                self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

                self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)   # sum over the dialogue trajectory
                self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension

                self.loss = self.policy_gradient_loss

        tf.summary.scalar("total_network_loss", self.loss)
        self.summary = tf.summary.merge_all()

        print('Model... build!')