Example #1
        def step(prev_state_c, prev_state_h, tokens, seq_length,
                 stop_indicator):
            input = tf.gather(tokens, tf.shape(tokens)[0] - 1)  # last generated token

            # Check whether the last token is a stop token (end of question or end of dialogue)
            is_stop_token = tf.equal(input, stop_token)
            is_stop_dialogue_token = tf.equal(input, stop_dialogue_token)
            is_stop = tf.logical_or(is_stop_token, is_stop_dialogue_token)
            stop_indicator = tf.logical_or(
                stop_indicator, is_stop)  # flag sequences that have just finished

            # increment seq_length when the dialogue is not over
            seq_length = tf.where(stop_indicator, seq_length,
                                  tf.add(seq_length, 1))

            # compute the next word (TODO: factorize with the qgen code)
            with tf.variable_scope(self.scope_name, reuse=True):
                word_emb = utils.get_embedding(
                    input,
                    n_words=tokenizer.no_words,
                    n_dim=config['word_embedding_size'],
                    scope="word_embedding",
                    reuse=True)

                inp_emb = tf.concat([word_emb, self.image_emb], axis=1)
                with tf.variable_scope("word_decoder"):
                    lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                        config['num_lstm_units'],
                        layer_norm=False,
                        dropout_keep_prob=1.0,
                        reuse=True)

                    state = tf.contrib.rnn.LSTMStateTuple(c=prev_state_c,
                                                          h=prev_state_h)
                    out, state = lstm_cell(inp_emb, state)

                    # store/update the state when the dialogue is not finished (after sampling the <?> token)
                    cond = tf.greater_equal(
                        seq_length, tf.subtract(tf.reduce_max(seq_length), 1))
                    state_c = tf.where(cond, state.c, prev_state_c)
                    state_h = tf.where(cond, state.h, prev_state_h)

                with tf.variable_scope('decoder_output'):
                    output = utils.fully_connected(state_h,
                                                   tokenizer.no_words,
                                                   reuse=True)

                    sampled_tokens = tf.cond(
                        self.greedy, lambda: tf.argmax(output, 1),
                        lambda: tf.reshape(tf.multinomial(output, 1), [-1]))
                    sampled_tokens = tf.cast(sampled_tokens, tf.int32)

            tokens = tf.concat(
                [tokens, tf.expand_dims(sampled_tokens, 0)],
                axis=0)  # append the sampled token along the time axis

            return state_c, state_h, tokens, seq_length, stop_indicator
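A step body like this is typically driven by tf.while_loop so that sampling continues until every sequence in the batch has emitted a stop token. Below is a minimal sketch of such a driver in the same TF 1.x style; the names drive_sampling, keep_going and max_steps are illustrative assumptions, and step stands for a closure with the signature shown above, not the project's actual sampling loop.

import tensorflow as tf

def drive_sampling(step, init_state_c, init_state_h, start_tokens, max_steps=12):
    # start_tokens has shape [time, batch], matching the gather/concat in step
    batch_size = tf.shape(start_tokens)[1]
    seq_length = tf.zeros([batch_size], dtype=tf.int32)
    stop_indicator = tf.zeros([batch_size], dtype=tf.bool)

    def keep_going(state_c, state_h, tokens, seq_length, stop_indicator):
        # loop while at least one sequence is unfinished and we are under max_steps
        return tf.logical_and(
            tf.logical_not(tf.reduce_all(stop_indicator)),
            tf.less(tf.shape(tokens)[0], max_steps))

    return tf.while_loop(
        keep_going, step,
        loop_vars=[init_state_c, init_state_h, start_tokens,
                   seq_length, stop_indicator],
        # tokens grows along the time axis, so its shape must stay unconstrained
        shape_invariants=[init_state_c.get_shape(),
                          init_state_h.get_shape(),
                          tf.TensorShape([None, None]),
                          seq_length.get_shape(),
                          stop_indicator.get_shape()])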
Example #2
    def __init__(self, config, no_words, no_answers, reuse=False, device=''):
        ResnetModel.__init__(self, "vqa", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse) as scope:

            self.batch_size = None

            #####################
            #   QUESTION
            #####################

            self._question = tf.placeholder(tf.int32, [self.batch_size, None],
                                            name='question')
            self._seq_length = tf.placeholder(tf.int32, [self.batch_size],
                                              name='seq_length')
            self._answer_count = tf.placeholder(tf.float32,
                                                [self.batch_size, no_answers],
                                                name='answer_count')

            self._is_training = tf.placeholder(tf.bool, name="is_training")

            dropout_keep = float(config.get("dropout_keep_prob", 1.0))
            dropout_keep = tf.cond(self._is_training,
                                   lambda: tf.constant(dropout_keep),
                                   lambda: tf.constant(1.0))

            word_emb = utils.get_embedding(self._question,
                                           n_words=no_words,
                                           n_dim=int(
                                               config["word_embedding_dim"]),
                                           scope="word_embedding",
                                           reuse=reuse)

            if config['glove']:
                self._glove = tf.placeholder(tf.float32, [None, None, 300],
                                             name="glove")
                word_emb = tf.concat([word_emb, self._glove], axis=2)

            self.question_lstm, self.all_lstm_states = rnn.variable_length_LSTM(
                word_emb,
                num_hidden=int(config["no_hidden_LSTM"]),
                dropout_keep_prob=dropout_keep,
                seq_length=self._seq_length,
                depth=int(config["no_LSTM_cell"]),
                scope="question_lstm",
                reuse=reuse)

            #####################
            #   IMAGES
            #####################

            self._image = tf.placeholder(tf.float32, [self.batch_size] +
                                         config['image']["dim"],
                                         name='image')
            self.image_out = get_image_features(image=self._image,
                                                question=self.question_lstm,
                                                is_training=self._is_training,
                                                scope_name=scope.name,
                                                config=config['image'],
                                                dropout_keep=dropout_keep)

            #####################
            #   COMBINE
            #####################
            activation_name = config["activation"]
            with tf.variable_scope('final_mlp'):

                self.question_embedding = utils.fully_connected(
                    self.question_lstm,
                    config["no_question_mlp"],
                    activation=activation_name,
                    scope='question_mlp')
                self.image_embedding = utils.fully_connected(
                    self.image_out,
                    config["no_image_mlp"],
                    activation=activation_name,
                    scope='image_mlp')

                full_embedding = self.image_embedding * self.question_embedding
                full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

                out = utils.fully_connected(full_embedding,
                                            config["no_hidden_final_mlp"],
                                            scope='layer1',
                                            activation=activation_name)
                out = tf.nn.dropout(out, dropout_keep)
                out = utils.fully_connected(out,
                                            no_answers,
                                            activation='linear',
                                            scope='layer2')

            # soft targets: clip per-answer counts at 3 and normalize them into a distribution
            answer_count = tf.minimum(self._answer_count, 3)

            normalizing_sum = tf.maximum(
                1.0, tf.reduce_sum(answer_count, 1, keep_dims=True))
            self.answer_prob = answer_count / normalizing_sum
            self.soft_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=out, labels=self.answer_prob, name='soft_cross_entropy')
            self.soft_loss = self.soft_cross_entropy

            self.target_answer = tf.argmax(self._answer_count, axis=1)
            # unmorm_log_prob = tf.log(self._answer_count)
            # self.target_answer = tf.multinomial(unmorm_log_prob, num_samples=1)
            # self.target_answer = tf.reshape(self.target_answer, shape=[-1])

            self.hard_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=out,
                labels=self.target_answer,
                name='hard_cross_entropy')
            self.hard_loss = self.hard_cross_entropy

            if config['loss'] == 'soft':
                self.loss = self.soft_loss
            else:
                self.loss = self.hard_loss

            self.loss = tf.reduce_mean(self.loss)
            self.softmax = tf.nn.softmax(out, name='answer_prob')
            self.prediction = tf.argmax(
                out, axis=1,
                name='predicted_answer')  # no need to compute the softmax

            with tf.variable_scope('accuracy'):
                ind = tf.range(tf.shape(
                    self.prediction)[0]) * no_answers + tf.cast(
                        self.prediction, tf.int32)
                pred_count = tf.gather(tf.reshape(self._answer_count, [-1]),
                                       ind)
                self.extended_accuracy = tf.minimum(pred_count / 3.0,
                                                    1.0,
                                                    name="extended_accuracy")
                self.accuracy = tf.reduce_mean(self.extended_accuracy)

            tf.summary.scalar('soft_loss', self.soft_loss)
            tf.summary.scalar('hard_loss', self.hard_loss)
            tf.summary.scalar('accuracy', self.accuracy)

            print('Model... build!')
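The loss and accuracy blocks above follow the standard VQA evaluation convention: per-answer annotator counts are clipped at 3 and normalized into soft targets for the cross-entropy, and a prediction earns min(count / 3, 1) accuracy. A small self-contained NumPy sketch of that arithmetic with toy numbers (not the project's data pipeline):

import numpy as np

answer_count = np.array([[0., 4., 1.],   # 4 annotators chose answer 1, one chose answer 2
                         [2., 0., 0.]])  # 2 annotators chose answer 0

clipped = np.minimum(answer_count, 3)
normalizer = np.maximum(1.0, clipped.sum(axis=1, keepdims=True))
answer_prob = clipped / normalizer       # soft targets fed to softmax_cross_entropy_with_logits
print(answer_prob)                       # [[0.   0.75 0.25], [1. 0. 0.]]

prediction = np.array([1, 2])            # predicted answer index per example
pred_count = answer_count[np.arange(len(prediction)), prediction]
extended_accuracy = np.minimum(pred_count / 3.0, 1.0)
print(extended_accuracy)                 # [1. 0.] -> answer 1 gets full credit, answer 2 gets none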
Example #3
    def __init__(self, config, num_words_question, num_words_description=None, device='', reuse=False):
        ResnetModel.__init__(self, "oracle", device=device)

        with open("data/dict_word_embedding_{}_{}.pickle".format("fasttext",config["model"]["question"]["embedding_type"]),"rb") as f:
            dict_all_embedding = pickle.load(f)


        with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
            embeddings = []
            co_attention = [None, None, None, None]  # [question states, caption, history states, image/crop features]
            self.batch_size = None
            max_seq_length = 12
            

            # QUESTION
            if config['inputs']['question']:
                self._is_training = tf.placeholder(tf.bool, name="is_training")
                # self._question_word = tf.placeholder(tf.int32, [self.batch_size], name='question_word') # 
                self._question = tf.placeholder(tf.int32, [self.batch_size, 14], name='question')
                self.seq_length_question = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question')

                if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
            
                    print("****** WITH EMBEDDING ******")
                    word_emb = utils.get_embedding(self._question,
                                                n_words=num_words_question,
                                                n_dim=int(config["model"]["word_embedding_dim"]),
                                                scope="word_embedding",
                                                dict_all_embedding=dict_all_embedding)
                else:
                    print("****** NOT EMBEDDING ******")


                    word_emb = utils.get_embedding(self._question,
                                                n_words=num_words_question,
                                                n_dim=int(config["model"]["word_embedding_dim"]),
                                                scope="word_embedding",
                                                dict_all_embedding=[])

                    print(".... word_emb 1 = {} ".format(word_emb))                            

                self.out_question = None

                if config['model']['question']['lstm']:
                    self.lstm_states_question, self.lstm_all_state_ques = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_question)

                    self.out_question =  self.lstm_all_state_ques

                    # print("out_queston = {} ".format(self.lstm_states_question))
                    # exit()
                    # self.out_question = tf.reshape(self.out_question,[-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2] ])


                else:
                    self.out_question = word_emb

                if config["model"]["attention"]["co-attention"]:
                    co_attention[0] = self.out_question     # Tensor("oracle/lstm/lstmcell0/concat:0", shape=(?, 14, 1024), dtype=float32)
                    embeddings.append(self.lstm_states_question)
                    # print("question_lstm = {} ".format(self.out_question ))

                    # exit()
                else:
                    embeddings.append(self.lstm_states_question)

                # QUESTION-Pos
                if config['model']['question']['pos']:
                    print("----------------------------------------")
                    print("**** Oracle_network |  input = question-pos ")

                    self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='question_pos')
                    self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_ques_pos')
                    word_emb = utils.get_embedding(self._question_pos,
                                                n_words=num_words_question,
                                                n_dim=100,
                                                scope="word_embedding_pos")

                    if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                        self._glove = tf.placeholder(tf.float32, [None, None,int(config["model"]["word_embedding_dim"])], name="embedding_vector_ques_pos")
                        word_emb = tf.concat([word_emb, self._glove], axis=2)

                    else:
                        print("None ****************")
                    

                    lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_pos,scope="lstm2")
                    


                    # embeddings.append(lstm_states)

            # DESCRIPTION
            if config['inputs']['description']:
                print("****  Oracle_network |  input = Description ")

                self._description = tf.placeholder(tf.int32, [self.batch_size, None], name='description')
                self.seq_length_description = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_description')

                word_emb = utils.get_embedding(self._description,
                                            n_words=num_words_question,
                                            n_dim=100,
                                            reuse=True,
                                            scope="word_embedding")

                # print("word_emb = {} ".format(word_emb))

                if config['model']['question']['lstm']:
                    self.lstm_states_des, self.lstm_all_state_des = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_description,scope="lstm3")




                    self.out_question =  self.lstm_states_des

                    # print("self.out_question_emb = {} ".format(self.out_question))  

                    # self.out_question = tf.reshape(self.out_question,[-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2] ])
                else:
                    self.out_question = word_emb
                    # print("self.out_question = {} ".format(self.out_question)) 


                if config["model"]["attention"]["co-attention"]:
                    # co_attention[1] = self.out_question     # embeddings.append(self.lstm_all_state_ques)
                    embeddings.append(self.lstm_states_des)

                else:
                    embeddings.append(self.lstm_states_des)
                
            if config['inputs']['history_question']:
                
                placeholders_lstmQuestion = []
                placeholders_lstmLength = []

               
                for i in range(6):
                    self._embWord = tf.placeholder(tf.int32, [self.batch_size, 14], name="ques_hist_H{}".format(i))

                    self.seq_length_question_history = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question_history_H{}'.format(i))

                    self.word_emb = utils.get_embedding(self._embWord,
                                        n_words=num_words_question,
                                        n_dim=100,
                                        reuse=True,
                                        scope="word_embedding")

                    placeholders_lstmQuestion.append(self.word_emb)
                    placeholders_lstmLength.append(self.seq_length_question_history)

            
                self.lstm_states, self.lstm_all_state_ques_hist = rnn.variable_length_LSTM(placeholders_lstmQuestion,
                                                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                seq_length=placeholders_lstmLength,scope="lstm4",dim_4=True)


                if config["model"]["attention"]["co-attention"]:
                    co_attention[2] = self.lstm_states
                else:
                    embeddings.append(self.lstm_states)

                


             # Description-Pos

                if config['model']['description']['pos']:
                    print("----------------------------------------")
                    print("**** Oracle_network |  input = description-pos ")

                    self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='des_pos')
                    self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_des_pos')

                    word_emb = utils.get_embedding(self._question_pos,
                                                n_words=num_words_question,
                                                n_dim=300,
                                                scope="word_embedding_pos")

                    if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                        self._glove = tf.placeholder(tf.float32, [None, None, int(config["model"]["word_embedding_dim"])], name="embedding_vector_des_pos")
                        word_emb = tf.concat([word_emb, self._glove], axis=2)
                    else:
                        print("None ****************")
                    
                    lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_pos,scope="lstm5")
                    

                    # embeddings.append(lstm_states)

            # CATEGORY
            if config['inputs']['category']:
                print("****  Oracle_network |  input = category ")
                

                if config["model"]["category"]["use_embedding"]:
                    self._category = tf.placeholder(tf.float32, [self.batch_size,int(config["model"]["word_embedding_dim"])], name='category')
                    cat_emb = self._category                    
                    # cat_emb = utils.get_embedding(self._category,
                    #                               int(config['model']['category']["n_categories"]) + 1,
                    #                               n_dim=int(config["model"]["word_embedding_dim"]),
                    #                               scope="cat_embedding",
                    #                               dict_all_embedding=dict_all_embedding
                    #                              )
                else:
                    self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')
                    cat_emb = utils.get_embedding(
                        self._category,
                        int(config['model']['category']["n_categories"]) + 1,  # add the unknown category
                        int(config["model"]["word_embedding_dim"]),
                        scope="cat_embedding")



                # cat_emb = tf.expand_dims(cat_emb,1)
                embeddings.append(cat_emb)
                print("Input: Category")


            # ALLCATEGORY
            if config['inputs']['allcategory']:
                print("**** Oracle_network |  input = allcategory ")

        
                
                self._allcategory = tf.placeholder(tf.float32, [self.batch_size,90], name='allcategory')
                # self.seq_length_allcategory = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_allcategory')

                # word_emb = utils.get_embedding(self._allcategory,
                #                             n_words=int(config['model']['category']["n_categories"]) + 1,
                #                             n_dim=int(config['model']['description']["embedding_dim"]),
                #                             scope="word_embedding_allcategory")

                
                #print(" SeqDescription = ",self.seq_length_description)
                # lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                #                                     num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                #                                     seq_length=self.seq_length_allcategory,scope="lstm3")
                
                print(" Oracle_network | embdedding all_cate=",word_emb)
                # embeddings.append(self._allcategory)
                print("Input: allcategory")

                
            # SPATIAL
            if config['inputs']['spatial']:
                print("****  Oracle_network |  input = spatial ")
                self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')
                embeddings.append(self._spatial)
                print("Input: Spatial")


            # IMAGE
            if config['inputs']['image']:
                print("****  Oracle_network |  input = image ")

                self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
                self._image = tf.placeholder(tf.float32, [self.batch_size] + config['model']['image']["dim"], name='image')
                # self.image_out = tf.reshape(self._image,shpe=[224*224*3])
                # print("question = {} ".format(self.lstm_states_question))
                # exit()

                self.image_out = get_image_features(
                    image=self._image, question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Image/",
                    config=config['model']['image']
                )
                # embeddings.append(self.image_out)
                print("Input: Image")
                co_attention[3]  = self.image_out
                print(" -- image_int ={}".format(self._image))
                # exit()
                image_feature = tf.reshape(self.image_out, shape=[-1, (7 * 7) * 2048])  # flatten the ?x7x7x2048 feature map into a ?x(49*2048) vector
                embeddings.append(image_feature)
                # print("... Image Features = {}".format(self.image_out))



            # CROP
            if config['inputs']['crop']:
                print("****  Oracle_network |  input = crop ")
                self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
                # self._crop_id = tf.placeholder(tf.float32, [self.batch_size], name='crop_id')

                self._crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop')
                
                
                
                if config["model"]["attention"]["co-attention"]:
                    self.crop_out = get_image_features(
                    image=self._crop, question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    config=config["model"]['crop'])
                    co_attention[3] = self.crop_out

                else:
                    self.crop_out = get_image_features(
                    image=self._crop, question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    co_attention=False,
                    config=config["model"]['crop'])

                    embeddings.append(self.crop_out)



            if config["model"]["crop"]["segment_crop"]["use"]:
                all_segment_crop = []
                # for i in range(10):
                self._segment_crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop_segment')
                

                self.crop_out = get_image_features(
                                image=self._segment_crop, question=self.lstm_states_question,
                                is_training=self._is_training,
                                scope_name="test",
                                scope_feature="Segment/",
                                config=config["model"]['crop'])


                print("self.crop_out = {} ".format(self.crop_out))


                

                        # all_segment_crop.add(self.crop_out)

                
                # print("-- crop = {},image_features = {} ".format(self.crop_out, image_feature))
                # exit()

            if config["model"]["attention"]["co-attention"]:
                question_feature,history_feature , image_feature = compute_all_attention(question_states=co_attention[0],
                                                                                caption=co_attention[1],
                                                                                history_states=co_attention[2],
                                                                                image_feature=co_attention[3],
                                                                                no_mlp_units=config['model']['attention']['no_attention_mlp'],
                                                                                config = config
                                                                                )


                embeddings.append(history_feature)
                embeddings.append(question_feature)
                embeddings.append(image_feature)

            # embeddings.append(question_feature)
            print("*** All Embedding = ",embeddings)
            self.emb = tf.concat(embeddings, axis=1)
            
            print("*** self.emb = ",self.emb)
            


            # Compute the final embedding
            # print("---------- Embeddings=",embeddings)
            # self.emb = tf.concat(embeddings, axis=1)
            
        

            # OUTPUT
            num_classes = 3
            self._answer = tf.placeholder(tf.float32, [self.batch_size, num_classes], name='answer')



            with tf.variable_scope('mlp'):
                num_hiddens = config['model']['MLP']['num_hiddens']
                # emb = tf.print(emb, [emb], "input: ")
                l1 = utils.fully_connected(self.emb, num_hiddens, activation='relu', scope='l1')
                self.pred = utils.fully_connected(l1, num_classes, activation='softmax', scope='softmax')
                self.best_pred = tf.argmax(self.pred, axis=1)
            # self.best_pred = tf.reduce_mean(self.best_pred)

            print("--- predict = {} ,answer = {} ".format(self.pred,self._answer))
            # exit()
            # self.loss = None
        
            self.loss = tf.reduce_mean(utils.cross_entropy(self.pred, self._answer))
            self.error = tf.reduce_mean(utils.error(self.pred, self._answer))

            print("loss = {} ,error = {} ".format(self.loss,self.error))
            print('Model... Oracle build!')
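Stripped of the optional inputs and debug prints, the classifier head of this Oracle follows a simple pattern: each enabled input is encoded into a per-example feature vector, the vectors are concatenated along the feature axis, and a one-hidden-layer MLP predicts the three answer classes (yes / no / n.a.). A minimal hedged sketch of that pattern; oracle_head, the layer sizes and tf.layers.dense are illustrative, not the project's utils.fully_connected.

import tensorflow as tf

def oracle_head(feature_list, num_hiddens=512, num_classes=3):
    # feature_list: tensors of shape [batch, d_i]; concatenate along the feature axis
    emb = tf.concat(feature_list, axis=1)
    hidden = tf.layers.dense(emb, num_hiddens, activation=tf.nn.relu)
    logits = tf.layers.dense(hidden, num_classes)
    pred = tf.nn.softmax(logits)             # per-class probabilities
    best_pred = tf.argmax(logits, axis=1)    # predicted answer index
    return pred, best_pred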
Example #4
    def __init__(self, config, num_words, device='', reuse=False):
        ResnetModel.__init__(self, "oracle", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
            embeddings = []
            self.batch_size = None

            # QUESTION
            self._is_training = tf.placeholder(tf.bool, name="is_training")
            self._question = tf.placeholder(tf.int32, [self.batch_size, None],
                                            name='question')
            self._seq_length = tf.placeholder(tf.int32, [self.batch_size],
                                              name='seq_length')

            word_emb = utils.get_embedding(
                self._question,
                n_words=num_words,
                n_dim=int(config['model']['question']["embedding_dim"]),
                scope="word_embedding")

            lstm_states, _ = rnn.variable_length_LSTM(
                word_emb,
                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                seq_length=self._seq_length)
            embeddings.append(lstm_states)

            # CATEGORY
            if config['inputs']['category']:
                self._category = tf.placeholder(tf.int32, [self.batch_size],
                                                name='category')

                cat_emb = utils.get_embedding(
                    self._category,
                    int(config['model']['category']["n_categories"]) + 1,  # add the unknown category
                    int(config['model']['category']["embedding_dim"]),
                    scope="cat_embedding")
                embeddings.append(cat_emb)
                print("Input: Category")

            # SPATIAL
            if config['inputs']['spatial']:
                self._spatial = tf.placeholder(tf.float32,
                                               [self.batch_size, 8],
                                               name='spatial')
                embeddings.append(self._spatial)
                print("Input: Spatial")

            # IMAGE
            if config['inputs']['image']:
                self._image = tf.placeholder(tf.float32, [self.batch_size] +
                                             config['model']['image']["dim"],
                                             name='image')
                self.image_out = get_image_features(
                    image=self._image,
                    question=lstm_states,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    config=config['model']['image'])
                embeddings.append(self.image_out)
                print("Input: Image")

            # CROP
            if config['inputs']['crop']:
                self._crop = tf.placeholder(tf.float32, [self.batch_size] +
                                            config['model']['crop']["dim"],
                                            name='crop')
                self.crop_out = get_image_features(
                    image=self._crop,
                    question=lstm_states,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    config=config["model"]['crop'])

                embeddings.append(self.crop_out)
                print("Input: Crop")

            # Compute the final embedding
            emb = tf.concat(embeddings, axis=1)

            # OUTPUT
            num_classes = 3
            self._answer = tf.placeholder(tf.float32,
                                          [self.batch_size, num_classes],
                                          name='answer')

            with tf.variable_scope('mlp'):
                num_hiddens = config['model']['MLP']['num_hiddens']
                l1 = utils.fully_connected(emb,
                                           num_hiddens,
                                           activation='relu',
                                           scope='l1')

                self.pred = utils.fully_connected(l1,
                                                  num_classes,
                                                  activation='softmax',
                                                  scope='softmax')
                self.best_pred = tf.argmax(self.pred, axis=1)

            self.loss = tf.reduce_mean(
                utils.cross_entropy(self.pred, self._answer))
            self.error = tf.reduce_mean(utils.error(self.pred, self._answer))

            print('Model... Oracle build!')
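The 8-dimensional spatial placeholder is fed with bounding-box features of the target object. A hedged NumPy sketch of one common encoding for this kind of oracle (the exact layout used by the project's preprocessing is an assumption here): [x_min, y_min, x_max, y_max, x_center, y_center, w_box, h_box], with image coordinates rescaled to [-1, 1].

import numpy as np

def spatial_features(x, y, w, h, img_w, img_h):
    # (x, y, w, h): bounding box in pixels; (img_w, img_h): image size in pixels
    x_min = 2.0 * x / img_w - 1.0
    y_min = 2.0 * y / img_h - 1.0
    x_max = 2.0 * (x + w) / img_w - 1.0
    y_max = 2.0 * (y + h) / img_h - 1.0
    x_center = (x_min + x_max) / 2.0
    y_center = (y_min + y_max) / 2.0
    w_box = x_max - x_min
    h_box = y_max - y_min
    return np.array([x_min, y_min, x_max, y_max,
                     x_center, y_center, w_box, h_box], dtype=np.float32)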
Example #5
	def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
		# AbstractNetwork.__init__(self, "qgen_guesser", device=device)
		ResnetModel.__init__(self, "qgen_guesser", device=device)

		# Create the scope for this graph
		with tf.variable_scope(self.scope_name, reuse=reuse):

			# The batch size is set to None because the validation and training sets use different batch sizes
			# mini_batch_size = None
			# mini_batch_size = config['batch_size']
			mini_batch_size = None
			self.guesser_loss_weight = tf.constant(config["guesser_loss_weight"], dtype = tf.float32, name = "guesser_loss_weight")
			self.qgen_loss_weight = tf.constant(config["qgen_loss_weight"], dtype = tf.float32, name = "qgen_loss_weight")
			self.loss = 0    
			# *********************************************************
			# Placeholders specific for guesser and its processing
			# *********************************************************
			
			# Objects
			self.obj_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='obj_mask')
			self.obj_cats = tf.placeholder(tf.int32, [mini_batch_size, None], name='obj_cats')
			self.obj_spats = tf.placeholder(tf.float32, [mini_batch_size, None, config['spat_dim']], name='obj_spats')

			# Targets
			self.targets = tf.placeholder(tf.int32, [mini_batch_size], name="targets_index")

			self.object_cats_emb = utils.get_embedding(
				self.obj_cats,
				config['no_categories'] + 1,
				config['cat_emb_dim'],
				scope='cat_embedding')

			self.objects_input = tf.concat([self.object_cats_emb, self.obj_spats], axis=2)
			self.flat_objects_inp = tf.reshape(self.objects_input, [-1, config['cat_emb_dim'] + config['spat_dim']])

			with tf.variable_scope('obj_mlp'):
				h1 = utils.fully_connected(
					self.flat_objects_inp,
					n_out=config['obj_mlp_units'],
					activation='relu',
					scope='l1')
				h2 = utils.fully_connected(
					h1,
					n_out=config['no_hidden_final_mlp'],
					activation='relu',
					scope='l2')
			# print 
			# print 
			# print h2
			# TODO: Object Embeddings do not have image features right now
			obj_embs = tf.reshape(h2, [-1, tf.shape(self.obj_cats)[1], config['no_hidden_final_mlp']])


			# *********************************************************
			# Placeholders for Qgen and common placeholder for guesser and its processing
			# *********************************************************

			# Image
			self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

			# Question
			self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
			self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='answer_mask')  # 1 if keep and (1 q/a 1) for (START q/a STOP)
			self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
			self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

			# Rewards
			self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')


			# DECODER Hidden state (for beam search)
			zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero-vector
			zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to build a zero state with a dynamic batch size

			self.decoder_zero_state_c = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
			self.decoder_zero_state_h = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
			decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)

			# *******
			# Misc
			# *******
			
			self.is_training = tf.placeholder(tf.bool, name='is_training')
			self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy") # use for graph
			self.samples = None

			# For each length of the answer, we are finding the next token

			# remove last token

			input_dialogues = self.dialogues[:, :-1]
			input_seq_length = self.seq_length - 1

			# remove first token(=start token)
			rewards = self.cum_rewards[:, 1:]
			target_words = self.dialogues[:, 1:]

			# to understand the padding:
			# input
			#   <start>  is   it   a    blue   <?>   <yes>   is   it  a    car  <?>   <no>   <stop_dialogue>
			# target
			#    is      it   a   blue   <?>    -      is    it   a   car  <?>   -   <stop_dialogue>  -

			# TODO:

			# 1. Include FiLM in the guesser (check whether film or cbn is being used)
			# Add finetuning to the training (see the training and config files of CLEVR)
				# Check the use of finetuning (should we input a pretrained model?), normalization, etc.
				# See in the config file whether the attention has to go inside the image block
			# As of now, in the first part where we get the image embedding, we only flatten the image. Use RCNN or another method to get the image features.
			# Include attention on the image given the dialogue embedding in the guesser part
			# Include dropout on the lstm (option for inside and outside) and on the image
			# Include attention on words given the image features
			# 2. Use tf.gather and use all the lstm states where there was a yes or no (-) in the target, and the stop dialogue
			# 3. Make the code run
			# Check how the is_training flag works
			
			# image processing
			with tf.variable_scope('image_feature') as img_scope:

				if len(config["image"]["dim"]) == 1:
					self.image_out = self.images
				else:
					# TODO: Create a different config for this attention
					# Putting images
					tf.summary.image("image", self.images)
					self.image_out = get_image_features(
						image=self.images, question = None,
						is_training=self.is_training,
						scope_name=img_scope.name,
						config=config['image'],
						att = False
					)
					
					image_pooling_size = [int((self.image_out).get_shape()[1]), int((self.image_out).get_shape()[2])]
					image_feature_depth = int((self.image_out).get_shape()[3])

					self.image_out = tf.layers.max_pooling2d(self.image_out,
																		image_pooling_size,
																		1,
																		padding='valid',
																		data_format='channels_last',
																		name='max_pooling_image_out')
					self.image_out = tf.reshape(self.image_out, [-1, image_feature_depth])

					# self.filmed_picture_out = tf.layers.average_pooling2d(	self.filmed_picture_out,
					# 														final_pooling_size,
					# 														1,
					# 														padding='valid',
					# 														data_format='channels_last',
					# 														name='average_pooling_filmed_picture_out')

					# self.image_out = get_attention(self.images, None, config["image"]["attention"]) #TODO: improve by using the previous lstm state?
					# self.image_out = tf.contrib.layers.flatten(self.image_out)

				print(self.image_out)


				# Reduce the embedding size of the image
				with tf.variable_scope('image_embedding'):
					self.image_emb = utils.fully_connected(self.image_out,
														   config['image_embedding_size'])
					image_emb = tf.expand_dims(self.image_emb, 1)
					image_emb = tf.tile(image_emb, [1, tf.shape(input_dialogues)[1], 1])

			# Compute the question embedding

			input_words = utils.get_embedding(
				input_dialogues,
				n_words=num_words,
				n_dim=config['word_embedding_size'],
				scope="word_embedding")

			# concat word embedding and image embedding
			# TODO: Check the size (see if input_seq_length is increased or not)
			decoder_input = tf.concat([input_words, image_emb], axis=2, name="concat_full_embedding")

			# encode one word+image
			decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
					config['num_lstm_units'],
					layer_norm=False,
					dropout_keep_prob=1.0,
					reuse=reuse)

			# TODO: Since we have concatenated the image, check whether input_seq_length should be increased by one
			# Decoding the states to generate questions
			self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
				cell=decoder_lstm_cell,
				inputs=decoder_input,
				dtype=tf.float32,
				initial_state=decoder_initial_state,
				sequence_length=input_seq_length,
				scope="word_decoder")  # TODO: use multi-layer RNN

			max_sequence = tf.reduce_max(self.seq_length)

			# For the Guesser

			# Adding extra layers of LSTM
			# TODO: There are several default parameters in the function; try using them.
			# TODO: as of now, not using it.
			# TODO: as of now only using the hidden state; the cell state could be included too
			last_states = self.decoder_state.h
			# last_states, _ = rnn.variable_length_LSTM_extension(
			#     self.decoder_output,
			#     self.decoder_state,
			#     num_hidden = config['num_lstm_units'],
			#     seq_length = input_seq_length
			#     )

			last_states = tf.reshape(last_states, [-1, config['num_lstm_units']])


			# TODO: Can be moved to utils  
			def masked_softmax(scores, mask):
				# subtract max for stability
				scores = scores - tf.tile(tf.reduce_max(scores, axis=(1,), keepdims=True), [1, tf.shape(scores)[1]])
				# compute padded softmax
				exp_scores = tf.exp(scores)
				exp_scores *= mask
				exp_sum_scores = tf.reduce_sum(exp_scores, axis=1, keepdims=True)
				return exp_scores / tf.tile(exp_sum_scores, [1, tf.shape(exp_scores)[1]])


			# compute the softmax for evaluation (on all the words on dialogue)
			with tf.variable_scope('decoder_output'):
				flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
				flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

				# retrieve the batch/dialogue format
				mlp_output = tf.reshape(flat_mlp_output, [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])  # ignore the STOP token

				self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
				self.argmax_output = tf.argmax(mlp_output, axis=2)
				self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mlp_output, labels=target_words)

			# compute the maximum likelihood loss for the dialogues (for valid words)
			with tf.variable_scope('ml_loss'):

				ml_loss = tf.identity(self.cross_entropy_loss)
				ml_loss *= self.answer_mask[:, 1:]  # remove answers (ignore the <stop> token)
				ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

				# Count number of unmask elements
				count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # no_unpad - no_qa - START token

				ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
				ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
				self.ml_loss = ml_loss / count  # Normalize

				self.qgen_loss = self.ml_loss
				self.loss += self.qgen_loss_weight * self.qgen_loss
				tf.summary.scalar("qgen_loss", self.qgen_loss)

			# NOTE (important): in the config file, under the image section, cbn must be set to true

			with tf.variable_scope('guesser_input') as scope:
				
				# Getting the CBN image features
				self.CBN_picture_out = get_image_features(
					image=self.images, question = last_states,
					is_training=self.is_training,
					scope_name=scope.name,
					config=config['image']
				)

				# FILMING the Features

				# self.filmed_picture_out = film_layer(ft=self.CBN_picture_out, context=last_states)
				# TODO: Make n a hyperparameter and add it to network parameters
				self.filmed_picture_out = self.CBN_picture_out
				n = 1
				for i in range(n):
					with tf.variable_scope('film_layer_' + str(i)):
						self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out, context=last_states, is_training=self.is_training).get()
						# self.filmed_picture_out_3 = FiLMResblock(features=self.filmed_picture_out_2, context=last_states, is_training=self.is_training).get()
						
				# self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out_3, context=last_states, is_training=self.is_training).get()                
				# self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out, context=last_states, is_training=self.is_training).get()                
				# self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out, context=last_states, is_training=self.is_training).get()                
				
				# self.filmed_picture_out = FiLMResblock(features=self.CBN_picture_out, context=last_states, is_training=self.is_training).get()

				# self.filmed_picture_out = film_layer(features=self.CBN_picture_out, context=last_states)


				# TODO: Doing a convolution over the feature maps (before the classifier)
				# Do a max pooling over the feature maps

				final_pooling_size = [int((self.filmed_picture_out).get_shape()[1]), int((self.filmed_picture_out).get_shape()[2])]
				final_feature_depth = int((self.filmed_picture_out).get_shape()[3])

				if str(config["pooling"]).lower() == 'max':
					self.filmed_picture_out = tf.layers.max_pooling2d(	self.filmed_picture_out,
																		final_pooling_size,
																		1,
																		padding='valid',
																		data_format='channels_last',
																		name='max_pooling_filmed_picture_out')

				elif str(config["pooling"]).lower() == 'avg':
					self.filmed_picture_out = tf.layers.average_pooling2d(	self.filmed_picture_out,
																			final_pooling_size,
																			1,
																			padding='valid',
																			data_format='channels_last',
																			name='average_pooling_filmed_picture_out')
				else:
					print "No Pooling defined"
					sys.exit()
				self.filmed_picture_out = tf.reshape(self.filmed_picture_out, [-1, final_feature_depth])
			
				# Combining filmed image and dialog features into one
				#####################

				activation_name = config["activation"]

				self.question_embedding = utils.fully_connected(last_states, config["no_question_mlp"], activation=activation_name, scope='question_mlp')
				self.picture_embedding = utils.fully_connected(self.filmed_picture_out, config["no_picture_mlp"], activation=activation_name, scope='picture_mlp')

				self.full_embedding = self.picture_embedding * self.question_embedding
				# self.full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

				# self.guesser_out_0 = utils.fully_connected(self.full_embedding, config["no_hidden_prefinal_mlp"], scope='hidden_prefinal', activation=activation_name)
				self.guesser_out_0 = self.full_embedding

				# out = tf.nn.dropout(out, dropout_keep)
				
				# Since we are not having 
				# out = utils.fully_connected(out, no_answers, activation='linear', scope='layer_softmax')
				self.guesser_out = utils.fully_connected(self.guesser_out_0, config["no_hidden_final_mlp"], scope='hidden_final', activation=activation_name)
				self.guesser_out = tf.reshape(self.guesser_out, [-1, config["no_hidden_final_mlp"], 1])


			# TODO DONE: Add all these losses to tensorboard

			with tf.variable_scope('guesser_output'):
				# TODO: In paper they do dot product, but in code they do matmul !!
				scores = tf.matmul(obj_embs, self.guesser_out)
				scores = tf.reshape(scores, [-1, tf.shape(self.obj_cats)[1]])

				self.softmax = masked_softmax(scores, self.obj_mask)
				self.selected_object = tf.argmax(self.softmax, axis=1)

				self.guesser_error = tf.reduce_mean(utils.error(self.softmax, self.targets))
				self.guesser_loss = tf.reduce_mean(utils.cross_entropy(self.softmax, self.targets))
				self.loss += self.guesser_loss_weight * self.guesser_loss
 
				tf.summary.scalar("guesser loss", self.guesser_loss)

			# Compute policy gradient
			if policy_gradient:

				with tf.variable_scope('rl_baseline'):
					decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)

					flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])  #
					flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100, activation='relu', scope='baseline_hidden')
					flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

					self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence-1])
					self.baseline *= self.answer_mask[:, 1:]
					self.baseline *= self.padding_mask[:, 1:]


				with tf.variable_scope('policy_gradient_loss'):

					# Compute log_prob
					self.log_of_policy = tf.identity(self.cross_entropy_loss)
					self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward) (ignore the START token in the mask)
					# No need to use padding mask as the discounted_reward is already zero once the episode terminated

					# Policy gradient loss
					rewards *= self.answer_mask[:, 1:]
					self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

					self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

					self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)  # sum over the dialogue trajectory
					self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension

					self.loss = self.policy_gradient_loss

			tf.summary.scalar("total network loss", self.loss)
			self.summary = tf.summary.merge_all()
			print('Model... build!')
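FiLMResblock is defined elsewhere in the project and is not shown here. For reference, a minimal hedged sketch of the core FiLM operation such a block builds on (Perez et al.): a context vector predicts a per-channel scale and shift that modulate the convolutional feature maps. The name film_modulation and the use of tf.layers.dense are illustrative assumptions, not the project's implementation.

import tensorflow as tf

def film_modulation(feature_maps, context):
    # feature_maps: [batch, H, W, C]; context: [batch, d] (e.g. the dialogue LSTM state)
    n_channels = int(feature_maps.get_shape()[3])
    gamma_beta = tf.layers.dense(context, 2 * n_channels)
    gamma, beta = tf.split(gamma_beta, 2, axis=1)
    gamma = tf.reshape(gamma, [-1, 1, 1, n_channels])
    beta = tf.reshape(beta, [-1, 1, 1, n_channels])
    return gamma * feature_maps + beta  # FiLM: feature-wise affine modulation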