Example #1
        def step(prev_state_c, prev_state_h, tokens, seq_length,
                 stop_indicator):
            input = tf.gather(tokens, tf.shape(tokens)[0] - 1)

            # Check whether the dialogue has just finished
            is_stop_token = tf.equal(input, stop_token)
            is_stop_dialogue_token = tf.equal(input, stop_dialogue_token)
            is_stop = tf.logical_or(is_stop_token, is_stop_dialogue_token)
            stop_indicator = tf.logical_or(
                stop_indicator, is_stop)  # flag newly finished dialogues

            # increment seq_length when the dialogue is not over
            seq_length = tf.where(stop_indicator, seq_length,
                                  tf.add(seq_length, 1))

            # compute the next words. TODO: factorize with qgen.. but how?!
            with tf.variable_scope(self.scope_name, reuse=True):
                word_emb = utils.get_embedding(
                    input,
                    n_words=tokenizer.no_words,
                    n_dim=config['word_embedding_size'],
                    scope="word_embedding",
                    reuse=True)

                inp_emb = tf.concat([word_emb, self.image_emb], axis=1)
                with tf.variable_scope("word_decoder"):
                    lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                        config['num_lstm_units'],
                        layer_norm=False,
                        dropout_keep_prob=1.0,
                        reuse=True)

                    state = tf.contrib.rnn.LSTMStateTuple(c=prev_state_c,
                                                          h=prev_state_h)
                    out, state = lstm_cell(inp_emb, state)

                    # store/update the state only while the dialogue is not yet finished (i.e. after sampling the <?> token)
                    cond = tf.greater_equal(
                        seq_length, tf.subtract(tf.reduce_max(seq_length), 1))
                    state_c = tf.where(cond, state.c, prev_state_c)
                    state_h = tf.where(cond, state.h, prev_state_h)

                with tf.variable_scope('decoder_output'):
                    output = utils.fully_connected(state_h,
                                                   tokenizer.no_words,
                                                   reuse=True)

                    sampled_tokens = tf.cond(
                        self.greedy, lambda: tf.argmax(output, 1),
                        lambda: tf.reshape(tf.multinomial(output, 1), [-1]))
                    sampled_tokens = tf.cast(sampled_tokens, tf.int32)

            tokens = tf.concat(
                [tokens, tf.expand_dims(sampled_tokens, 0)],
                axis=0)  # check axis!

            return state_c, state_h, tokens, seq_length, stop_indicator
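A quick way to sanity-check the bookkeeping at the top of `step` is to run the stop-flag update and the conditional sequence-length increment on toy data. The sketch below is only an illustration; it assumes TensorFlow 1.x (as in the snippets on this page) and made-up token ids.

# Minimal sketch of the stop/seq_length bookkeeping performed in `step`.
import tensorflow as tf

stop_token = 3
last_tokens = tf.constant([3, 7, 5])                 # last sampled token per dialogue
stop_indicator = tf.constant([False, False, True])   # the third dialogue had already stopped
seq_length = tf.constant([4, 4, 2])

is_stop = tf.equal(last_tokens, stop_token)                # [True, False, False]
stop_indicator = tf.logical_or(stop_indicator, is_stop)    # flag newly finished dialogues
# only dialogues that are still running get a longer sequence
seq_length = tf.where(stop_indicator, seq_length, seq_length + 1)

with tf.Session() as sess:
    print(sess.run([stop_indicator, seq_length]))   # [True, False, True], [4, 5, 2]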
Example #2
def compute_attention(feature_maps, context, no_mlp_units, reuse=False):
    with tf.variable_scope("attention"):

        if len(feature_maps.get_shape()) == 3:
            h = tf.shape(feature_maps)[
                1]  # when the shape is dynamic (attention over lstm)
            w = 1
            c = int(feature_maps.get_shape()[2])
        else:
            h = int(feature_maps.get_shape()[1])
            w = int(feature_maps.get_shape()[2])
            c = int(feature_maps.get_shape()[3])

        s = int(context.get_shape()[1])

        feature_maps = tf.reshape(feature_maps, shape=[-1, h * w, c])

        context = tf.expand_dims(context, axis=1)
        context = tf.tile(context, [1, h * w, 1])

        embedding = tf.concat([feature_maps, context], axis=2)
        embedding = tf.reshape(embedding, shape=[-1, s + c])

        # compute the evidence from the embedding
        with tf.variable_scope("mlp"):
            e = utils.fully_connected(embedding,
                                      no_mlp_units,
                                      scope='hidden_layer',
                                      activation="relu",
                                      reuse=reuse)
            e = utils.fully_connected(e, 1, scope='out', reuse=reuse)

        e = tf.reshape(e, shape=[-1, h * w, 1])

        # compute the softmax over the evidence
        alpha = tf.nn.softmax(e, dim=1)

        # apply soft attention
        soft_attention = feature_maps * alpha
        soft_attention = tf.reduce_sum(soft_attention, axis=1)

    return soft_attention
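Once the MLP has produced one evidence scalar per spatial location, the rest of `compute_attention` is a softmax over locations followed by a weighted sum. A minimal, self-contained sketch of that pooling step (TensorFlow 1.x assumed; random tensors stand in for the feature maps and the MLP output):

import tensorflow as tf

batch, hw, c = 2, 49, 2048                     # e.g. a 7x7x2048 feature map, flattened
feature_maps = tf.random_normal([batch, hw, c])
e = tf.random_normal([batch, hw, 1])           # one evidence scalar per location

alpha = tf.nn.softmax(e, axis=1)               # attention weights over the 49 locations
soft_attention = tf.reduce_sum(feature_maps * alpha, axis=1)   # -> [batch, c]

with tf.Session() as sess:
    print(sess.run(tf.shape(soft_attention)))  # [2 2048]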
    def create_cbn_input(self, feature_maps):
        no_features = int(feature_maps.get_shape()[3])
        batch_size = tf.shape(feature_maps)[0]

        if self.use_betas:
            h_betas = utils.fully_connected(self.lstm_state,
                                            self.cbn_embedding_size,
                                            weight_initializer=RandomUniform(
                                                -1e-4, 1e-4),
                                            scope="hidden_betas",
                                            activation='relu')

            delta_betas = utils.fully_connected(
                h_betas,
                no_features,
                scope="delta_beta",
                weight_initializer=RandomUniform(-1e-4, 1e-4),
                use_bias=False)
        else:
            delta_betas = tf.tile(tf.constant(0.0, shape=[1, no_features]),
                                  tf.stack([batch_size, 1]))

        if self.use_gammas:
            h_gammas = utils.fully_connected(self.lstm_state,
                                             self.cbn_embedding_size,
                                             weight_initializer=RandomUniform(
                                                 -1e-4, 1e-4),
                                             scope="hidden_gammas",
                                             activation='relu')
            delta_gammas = utils.fully_connected(
                h_gammas,
                no_features,
                scope="delta_gamma",
                weight_initializer=RandomUniform(-1e-4, 1e-4))
        else:
            delta_gammas = tf.tile(tf.constant(0.0, shape=[1, no_features]),
                                   tf.stack([batch_size, 1]))

        return delta_betas, delta_gammas
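`create_cbn_input` only predicts the residual shifts delta_betas and delta_gammas; how they are applied is outside this snippet. In conditional batch normalization they are typically added to the default scale and shift, i.e. gamma = 1 + delta_gamma and beta = delta_beta, broadcast over the spatial dimensions. The sketch below illustrates that application under simplifying assumptions (per-example spatial statistics instead of proper batch-norm moving statistics):

import tensorflow as tf

batch, h, w, c = 2, 7, 7, 16
feature_maps = tf.random_normal([batch, h, w, c])
delta_betas = tf.zeros([batch, c])     # stand-ins for the MLP outputs above
delta_gammas = tf.zeros([batch, c])

# simplified normalization (a real CBN layer would use batch statistics)
mean, var = tf.nn.moments(feature_maps, axes=[1, 2], keep_dims=True)
normalized = (feature_maps - mean) / tf.sqrt(var + 1e-5)

gamma = 1.0 + tf.reshape(delta_gammas, [batch, 1, 1, c])
beta = tf.reshape(delta_betas, [batch, 1, 1, c])
cbn_out = gamma * normalized + beta    # same shape as feature_maps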
    def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
        AbstractNetwork.__init__(self, "qgen", device=device)

        # Create the scope for this graph
        with tf.variable_scope(self.scope_name, reuse=reuse):

            mini_batch_size = None

            # Picture
            self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

            # Question
            self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
            self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='answer_mask')  # 1 if keep and (1 q/a 1) for (START q/a STOP)
            self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
            self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

            # Rewards
            self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')

            # DECODER Hidden state (for beam search)
            zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero-vector
            zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to build a zero state with a dynamic batch size

            self.decoder_zero_state_c = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
            self.decoder_zero_state_h = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
            decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)

            # Misc
            self.is_training = tf.placeholder(tf.bool, name='is_training')
            self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy")  # greedy vs. multinomial sampling at decoding time
            self.samples = None

            # remove last token
            input_dialogues = self.dialogues[:, :-1]
            input_seq_length = self.seq_length - 1

            # remove first token(=start token)
            rewards = self.cum_rewards[:, 1:]
            target_words = self.dialogues[:, 1:]

            # to understand the padding:
            # input
            #   <start>  is   it   a    blue   <?>   <yes>   is   it  a    car  <?>   <no>   <stop_dialogue>
            # target
            #    is      it   a   blue   <?>    -      is    it   a   car  <?>   -   <stop_dialogue>  -



            # image processing
            if len(config["image"]["dim"]) == 1:
                self.image_out = self.images
            else:
                self.image_out = get_attention(self.images, None, "none") #TODO: improve by using the previous lstm state?


            # Reduce the embedding size of the image
            with tf.variable_scope('picture_embedding'):
                self.picture_emb = utils.fully_connected(self.image_out,
                                                    config['picture_embedding_size'])
                picture_emb = tf.expand_dims(self.picture_emb, 1)
                picture_emb = tf.tile(picture_emb, [1, tf.shape(input_dialogues)[1], 1])

            # Compute the question embedding
            input_words = utils.get_embedding(
                input_dialogues,
                n_words=num_words,
                n_dim=config['word_embedding_size'],
                scope="word_embedding")

            # concat word embedding and picture embedding
            decoder_input = tf.concat([input_words, picture_emb], axis=2, name="concat_full_embedding")


            # encode one word+picture
            decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                    config['num_lstm_units'],
                    layer_norm=False,
                    dropout_keep_prob=1.0,
                    reuse=reuse)


            self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
                cell=decoder_lstm_cell,
                inputs=decoder_input,
                dtype=tf.float32,
                initial_state=decoder_initial_state,
                sequence_length=input_seq_length,
                scope="word_decoder")  # TODO: use multi-layer RNN

            max_sequence = tf.reduce_max(self.seq_length)

            # compute the softmax for evaluation
            with tf.variable_scope('decoder_output'):
                flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
                flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

                # retrieve the batch/dialogue format
                mlp_output = tf.reshape(flat_mlp_output, [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])  # Ignore the STOP token

                self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
                self.argmax_output = tf.argmax(mlp_output, axis=2)

                self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mlp_output, labels=target_words)

            # compute the maximum likelihood loss
            with tf.variable_scope('ml_loss'):

                ml_loss = tf.identity(self.cross_entropy_loss)
                ml_loss *= self.answer_mask[:, 1:]  # remove answers (ignore the <stop> token)
                ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

                # Count number of unmask elements
                count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # no_unpad - no_qa - START token

                ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
                ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
                self.ml_loss = ml_loss / count  # Normalize

                self.loss = self.ml_loss

            # Compute policy gradient
            if policy_gradient:

                with tf.variable_scope('rl_baseline'):
                    decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)

                    flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])  #
                    flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100, activation='relu', scope='baseline_hidden')
                    flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

                    self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence-1])
                    self.baseline *= self.answer_mask[:, 1:]
                    self.baseline *= self.padding_mask[:, 1:]


                with tf.variable_scope('policy_gradient_loss'):

                    # Compute log_prob
                    self.log_of_policy = tf.identity(self.cross_entropy_loss)
                    self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward) (ignore the START token in the mask)
                    # No need to use padding mask as the discounted_reward is already zero once the episode terminated

                    # Policy gradient loss
                    rewards *= self.answer_mask[:, 1:]
                    self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

                    self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

                    self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)  # sum over the dialogue trajectory
                    self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension

                    self.loss = self.policy_gradient_loss
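When `policy_gradient` is enabled, the loss above is the score-function (REINFORCE) estimator: the per-token cross-entropy, i.e. -log pi of the sampled token, weighted by the advantage rewards - baseline. Minimizing it increases the log-probability of tokens that led to a higher-than-baseline reward. A toy illustration with made-up numbers (not the training code):

import tensorflow as tf

cross_entropy = tf.constant([[0.5, 1.0, 0.2],
                             [0.8, 0.3, 0.0]])   # -log pi(token) per step
rewards = tf.constant([[1.0, 1.0, 0.0],
                       [0.0, 0.0, 0.0]])
baseline = tf.constant([[0.4, 0.4, 0.0],
                        [0.2, 0.0, 0.0]])

score_function = cross_entropy * (rewards - baseline)          # -log pi * advantage
pg_loss = tf.reduce_mean(tf.reduce_sum(score_function, axis=1))
baseline_loss = tf.reduce_sum(tf.square(rewards - baseline))   # fit the baseline to the rewards

with tf.Session() as sess:
    print(sess.run([pg_loss, baseline_loss]))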
Example #5
def compute_glimpse(feature_maps,
                    context,
                    no_glimpse,
                    glimpse_embedding_size,
                    keep_dropout,
                    reuse=False):
    with tf.variable_scope("glimpse"):
        h = int(feature_maps.get_shape()[1])
        w = int(feature_maps.get_shape()[2])
        c = int(feature_maps.get_shape()[3])

        # reshape state to perform batch operation
        context = tf.nn.dropout(context, keep_dropout)
        projected_context = utils.fully_connected(context,
                                                  glimpse_embedding_size,
                                                  scope='hidden_layer',
                                                  activation="tanh",
                                                  use_bias=False,
                                                  reuse=reuse)

        projected_context = tf.expand_dims(projected_context, axis=1)
        projected_context = tf.tile(projected_context, [1, h * w, 1])
        projected_context = tf.reshape(projected_context,
                                       [-1, glimpse_embedding_size])

        feature_maps = tf.reshape(feature_maps, shape=[-1, h * w, c])

        glimpses = []
        with tf.variable_scope("glimpse"):
            g_feature_maps = tf.reshape(
                feature_maps,
                shape=[-1, c])  # linearise the feature map as a single batch
            g_feature_maps = tf.nn.dropout(g_feature_maps, keep_dropout)
            g_feature_maps = utils.fully_connected(g_feature_maps,
                                                   glimpse_embedding_size,
                                                   scope='image_projection',
                                                   activation="tanh",
                                                   use_bias=False,
                                                   reuse=reuse)

            hadamard = g_feature_maps * projected_context
            hadamard = tf.nn.dropout(hadamard, keep_dropout)

            e = utils.fully_connected(hadamard,
                                      no_glimpse,
                                      scope='hadamard_projection',
                                      reuse=reuse)
            e = tf.reshape(e, shape=[-1, h * w, no_glimpse])

            for i in range(no_glimpse):
                ev = e[:, :, i]
                alpha = tf.nn.softmax(ev)
                # apply soft attention
                soft_glimpses = feature_maps * tf.expand_dims(alpha, -1)
                soft_glimpses = tf.reduce_sum(soft_glimpses, axis=1)

                glimpses.append(soft_glimpses)

        full_glimpse = tf.concat(glimpses, axis=1)

    return full_glimpse
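`compute_glimpse` differs from `compute_attention` mainly in producing `no_glimpse` separate attention maps and concatenating the pooled vectors. A minimal sketch of that final step (TensorFlow 1.x assumed; random evidence stands in for the Hadamard projection):

import tensorflow as tf

batch, hw, c, no_glimpse = 2, 49, 512, 2
feature_maps = tf.random_normal([batch, hw, c])
e = tf.random_normal([batch, hw, no_glimpse])     # one evidence map per glimpse

glimpses = []
for i in range(no_glimpse):
    alpha = tf.nn.softmax(e[:, :, i])                                   # [batch, hw]
    glimpse = tf.reduce_sum(feature_maps * tf.expand_dims(alpha, -1), axis=1)
    glimpses.append(glimpse)                                            # each [batch, c]

full_glimpse = tf.concat(glimpses, axis=1)        # [batch, no_glimpse * c]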
def compute_all_attention(question_states,
                          caption,
                          history_states,
                          image_feature,
                          no_mlp_units,
                          reuse=False,
                          config=None):

    print("image_feature = {}", image_feature)
    print("Question = {}", question_states)
    print("caption = {}", caption)
    print("history_sta = {}", history_states)
    ##### 1 ####
    # retrieve the data

    #### 2 ####
    # step dictionary

    ##### loop over the step dictionary ########
    # feature_input = mlp(x, g1, g2)          [1 .. feature_shape]
    # soft_feature = softmax(feature_input)   [1 .. feature_input_shape]
    # x = soft_feature * x                    [1 .. feature_input_shape]

    with tf.variable_scope("coattention"):
        if image_feature is not None:
            if len(image_feature.get_shape()) == 3:
                h = tf.shape(image_feature)[
                    1]  # when the shape is dynamic (attention over lstm)
                w = 1
                c = int(image_feature.get_shape()[2])
            else:
                h = int(image_feature.get_shape()[1])
                w = int(image_feature.get_shape()[2])
                c = int(image_feature.get_shape()[3])

            s = int(question_states.get_shape()[2])
            image_feature = tf.reshape(image_feature, shape=[
                -1, (h * w), c
            ])  # input image_feature ?,7,7,2048 => ?,49,2048
            print("******************** B Image_feature = {} ".format(
                image_feature))
            image_feature = tf.reduce_sum(image_feature, axis=1)
            print("******************** A Image_feature = {} ".format(
                image_feature))

            set_img(image_feature)

        question_shape = question_states.get_shape()

        question_states = tf.reshape(
            question_states,
            shape=[-1, int(question_shape[1]) * int(question_shape[2])])

        set_question(question_states)

        if history_states is not None:
            if caption is not None:
                caption = tf.expand_dims(caption, axis=1)
                print("caption = {}, history_states = {}".format(
                    caption, history_states))
                history_states = tf.reshape(history_states, [-1, 6, 1024])
                hist = tf.concat([caption, history_states], axis=1)
                print("hist B = {}".format(hist))
                hist = tf.reshape(hist, [-1, 7 * 1024])

            else:
                # hist = tf.reshape(history_states,[-1,6*1024])
                hist = history_states
                # print("hist = {} ".format(hist))
                # exit()

            set_history(hist)

        # return question_states,hist,image_feature

        dict_step = {0: "img", 1: "question", 3: "hist"}

        step_attention = {
            0: [0, 1, None],
            1: [3, 0, 1],
            2: [1, 0, 3],
            3: [0, 3, 1]
        }

        # step_attention = {0:[0,1,None]}

        for key, value in step_attention.items():

            input_data, g1, g2 = get_input_g1_g2(value[0], value[1], value[2])

            dimension_two = int(input_data.get_shape()[1])

            print("---- input_shape = {} ".format(input_data.get_shape()))
            print("---- g1_shape = {} ".format(g1.get_shape()))
            # print("g2_shape = {} ".format(g2.get_shape()))

            hidden_mlp, weight = utils.fully_connected(
                input_data,
                no_mlp_units,
                scope='hidden_layer_256_{}'.format(key),
                activation="tanh",
                reuse=reuse,
                co_attention=True,
                g1=g1,
                g2=g2,
                key_input=key)

            hidden_mlp = utils.fully_connected(
                hidden_mlp,
                1,
                scope='hidden_layer_1_{}'.format(key),
                reuse=reuse,
                co_attention=False)

            alpha = tf.nn.softmax(hidden_mlp, axis=1)
            input_data = input_data * alpha
            if value[0] == 0:
                set_img(input_data)
            elif value[0] == 1:
                set_question(input_data)
            elif value[0] == 3:
                set_history(input_data)

            # print("-- {} -- hidden = {} ,ALPHA= {} ,INPUT_DATA = {}".format(key,hidden_mlp,alpha,input_data))
            # print("Data_ouput = ",get_img(),get_question(),get_history())
            # if key == 2:
            #     exit()

        # img_shape = get_img().get_shape()
        # question_shape = get_question().get_shape()
        # history_shape = get_history().get_shape()

        # img = tf.reshape(get_img(), shape=[-1,int(img_shape[1]) * int(img_shape[2]) ])
        # question = tf.reshape(get_question(), shape=[-1, int(question_shape[1]) * int(question_shape[2]) ])
        # history = tf.reshape(get_history(), shape=[-1,int(history_shape[1]) * int(history_shape[2]) ])
        question_states = get_question()
        history = get_history()
        image_feature = get_img()

        # print(" history = {} , image_feature = {} , question = {}".format(history,image_feature,question_states))
        # exit()

    return question_states, history, image_feature
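The loop above alternates attention over the image, question and history features, each step guided by the other modalities, and writes the re-weighted tensor back through the `set_*` helpers. Since `get_input_g1_g2` and those setters are defined elsewhere, the sketch below is only a rough, plain-tensor approximation of that alternating scheme, with a dot product against a mean-pooled guide standing in for the guided MLP:

import tensorflow as tf

batch = 2
modalities = {
    "img": tf.random_normal([batch, 49, 256]),
    "question": tf.random_normal([batch, 14, 256]),
    "hist": tf.random_normal([batch, 7, 256]),
}
steps = [("img", "question"), ("hist", "img"), ("question", "hist"), ("img", "hist")]

for target, guide in steps:
    x = modalities[target]                                           # [batch, n, d]
    g = tf.reduce_mean(modalities[guide], axis=1, keep_dims=True)    # crude guidance vector
    evidence = tf.reduce_sum(x * g, axis=2, keep_dims=True)          # [batch, n, 1]
    alpha = tf.nn.softmax(evidence, axis=1)
    modalities[target] = x * alpha                                   # re-weighted modality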
def compute_attention(feature_maps, context, no_mlp_units, reuse=False):

    with tf.variable_scope("attention"):

        if len(feature_maps.get_shape()) == 3:
            h = tf.shape(feature_maps)[
                1]  # when the shape is dynamic (attention over lstm)
            w = 1
            c = int(feature_maps.get_shape()[2])
        else:
            h = int(feature_maps.get_shape()[1])
            w = int(feature_maps.get_shape()[2])
            c = int(feature_maps.get_shape()[3])

        s = int(context.get_shape()[1])

        feature_maps = tf.reshape(
            feature_maps, shape=[-1, h * w, c]
        )  #Tensor("oracle/attention/Reshape:0", shape=(?, 49, 2048), dtype=float32)

        context = tf.expand_dims(
            context, axis=1
        )  # Tensor("oracle/attention/ExpandDims:0", shape=(?, 1, 6144), dtype=float32)

        context = tf.tile(
            context,
            [1, h * w, 1])  # tf.tile([a,b,c], multiples=[2]) => [a,b,c,a,b,c]
        # Tensor("oracle/attention/Tile:0", shape=(?, 49, 6144), dtype=float32)

        embedding = tf.concat([feature_maps, context], axis=2)
        embedding = tf.reshape(
            embedding, shape=[-1, s + c]
        )  #Tensor("oracle/attention/Reshape_1:0", shape=(?, 8192), dtype=float32)

        # compute the evidence from the embedding
        with tf.variable_scope("mlp"):
            e = utils.fully_connected(
                embedding,
                no_mlp_units,
                scope='hidden_layer',
                activation="relu",
                reuse=reuse
            )  #Tensor("oracle/attention/mlp/Relu:0", shape=(?, 256), dtype=float32)
            # print(" Before E = {}".format(e))
            e = utils.fully_connected(
                e, 1, scope='out', reuse=reuse
            )  # Tensor("oracle/attention/mlp/out/add:0", shape=(?, 1), dtype=float32)
            # print(" After E = {}".format(e))

        e = tf.reshape(
            e, shape=[-1, h * w, 1]
        )  #Tensor("oracle/attention/Reshape_2:0", shape=(?, 49, 1), dtype=float32)

        # compute the softmax over the evidence
        alpha = tf.nn.softmax(
            e, dim=1
        )  # Tensor("oracle/attention/transpose_1:0", shape=(?, 49, 1), dtype=float32)

        # apply soft attention
        soft_attention = feature_maps * alpha  # Tensor("oracle/attention/mul:0", shape=(?, 49, 2048), dtype=float32)

        soft_attention = tf.reduce_sum(
            soft_attention, axis=1
        )  # Tensor("oracle/attention/Sum:0", shape=(?, 2048), dtype=float32)

    return soft_attention
Example #8
    def __init__(self, config, num_words, device='', reuse=False):
        AbstractNetwork.__init__(self, "guesser", device=device)

        mini_batch_size = None

        with tf.variable_scope(self.scope_name, reuse=reuse):

            # Dialogues
            self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None],
                                            name='dialogues')
            self.seq_length = tf.placeholder(tf.int32, [mini_batch_size],
                                             name='seq_length')

            # Objects
            self.obj_mask = tf.placeholder(tf.float32, [mini_batch_size, None],
                                           name='obj_mask')
            self.obj_cats = tf.placeholder(tf.int32, [mini_batch_size, None],
                                           name='obj_cats')
            self.obj_spats = tf.placeholder(
                tf.float32, [mini_batch_size, None, config['spat_dim']],
                name='obj_spats')

            # Targets
            self.targets = tf.placeholder(tf.int32, [mini_batch_size],
                                          name="targets_index")

            self.object_cats_emb = utils.get_embedding(
                self.obj_cats,
                config['no_categories'] + 1,
                config['cat_emb_dim'],
                scope='cat_embedding')

            self.objects_input = tf.concat(
                [self.object_cats_emb, self.obj_spats], axis=2)
            self.flat_objects_inp = tf.reshape(
                self.objects_input,
                [-1, config['cat_emb_dim'] + config['spat_dim']])

            with tf.variable_scope('obj_mlp'):
                h1 = utils.fully_connected(self.flat_objects_inp,
                                           n_out=config['obj_mlp_units'],
                                           activation='relu',
                                           scope='l1')
                h2 = utils.fully_connected(h1,
                                           n_out=config['dialog_emb_dim'],
                                           activation='relu',
                                           scope='l2')

            obj_embs = tf.reshape(
                h2,
                [-1, tf.shape(self.obj_cats)[1], config['dialog_emb_dim']])

            # Compute the word embedding
            input_words = utils.get_embedding(self.dialogues,
                                              n_words=num_words,
                                              n_dim=config['word_emb_dim'],
                                              scope="input_word_embedding")

            last_states, _ = rnn.variable_length_LSTM(
                input_words,
                num_hidden=config['num_lstm_units'],
                seq_length=self.seq_length)

            last_states = tf.reshape(last_states,
                                     [-1, config['num_lstm_units'], 1])
            scores = tf.matmul(obj_embs, last_states)
            scores = tf.reshape(scores, [-1, tf.shape(self.obj_cats)[1]])

            def masked_softmax(scores, mask):
                # subtract max for stability
                scores = scores - tf.tile(
                    tf.reduce_max(scores, axis=(1, ), keep_dims=True),
                    [1, tf.shape(scores)[1]])
                # compute padded softmax
                exp_scores = tf.exp(scores)
                exp_scores *= mask
                exp_sum_scores = tf.reduce_sum(exp_scores,
                                               axis=1,
                                               keep_dims=True)
                return exp_scores / tf.tile(exp_sum_scores,
                                            [1, tf.shape(exp_scores)[1]])

            self.softmax = masked_softmax(scores, self.obj_mask)
            self.selected_object = tf.argmax(self.softmax, axis=1)

            self.loss = tf.reduce_mean(
                utils.cross_entropy(self.softmax, self.targets))
            self.error = tf.reduce_mean(utils.error(self.softmax,
                                                    self.targets))
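`masked_softmax` zeroes out padded object slots before normalizing, so the guesser spreads probability mass only over real objects. A toy check of that behaviour (the tiling in the original is replaced by broadcasting, which computes the same thing):

import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5, -1.0]])
mask = tf.constant([[1.0, 1.0, 1.0, 0.0]])     # the last object slot is padding

scores = scores - tf.reduce_max(scores, axis=1, keep_dims=True)   # for numerical stability
exp_scores = tf.exp(scores) * mask
probs = exp_scores / tf.reduce_sum(exp_scores, axis=1, keep_dims=True)

with tf.Session() as sess:
    p = sess.run(probs)
    print(p, p.sum())   # the masked slot gets exactly 0, the rest sums to 1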
Example #9
    def __init__(self, config, no_words, no_answers, reuse=False, device=''):
        ResnetModel.__init__(self, "vqa", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse) as scope:

            self.batch_size = None

            #####################
            #   QUESTION
            #####################

            self._question = tf.placeholder(tf.int32, [self.batch_size, None],
                                            name='question')
            self._seq_length = tf.placeholder(tf.int32, [self.batch_size],
                                              name='seq_length')
            self._answer_count = tf.placeholder(tf.float32,
                                                [self.batch_size, no_answers],
                                                name='answer_count')

            self._is_training = tf.placeholder(tf.bool, name="is_training")

            dropout_keep = float(config.get("dropout_keep_prob", 1.0))
            dropout_keep = tf.cond(self._is_training,
                                   lambda: tf.constant(dropout_keep),
                                   lambda: tf.constant(1.0))

            word_emb = utils.get_embedding(self._question,
                                           n_words=no_words,
                                           n_dim=int(
                                               config["word_embedding_dim"]),
                                           scope="word_embedding",
                                           reuse=reuse)

            if config['glove']:
                self._glove = tf.placeholder(tf.float32, [None, None, 300],
                                             name="glove")
                word_emb = tf.concat([word_emb, self._glove], axis=2)

            self.question_lstm, self.all_lstm_states = rnn.variable_length_LSTM(
                word_emb,
                num_hidden=int(config["no_hidden_LSTM"]),
                dropout_keep_prob=dropout_keep,
                seq_length=self._seq_length,
                depth=int(config["no_LSTM_cell"]),
                scope="question_lstm",
                reuse=reuse)

            #####################
            #   IMAGES
            #####################

            self._image = tf.placeholder(tf.float32, [self.batch_size] +
                                         config['image']["dim"],
                                         name='image')
            self.image_out = get_image_features(image=self._image,
                                                question=self.question_lstm,
                                                is_training=self._is_training,
                                                scope_name=scope.name,
                                                config=config['image'],
                                                dropout_keep=dropout_keep)

            #####################
            #   COMBINE
            #####################
            activation_name = config["activation"]
            with tf.variable_scope('final_mlp'):

                self.question_embedding = utils.fully_connected(
                    self.question_lstm,
                    config["no_question_mlp"],
                    activation=activation_name,
                    scope='question_mlp')
                self.image_embedding = utils.fully_connected(
                    self.image_out,
                    config["no_image_mlp"],
                    activation=activation_name,
                    scope='image_mlp')

                full_embedding = self.image_embedding * self.question_embedding
                full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

                out = utils.fully_connected(full_embedding,
                                            config["no_hidden_final_mlp"],
                                            scope='layer1',
                                            activation=activation_name)
                out = tf.nn.dropout(out, dropout_keep)
                out = utils.fully_connected(out,
                                            no_answers,
                                            activation='linear',
                                            scope='layer2')

            # improve soft loss
            answer_count = tf.minimum(self._answer_count, 3)

            normalizing_sum = tf.maximum(
                1.0, tf.reduce_sum(answer_count, 1, keep_dims=True))
            self.answer_prob = answer_count / normalizing_sum
            self.soft_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=out, labels=self.answer_prob, name='soft_cross_entropy')
            self.soft_loss = self.soft_cross_entropy

            self.target_answer = tf.argmax(self._answer_count, axis=1)
            # unmorm_log_prob = tf.log(self._answer_count)
            # self.target_answer = tf.multinomial(unmorm_log_prob, num_samples=1)
            # self.target_answer = tf.reshape(self.target_answer, shape=[-1])

            self.hard_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=out,
                labels=self.target_answer,
                name='hard_cross_entropy')
            self.hard_loss = self.hard_cross_entropy

            if config['loss'] == 'soft':
                self.loss = self.soft_loss
            else:
                self.loss = self.hard_loss

            self.loss = tf.reduce_mean(self.loss)
            self.softmax = tf.nn.softmax(out, name='answer_prob')
            self.prediction = tf.argmax(
                out, axis=1,
                name='predicted_answer')  # no need to compute the softmax

            with tf.variable_scope('accuracy'):
                ind = tf.range(tf.shape(
                    self.prediction)[0]) * no_answers + tf.cast(
                        self.prediction, tf.int32)
                pred_count = tf.gather(tf.reshape(self._answer_count, [-1]),
                                       ind)
                self.extended_accuracy = tf.minimum(pred_count / 3.0,
                                                    1.0,
                                                    name="extended_accuracy")
                self.accuracy = tf.reduce_mean(self.extended_accuracy)

            tf.summary.scalar('soft_loss', self.soft_loss)
            tf.summary.scalar('hard_loss', self.hard_loss)
            tf.summary.scalar('accuracy', self.accuracy)

            print('Model... build!')
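The soft VQA loss clips each answer's annotator count at 3 and normalizes it into a target distribution, and the extended accuracy gives min(count / 3, 1) credit to the predicted answer. A worked toy example (illustration only, batch size 1):

import tensorflow as tf

answer_count = tf.constant([[5.0, 1.0, 0.0, 0.0]])   # 5 annotators chose answer 0, 1 chose answer 1
clipped = tf.minimum(answer_count, 3)                # [[3., 1., 0., 0.]]
answer_prob = clipped / tf.maximum(1.0, tf.reduce_sum(clipped, 1, keep_dims=True))
# -> [[0.75, 0.25, 0., 0.]], the soft target distribution

prediction = tf.constant([1])                        # the model predicts answer 1
pred_count = tf.gather(tf.reshape(answer_count, [-1]), prediction)
extended_accuracy = tf.minimum(pred_count / 3.0, 1.0)   # 1/3 credit

with tf.Session() as sess:
    print(sess.run([answer_prob, extended_accuracy]))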
Example #10
    def __init__(self, config, num_words, device='', reuse=False):
        ResnetModel.__init__(self, "oracle", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
            embeddings = []
            self.batch_size = None

            # QUESTION
            self._is_training = tf.placeholder(tf.bool, name="is_training")
            self._question = tf.placeholder(tf.int32, [self.batch_size, None],
                                            name='question')
            self._seq_length = tf.placeholder(tf.int32, [self.batch_size],
                                              name='seq_length')

            word_emb = utils.get_embedding(
                self._question,
                n_words=num_words,
                n_dim=int(config['model']['question']["embedding_dim"]),
                scope="word_embedding")

            lstm_states, _ = rnn.variable_length_LSTM(
                word_emb,
                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                seq_length=self._seq_length)
            embeddings.append(lstm_states)

            # CATEGORY
            if config['inputs']['category']:
                self._category = tf.placeholder(tf.int32, [self.batch_size],
                                                name='category')

                cat_emb = utils.get_embedding(
                    self._category,
                    int(config['model']['category']["n_categories"]) +
                    1,  # we add the unknown category
                    int(config['model']['category']["embedding_dim"]),
                    scope="cat_embedding")
                embeddings.append(cat_emb)
                print("Input: Category")

            # SPATIAL
            if config['inputs']['spatial']:
                self._spatial = tf.placeholder(tf.float32,
                                               [self.batch_size, 8],
                                               name='spatial')
                embeddings.append(self._spatial)
                print("Input: Spatial")

            # IMAGE
            if config['inputs']['image']:
                self._image = tf.placeholder(tf.float32, [self.batch_size] +
                                             config['model']['image']["dim"],
                                             name='image')
                self.image_out = get_image_features(
                    image=self._image,
                    question=lstm_states,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    config=config['model']['image'])
                embeddings.append(self.image_out)
                print("Input: Image")

            # CROP
            if config['inputs']['crop']:
                self._crop = tf.placeholder(tf.float32, [self.batch_size] +
                                            config['model']['crop']["dim"],
                                            name='crop')
                self.crop_out = get_image_features(
                    image=self._crop,
                    question=lstm_states,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    config=config["model"]['crop'])

                embeddings.append(self.crop_out)
                print("Input: Crop")

            # Compute the final embedding
            emb = tf.concat(embeddings, axis=1)

            # OUTPUT
            num_classes = 3
            self._answer = tf.placeholder(tf.float32,
                                          [self.batch_size, num_classes],
                                          name='answer')

            with tf.variable_scope('mlp'):
                num_hiddens = config['model']['MLP']['num_hiddens']
                l1 = utils.fully_connected(emb,
                                           num_hiddens,
                                           activation='relu',
                                           scope='l1')

                self.pred = utils.fully_connected(l1,
                                                  num_classes,
                                                  activation='softmax',
                                                  scope='softmax')
                self.best_pred = tf.argmax(self.pred, axis=1)

            self.loss = tf.reduce_mean(
                utils.cross_entropy(self.pred, self._answer))
            self.error = tf.reduce_mean(utils.error(self.pred, self._answer))

            print('Model... Oracle build!')
    def __init__(self, config, num_words_question ,num_words_description=None,  device='', reuse=False):
        ResnetModel.__init__(self, "oracle", device=device)

        with open("data/dict_word_embedding_{}_{}.pickle".format("fasttext",config["model"]["question"]["embedding_type"]),"rb") as f:
            dict_all_embedding = pickle.load(f)


        with tf.variable_scope(self.scope_name, reuse=reuse) as scope:
            embeddings = []
            co_attention = [None,None,None,None]
            self.batch_size = None
            max_seq_length = 12
            

            # QUESTION
            if config['inputs']['question']:
                self._is_training = tf.placeholder(tf.bool, name="is_training")
                # self._question_word = tf.placeholder(tf.int32, [self.batch_size], name='question_word') # 
                self._question = tf.placeholder(tf.int32, [self.batch_size, 14], name='question')
                self.seq_length_question = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question')

                if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
            
                    print("****** WITH EMBEDDING ******")
                    word_emb = utils.get_embedding(self._question,
                                                n_words=num_words_question,
                                                n_dim=int(config["model"]["word_embedding_dim"]),
                                                scope="word_embedding",
                                                dict_all_embedding=dict_all_embedding)
                else:
                    print("****** NOT EMBEDDING ******")


                    word_emb = utils.get_embedding(self._question,
                                                n_words=num_words_question,
                                                n_dim=int(config["model"]["word_embedding_dim"]),
                                                scope="word_embedding",
                                                dict_all_embedding=[])

                    print(".... word_emb 1 = {} ".format(word_emb))                            

                self.out_question = None

                if config['model']['question']['lstm']:
                    self.lstm_states_question, self.lstm_all_state_ques = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_question)

                    self.out_question =  self.lstm_all_state_ques

                    # print("out_queston = {} ".format(self.lstm_states_question))
                    # exit()
                    # self.out_question = tf.reshape(self.out_question,[-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2] ])


                else:
                    self.out_question = word_emb

                if config["model"]["attention"]["co-attention"]:
                    co_attention[0] = self.out_question     # Tensor("oracle/lstm/lstmcell0/concat:0", shape=(?, 14, 1024), dtype=float32)
                    embeddings.append(self.lstm_states_question)
                    # print("question_lstm = {} ".format(self.out_question ))

                    # exit()
                else:
                    embeddings.append(self.lstm_states_question)

                # QUESTION-Pos
                if config['model']['question']['pos']:
                    print("----------------------------------------")
                    print("**** Oracle_network |  input = question-pos ")

                    self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='question_pos')
                    self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_ques_pos')
                    word_emb = utils.get_embedding(self._question_pos,
                                                n_words=num_words_question,
                                                n_dim=100,
                                                scope="word_embedding_pos")

                    if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                        self._glove = tf.placeholder(tf.float32, [None, None,int(config["model"]["word_embedding_dim"])], name="embedding_vector_ques_pos")
                        word_emb = tf.concat([word_emb, self._glove], axis=2)

                    else:
                        print("None ****************")
                    

                    lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_pos,scope="lstm2")
                    


                    # embeddings.append(lstm_states)

            # DESCRIPTION
            if config['inputs']['description']:
                print("****  Oracle_network |  input = Description ")

                self._description = tf.placeholder(tf.int32, [self.batch_size, None], name='description')
                self.seq_length_description = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_description')

                word_emb = utils.get_embedding(self._description,
                                            n_words=num_words_question,
                                            n_dim=100,
                                            reuse=True,
                                            scope="word_embedding")

                # print("word_emb = {} ".format(word_emb))

                if config['model']['question']['lstm']:
                    self.lstm_states_des, self.lstm_all_state_des = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_description,scope="lstm3")




                    self.out_question =  self.lstm_states_des

                    # print("self.out_question_emb = {} ".format(self.out_question))  

                    # self.out_question = tf.reshape(self.out_question,[-1, self.out_question.get_shape()[1] * self.out_question.get_shape()[2] ])
                else:
                    self.out_question = word_emb
                    # print("self.out_question = {} ".format(self.out_question)) 


                if config["model"]["attention"]["co-attention"]:
                    # co_attention[1] = self.out_question     # embeddings.append(self.lstm_all_state_ques)
                    embeddings.append(self.lstm_states_des)

                else:
                    embeddings.append(self.lstm_states_des)
                
            if config['inputs']['history_question']:
                
                placeholders_lstmQuestion = []
                placeholders_lstmLength = []

               
                for i in range(6):
                    self._embWord = tf.placeholder(tf.int32, [self.batch_size, 14], name="ques_hist_H{}".format(i))

                    self.seq_length_question_history = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_question_history_H{}'.format(i))

                    self.word_emb = utils.get_embedding(self._embWord,
                                        n_words=num_words_question,
                                        n_dim=100,
                                        reuse=True,
                                        scope="word_embedding")

                    placeholders_lstmQuestion.append(self.word_emb)
                    placeholders_lstmLength.append(self.seq_length_question_history)

            
                self.lstm_states, self.lstm_all_state_ques_hist = rnn.variable_length_LSTM(placeholders_lstmQuestion,
                                                num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                seq_length=placeholders_lstmLength,scope="lstm4",dim_4=True)


                if config["model"]["attention"]["co-attention"]:
                    co_attention[2] = self.lstm_states
                else:
                    embeddings.append(self.lstm_states)

                


             # Description-Pos

                if config['model']['description']['pos']:
                    print("----------------------------------------")
                    print("**** Oracle_network |  input = description-pos ")

                    self._question_pos = tf.placeholder(tf.int32, [self.batch_size, None], name='des_pos')
                    self.seq_length_pos = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_des_pos')

                    word_emb = utils.get_embedding(self._question_pos,
                                                n_words=num_words_question,
                                                n_dim=300,
                                                scope="word_embedding_pos")

                    if config["model"]["glove"] == True or config["model"]["fasttext"] == True:
                        self._glove = tf.placeholder(tf.float32, [None, None, int(config["model"]["word_embedding_dim"])], name="embedding_vector_des_pos")
                        word_emb = tf.concat([word_emb, self._glove], axis=2)
                    else:
                        print("None ****************")
                    
                    lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                                                        num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                                                        seq_length=self.seq_length_pos,scope="lstm5")
                    

                    # embeddings.append(lstm_states)

            # CATEGORY
            if config['inputs']['category']:
                print("****  Oracle_network |  input = category ")
                

                if config["model"]["category"]["use_embedding"]:
                    self._category = tf.placeholder(tf.float32, [self.batch_size,int(config["model"]["word_embedding_dim"])], name='category')
                    cat_emb = self._category                    
                    # cat_emb = utils.get_embedding(self._category,
                    #                               int(config['model']['category']["n_categories"]) + 1,
                    #                               n_dim=int(config["model"]["word_embedding_dim"]),
                    #                               scope="cat_embedding",
                    #                               dict_all_embedding=dict_all_embedding
                    #                              )
                else:
                    self._category = tf.placeholder(tf.int32, [self.batch_size], name='category')
                    cat_emb = utils.get_embedding(self._category,
                                                int(config['model']['category']["n_categories"]) + 1,  # we add the unknown category
                                                int(config["model"]["word_embedding_dim"]),
                                                scope="cat_embedding",
                                                 )



                # cat_emb = tf.expand_dims(cat_emb,1)
                embeddings.append(cat_emb)
                print("Input: Category")


            # ALLCATEGORY
            if config['inputs']['allcategory']:
                print("**** Oracle_network |  input = allcategory ")

        
                
                self._allcategory = tf.placeholder(tf.float32, [self.batch_size,90], name='allcategory')
                # self.seq_length_allcategory = tf.placeholder(tf.int32, [self.batch_size], name='seq_length_allcategory')

                # word_emb = utils.get_embedding(self._allcategory,
                #                             n_words=int(config['model']['category']["n_categories"]) + 1,
                #                             n_dim=int(config['model']['description']["embedding_dim"]),
                #                             scope="word_embedding_allcategory")

                
                #print(" SeqDescription = ",self.seq_length_description)
                # lstm_states, _ = rnn.variable_length_LSTM(word_emb,
                #                                     num_hidden=int(config['model']['question']["no_LSTM_hiddens"]),
                #                                     seq_length=self.seq_length_allcategory,scope="lstm3")
                
                print(" Oracle_network | embdedding all_cate=",word_emb)
                # embeddings.append(self._allcategory)
                print("Input: allcategory")

                
            # SPATIAL
            if config['inputs']['spatial']:
                print("****  Oracle_network |  input = spatial ")
                self._spatial = tf.placeholder(tf.float32, [self.batch_size, 8], name='spatial')
                embeddings.append(self._spatial)
                print("Input: Spatial")


            # IMAGE
            if config['inputs']['image']:
                print("****  Oracle_network |  input = image ")

                self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
                self._image = tf.placeholder(tf.float32, [self.batch_size] + config['model']['image']["dim"], name='image')
                # self.image_out = tf.reshape(self._image,shpe=[224*224*3])
                # print("question = {} ".format(self.lstm_states_question))
                # exit()

                self.image_out = get_image_features(
                    image=self._image, question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Image/",
                    config=config['model']['image']
                )
                # embeddings.append(self.image_out)
                print("Input: Image")
                co_attention[3]  = self.image_out
                print(" -- image_int ={}".format(self._image))
                # exit()
                image_feature = tf.reshape(self.image_out, shape=[-1, (7 * 7) * 2048]) # input image_feature ?,7,7,2048 => ?,49,2048
                embeddings.append(image_feature)
                # print("... Image Features = {}".format(self.image_out))



            # CROP
            if config['inputs']['crop']:
                print("****  Oracle_network |  input = crop ")
                self._image_id = tf.placeholder(tf.float32, [self.batch_size], name='image_id')
                # self._crop_id = tf.placeholder(tf.float32, [self.batch_size], name='crop_id')

                self._crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop')
                
                
                
                if config["model"]["attention"]["co-attention"]:
                    self.crop_out = get_image_features(
                    image=self._crop, question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    config=config["model"]['crop'])
                    co_attention[3] = self.crop_out

                else:
                    self.crop_out = get_image_features(
                    image=self._crop, question=self.lstm_states_question,
                    is_training=self._is_training,
                    scope_name=scope.name,
                    scope_feature="Crop/",
                    co_attention=False,
                    config=config["model"]['crop'])

                    embeddings.append(self.crop_out)



            if config["model"]["crop"]["segment_crop"]["use"]:
                all_segment_crop = []
                # for i in range(10):
                self._segment_crop = tf.placeholder(tf.float32, [self.batch_size] + config['model']['crop']["dim"], name='crop_segment')
                

                self.crop_out = get_image_features(
                                image=self._segment_crop, question=self.lstm_states_question,
                                is_training=self._is_training,
                                scope_name="test",
                                scope_feature="Segment/",
                                config=config["model"]['crop'])


                print("self.crop_out = {} ".format(self.crop_out))


                

                        # all_segment_crop.add(self.crop_out)

                
                # print("-- crop = {},image_features = {} ".format(self.crop_out, image_feature))
                # exit()

            if config["model"]["attention"]["co-attention"]:
                question_feature,history_feature , image_feature = compute_all_attention(question_states=co_attention[0],
                                                                                caption=co_attention[1],
                                                                                history_states=co_attention[2],
                                                                                image_feature=co_attention[3],
                                                                                no_mlp_units=config['model']['attention']['no_attention_mlp'],
                                                                                config = config
                                                                                )


                embeddings.append(history_feature)
                embeddings.append(question_feature)
                embeddings.append(image_feature)

            # Compute the final embedding
            print("*** All Embeddings = ", embeddings)
            self.emb = tf.concat(embeddings, axis=1)
            print("*** self.emb = ", self.emb)
            
        

            # OUTPUT
            num_classes = 3
            self._answer = tf.placeholder(tf.float32, [self.batch_size, num_classes], name='answer')
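            # The three classes presumably correspond to the GuessWhat?! oracle answers
            # (yes / no / not applicable), fed here as a one-hot vector of size num_classes.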



            with tf.variable_scope('mlp'):
                num_hiddens = config['model']['MLP']['num_hiddens']
                # emb = tf.print(emb, [emb], "input: ")
                l1 = utils.fully_connected(self.emb, num_hiddens, activation='relu', scope='l1')
                self.pred = utils.fully_connected(l1, num_classes, activation='softmax', scope='softmax')
                self.best_pred = tf.argmax(self.pred, axis=1)
            # self.best_pred = tf.reduce_mean(self.best_pred)

            print("--- predict = {} ,answer = {} ".format(self.pred,self._answer))
            # exit()
            # self.loss = None
        
            self.loss = tf.reduce_mean(utils.cross_entropy(self.pred, self._answer))
            self.error = tf.reduce_mean(utils.error(self.pred, self._answer))

            print("loss = {} ,error = {} ".format(self.loss,self.error))
            print('Model... Oracle build!')
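            # Minimal training sketch (illustration only; the optimizer choice, learning rate and the
            # exact set of feed_dict keys are assumptions, not part of this example):
            #   optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
            #   train_op = optimizer.minimize(self.loss)
            #   sess.run(train_op, feed_dict={self._image: batch_images,
            #                                 self._answer: batch_answers,
            #                                 self._is_training: True,
            #                                 ...})  # plus the question/history placeholders defined earlier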
Example #12
0
	def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
		# AbstractNetwork.__init__(self, "qgen_guesser", device=device)
		ResnetModel.__init__(self, "qgen_guesser", device=device)

		# Create the scope for this graph
		with tf.variable_scope(self.scope_name, reuse=reuse):

			# Batch size is left as None because the train and validation sets use different batch sizes
			mini_batch_size = None
			self.guesser_loss_weight = tf.constant(config["guesser_loss_weight"], dtype = tf.float32, name = "guesser_loss_weight")
			self.qgen_loss_weight = tf.constant(config["qgen_loss_weight"], dtype = tf.float32, name = "qgen_loss_weight")
			self.loss = 0    
			# *********************************************************
			# Placeholders specific for guesser and its processing
			# *********************************************************
			
			# Objects
			self.obj_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='obj_mask')
			self.obj_cats = tf.placeholder(tf.int32, [mini_batch_size, None], name='obj_cats')
			self.obj_spats = tf.placeholder(tf.float32, [mini_batch_size, None, config['spat_dim']], name='obj_spats')

			# Targets
			self.targets = tf.placeholder(tf.int32, [mini_batch_size], name="targets_index")

			self.object_cats_emb = utils.get_embedding(
				self.obj_cats,
				config['no_categories'] + 1,
				config['cat_emb_dim'],
				scope='cat_embedding')

			self.objects_input = tf.concat([self.object_cats_emb, self.obj_spats], axis=2)
			self.flat_objects_inp = tf.reshape(self.objects_input, [-1, config['cat_emb_dim'] + config['spat_dim']])
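			# Shape sketch: obj_cats [batch, n_obj] -> category embedding [batch, n_obj, cat_emb_dim],
			# concatenated with obj_spats [batch, n_obj, spat_dim] and flattened to
			# [batch * n_obj, cat_emb_dim + spat_dim] so one MLP processes every object independently.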

			with tf.variable_scope('obj_mlp'):
				h1 = utils.fully_connected(
					self.flat_objects_inp,
					n_out=config['obj_mlp_units'],
					activation='relu',
					scope='l1')
				h2 = utils.fully_connected(
					h1,
					n_out=config['no_hidden_final_mlp'],
					activation='relu',
					scope='l2')
			# TODO: Object Embeddings do not have image features right now
			obj_embs = tf.reshape(h2, [-1, tf.shape(self.obj_cats)[1], config['no_hidden_final_mlp']])


			# *********************************************************
			# Placeholders for Qgen and common placeholder for guesser and its processing
			# *********************************************************

			# Image
			self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

			# Question
			self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
			self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='answer_mask')  # 1 for tokens kept in the loss (question tokens), 0 for answer tokens
			self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
			self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

			# Rewards
			self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')


			# DECODER Hidden state (for beam search)
			zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero-vector
			zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to do a dynamic size 0 tensors

			self.decoder_zero_state_c = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
			self.decoder_zero_state_h = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
			decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)
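			# Note: placeholder_with_default means training just uses the zero state above, while
			# beam search / sampling can feed a previously computed (c, h) pair instead.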

			# *******
			# Misc
			# *******
			
			self.is_training = tf.placeholder(tf.bool, name='is_training')
			self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy") # use for graph
			self.samples = None

			# At each position of the dialogue, the decoder predicts the next token

			# remove the last token
			input_dialogues = self.dialogues[:, :-1]
			input_seq_length = self.seq_length - 1

			# remove the first token (= start token)
			rewards = self.cum_rewards[:, 1:]
			target_words = self.dialogues[:, 1:]

			# to understand the padding:
			# input
			#   <start>  is   it   a    blue   <?>   <yes>   is   it  a    car  <?>   <no>   <stop_dialogue>
			# target
			#    is      it   a   blue   <?>    -      is    it   a   car  <?>   -   <stop_dialogue>  -
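			# In other words (derived from the masks below): target[t] = dialogues[t + 1], and the "-"
			# positions are excluded from the loss, i.e. answer tokens (via answer_mask) and the
			# padding after <stop_dialogue> (via padding_mask).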

			# TODO:
			# 1. Include FiLM in the guesser (check whether FiLM or CBN is being used)
			#    - Add finetuning to the training (see the training script and config file of CLEVR)
			#    - Check the use of finetuning (should a pretrained model be given as input?), normalization, etc.
			#    - Check in the config file whether the attention has to go inside the image block
			#    - As of now the image-embedding step only flattens the image; use an R-CNN or another method to get image features
			#    - Include attention on the image given the dialogue embedding in the guesser part
			#    - Include dropout on the LSTM (option for inside and outside) and on the image
			#    - Include attention on words given the image features
			# 2. Use tf.gather to collect the LSTM states at the yes/no (-) positions in the target and at the stop-dialogue token
			# 3. Make the code run
			#    - Check how the is_training flag works
			
			# image processing
			with tf.variable_scope('image_feature') as img_scope:

				if len(config["image"]["dim"]) == 1:
					self.image_out = self.images
				else:
					# TODO: Create a different config for this attention
					# Log the input images to TensorBoard
					tf.summary.image("image", self.images)
					self.image_out = get_image_features(
						image=self.images, question = None,
						is_training=self.is_training,
						scope_name=img_scope.name,
						config=config['image'],
						att = False
					)
					
					image_pooling_size = [int((self.image_out).get_shape()[1]), int((self.image_out).get_shape()[2])]
					image_feature_depth = int((self.image_out).get_shape()[3])

					self.image_out = tf.layers.max_pooling2d(self.image_out,
																		image_pooling_size,
																		1,
																		padding='valid',
																		data_format='channels_last',
																		name='max_pooling_image_out')
					self.image_out = tf.reshape(self.image_out, [-1, image_feature_depth])
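					# Shape sketch (e.g. with a [batch, 7, 7, 2048] feature map): pooling with a window
					# equal to the full spatial extent gives [batch, 1, 1, 2048], which is then
					# reshaped to a flat [batch, 2048] image vector.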

					# self.filmed_picture_out = tf.layers.average_pooling2d(	self.filmed_picture_out,
					# 														final_pooling_size,
					# 														1,
					# 														padding='valid',
					# 														data_format='channels_last',
					# 														name='average_pooling_filmed_picture_out')

					# self.image_out = get_attention(self.images, None, config["image"]["attention"]) #TODO: improve by using the previous lstm state?
					# self.image_out = tf.contrib.layers.flatten(self.image_out)

				print(self.image_out)


				# Reduce the embedding size of the image
				with tf.variable_scope('image_embedding'):
					self.image_emb = utils.fully_connected(self.image_out,
														   config['image_embedding_size'])
					image_emb = tf.expand_dims(self.image_emb, 1)
					image_emb = tf.tile(image_emb, [1, tf.shape(input_dialogues)[1], 1])
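					# The image embedding [batch, emb] is expanded to [batch, 1, emb] and tiled to
					# [batch, time, emb] so it can be concatenated to the word embedding at every step.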

			# Compute the question embedding

			input_words = utils.get_embedding(
				input_dialogues,
				n_words=num_words,
				n_dim=config['word_embedding_size'],
				scope="word_embedding")

			# concat word embedding and image embedding
			# TODO: Check the size (see if input_seq_length is increased or not)
			decoder_input = tf.concat([input_words, image_emb], axis=2, name="concat_full_embedding")

			# encode one word+image
			decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
					config['num_lstm_units'],
					layer_norm=False,
					dropout_keep_prob=1.0,
					reuse=reuse)

			# TODO: Since we have concatenated the image, check whether input_seq_length should be increased by one
			# Decoding the states to generate questions
			self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
				cell=decoder_lstm_cell,
				inputs=decoder_input,
				dtype=tf.float32,
				initial_state=decoder_initial_state,
				sequence_length=input_seq_length,
				scope="word_decoder")  # TODO: use multi-layer RNN

			max_sequence = tf.reduce_max(self.seq_length)

			# For the Guesser

			# Adding extra layers of LSTM
			# TODO: There are several default parameters in this function; try using them
			# TODO: not using this extension as of now
			# TODO: only the hidden state is used for now; the cell state could be included too
			last_states = self.decoder_state.h
			# last_states, _ = rnn.variable_length_LSTM_extension(
			#     self.decoder_output,
			#     self.decoder_state,
			#     num_hidden = config['num_lstm_units'],
			#     seq_length = input_seq_length
			#     )

			last_states = tf.reshape(last_states, [-1, config['num_lstm_units']])


			# TODO: Can be moved to utils  
			def masked_softmax(scores, mask):
				# subtract max for stability
				scores = scores - tf.tile(tf.reduce_max(scores, axis=(1,), keepdims=True), [1, tf.shape(scores)[1]])
				# compute padded softmax
				exp_scores = tf.exp(scores)
				exp_scores *= mask
				exp_sum_scores = tf.reduce_sum(exp_scores, axis=1, keepdims=True)
				return exp_scores / tf.tile(exp_sum_scores, [1, tf.shape(exp_scores)[1]])
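			# Worked toy example of masked_softmax (illustration only):
			#   scores = [[2., 1., 3.]], mask = [[1., 1., 0.]]
			#   max-subtracted: [[-1., -2., 0.]]; exp: [[0.37, 0.14, 1.00]]; masked: [[0.37, 0.14, 0.]]
			#   normalised: [[0.73, 0.27, 0.]] -> the masked (padded) object gets zero probability.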


			# compute the softmax for evaluation (over all the words in the dialogue)
			with tf.variable_scope('decoder_output'):
				flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
				flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

				# retrieve the batch/dialogue format
				mlp_output = tf.reshape(flat_mlp_output, [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])  # ignore the STOP token

				self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
				self.argmax_output = tf.argmax(mlp_output, axis=2)
				self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mlp_output, labels=target_words)

			# compute the maximum likelihood loss for the dialogues (for valid words)
			with tf.variable_scope('ml_loss'):

				ml_loss = tf.identity(self.cross_entropy_loss)
				ml_loss *= self.answer_mask[:, 1:]  # remove answers (ignore the <stop> token)
				ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

				# Count number of unmask elements
				count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # (#unpadded tokens) - (#answer tokens) - START token

				ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
				ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
				self.ml_loss = ml_loss / count  # Normalize

				self.qgen_loss = self.ml_loss
				self.loss += self.qgen_loss_weight * self.qgen_loss
				tf.summary.scalar("qgen_loss", self.qgen_loss)

			# TODO NOTE: IMPORTANT: in the config file, under the image section, set cbn to true

			with tf.variable_scope('guesser_input') as scope:
				
				# Getting the CBN image features
				self.CBN_picture_out = get_image_features(
					image=self.images, question = last_states,
					is_training=self.is_training,
					scope_name=scope.name,
					config=config['image']
				)

				# FILMING the Features

				# self.filmed_picture_out = film_layer(ft=self.CBN_picture_out, context=last_states)
				# TODO: Make n a hyperparameter and add it to network parameters
				self.filmed_picture_out = self.CBN_picture_out
				n = 1
				for i in range(n):
					with tf.variable_scope('film_layer_' + str(i)):
						self.filmed_picture_out = FiLMResblock(features=self.filmed_picture_out, context=last_states, is_training=self.is_training).get()


				# TODO: Doing a convolution over the feature maps (before the classifier)
				# Do a max pooling over the feature maps

				final_pooling_size = [int((self.filmed_picture_out).get_shape()[1]), int((self.filmed_picture_out).get_shape()[2])]
				final_feature_depth = int((self.filmed_picture_out).get_shape()[3])

				if str(config["pooling"]).lower() == 'max':
					self.filmed_picture_out = tf.layers.max_pooling2d(	self.filmed_picture_out,
																		final_pooling_size,
																		1,
																		padding='valid',
																		data_format='channels_last',
																		name='max_pooling_filmed_picture_out')

				elif str(config["pooling"]).lower() == 'avg':
					self.filmed_picture_out = tf.layers.average_pooling2d(	self.filmed_picture_out,
																			final_pooling_size,
																			1,
																			padding='valid',
																			data_format='channels_last',
																			name='average_pooling_filmed_picture_out')
				else:
					print "No Pooling defined"
					sys.exit()
				self.filmed_picture_out = tf.reshape(self.filmed_picture_out, [-1, final_feature_depth])
			
				# Combining filmed image and dialog features into one
				#####################

				activation_name = config["activation"]

				self.question_embedding = utils.fully_connected(last_states, config["no_question_mlp"], activation=activation_name, scope='question_mlp')
				self.picture_embedding = utils.fully_connected(self.filmed_picture_out, config["no_picture_mlp"], activation=activation_name, scope='picture_mlp')

				self.full_embedding = self.picture_embedding * self.question_embedding
				# self.full_embedding = tf.nn.dropout(full_embedding, dropout_keep)

				# self.guesser_out_0 = utils.fully_connected(self.full_embedding, config["no_hidden_prefinal_mlp"], scope='hidden_prefinal', activation=activation_name)
				self.guesser_out_0 = self.full_embedding

				# out = tf.nn.dropout(out, dropout_keep)
				
				# Since we are not having 
				# out = utils.fully_connected(out, no_answers, activation='linear', scope='layer_softmax')
				self.guesser_out = utils.fully_connected(self.guesser_out_0, config["no_hidden_final_mlp"], scope='hidden_final', activation=activation_name)
				self.guesser_out = tf.reshape(self.guesser_out, [-1, config["no_hidden_final_mlp"], 1])


			# TODO DONE: Add all these losses to tensorboard

			with tf.variable_scope('guesser_output'):
				# TODO: In paper they do dot product, but in code they do matmul !!
				scores = tf.matmul(obj_embs, self.guesser_out)
				scores = tf.reshape(scores, [-1, tf.shape(self.obj_cats)[1]])
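				# Shape note: obj_embs is [batch, n_obj, hidden] and self.guesser_out is [batch, hidden, 1],
				# so this batched matmul is exactly a per-object dot product with the dialogue/image
				# embedding, giving one score per object before the masked softmax.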

				self.softmax = masked_softmax(scores, self.obj_mask)
				self.selected_object = tf.argmax(self.softmax, axis=1)

				self.guesser_error = tf.reduce_mean(utils.error(self.softmax, self.targets))
				self.guesser_loss = tf.reduce_mean(utils.cross_entropy(self.softmax, self.targets))
				self.loss += self.guesser_loss_weight * self.guesser_loss
 
				tf.summary.scalar("guesser loss", self.guesser_loss)

			# Compute policy gradient
			if policy_gradient:

				with tf.variable_scope('rl_baseline'):
					decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)

					flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])  #
					flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100, activation='relu', scope='baseline_hidden')
					flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

					self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence-1])
					self.baseline *= self.answer_mask[:, 1:]
					self.baseline *= self.padding_mask[:, 1:]


				with tf.variable_scope('policy_gradient_loss'):

					# Compute log_prob
					self.log_of_policy = tf.identity(self.cross_entropy_loss)
					self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward) (ignore the START token in the mask)
					# No need to use padding mask as the discounted_reward is already zero once the episode terminated

					# Policy gradient loss
					rewards *= self.answer_mask[:, 1:]
					self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

					self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

					self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)  # sum over the dialogue trajectory
					self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension
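					# REINFORCE with a learned baseline, as implemented above: cross_entropy_loss equals
					# -log pi(w_t), so minimising sum_t -log pi(w_t) * (reward_t - baseline_t) maximises
					# the expected return, with the baseline only reducing variance (it has its own
					# regression objective, baseline_loss). Note that self.loss below is replaced by the
					# policy-gradient loss rather than added to the supervised losses.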

					self.loss = self.policy_gradient_loss

			tf.summary.scalar("total network loss", self.loss)
			self.summary = tf.summary.merge_all()
			print('Model... build!')