Example #1
    def __init__(self, config, num_words, policy_gradient, device='', reuse=False):
        AbstractNetwork.__init__(self, "qgen", device=device)

        # Create the scope for this graph
        with tf.variable_scope(self.scope_name, reuse=reuse):

            mini_batch_size = None

            # Picture
            self.images = tf.placeholder(tf.float32, [mini_batch_size] + config['image']["dim"], name='images')

            # Question
            self.dialogues = tf.placeholder(tf.int32, [mini_batch_size, None], name='dialogues')
            self.answer_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='answer_mask')  # 1 if keep and (1 q/a 1) for (START q/a STOP)
            self.padding_mask = tf.placeholder(tf.float32, [mini_batch_size, None], name='padding_mask')
            self.seq_length = tf.placeholder(tf.int32, [mini_batch_size], name='seq_length')

            # Rewards
            self.cum_rewards = tf.placeholder(tf.float32, shape=[mini_batch_size, None], name='cum_reward')

            # DECODER Hidden state (for beam search)
            zero_state = tf.zeros([1, config['num_lstm_units']])  # default LSTM state is a zero-vector
            zero_state = tf.tile(zero_state, [tf.shape(self.images)[0], 1])  # trick to create a zero state with a dynamic batch size

            self.decoder_zero_state_c = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_c")
            self.decoder_zero_state_h = tf.placeholder_with_default(zero_state, [mini_batch_size, config['num_lstm_units']], name="state_h")
            decoder_initial_state = tf.contrib.rnn.LSTMStateTuple(c=self.decoder_zero_state_c, h=self.decoder_zero_state_h)

            # Misc
            self.is_training = tf.placeholder(tf.bool, name='is_training')
            self.greedy = tf.placeholder_with_default(False, shape=(), name="greedy")  # used when sampling from the graph (greedy vs. stochastic decoding)
            self.samples = None

            # remove last token
            input_dialogues = self.dialogues[:, :-1]
            input_seq_length = self.seq_length - 1

            # remove first token(=start token)
            rewards = self.cum_rewards[:, 1:]
            target_words = self.dialogues[:, 1:]

            # to understand the padding:
            # input
            #   <start>  is   it   a    blue   <?>   <yes>   is   it  a    car  <?>   <no>   <stop_dialogue>
            # target
            #    is      it   a   blue   <?>    -      is    it   a   car  <?>   -   <stop_dialogue>  -
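            # (illustrative note, not in the original) after the one-token shift above,
            # answer_mask[:, 1:] keeps the question/<stop_dialogue> targets and zeroes the '-' (answer) positions,
            # while padding_mask[:, 1:] keeps tokens up to the true dialogue length and zeroes the trailing padding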



            # image processing
            if len(config["image"]["dim"]) == 1:
                self.image_out = self.images
            else:
                self.image_out = get_attention(self.images, None, "none") #TODO: improve by using the previous lstm state?


            # Reduce the embedding size of the image
            with tf.variable_scope('picture_embedding'):
                self.picture_emb = utils.fully_connected(self.image_out,
                                                    config['picture_embedding_size'])
                picture_emb = tf.expand_dims(self.picture_emb, 1)
                picture_emb = tf.tile(picture_emb, [1, tf.shape(input_dialogues)[1], 1])

            # Compute the question embedding
            input_words = utils.get_embedding(
                input_dialogues,
                n_words=num_words,
                n_dim=config['word_embedding_size'],
                scope="word_embedding")

            # concat word embedding and picture embedding
            decoder_input = tf.concat([input_words, picture_emb], axis=2, name="concat_full_embedding")


            # encode one word+picture
            decoder_lstm_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
                    config['num_lstm_units'],
                    layer_norm=False,
                    dropout_keep_prob=1.0,
                    reuse=reuse)


            self.decoder_output, self.decoder_state = tf.nn.dynamic_rnn(
                cell=decoder_lstm_cell,
                inputs=decoder_input,
                dtype=tf.float32,
                initial_state=decoder_initial_state,
                sequence_length=input_seq_length,
                scope="word_decoder")  # TODO: use multi-layer RNN

            max_sequence = tf.reduce_max(self.seq_length)

            # compute the softmax for evaluation
            with tf.variable_scope('decoder_output'):
                flat_decoder_output = tf.reshape(self.decoder_output, [-1, decoder_lstm_cell.output_size])
                flat_mlp_output = utils.fully_connected(flat_decoder_output, num_words)

                # retrieve the batch/dialogue format
                mlp_output = tf.reshape(flat_mlp_output, [tf.shape(self.seq_length)[0], max_sequence - 1, num_words])  # Ignore the STOP token

                self.softmax_output = tf.nn.softmax(mlp_output, name="softmax")
                self.argmax_output = tf.argmax(mlp_output, axis=2)

                self.cross_entropy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=mlp_output, labels=target_words)

            # compute the maximum likelihood loss
            with tf.variable_scope('ml_loss'):

                ml_loss = tf.identity(self.cross_entropy_loss)
                ml_loss *= self.answer_mask[:, 1:]  # remove answers (ignore the <stop> token)
                ml_loss *= self.padding_mask[:, 1:]  # remove padding (ignore the <start> token)

                # Count number of unmask elements
                count = tf.reduce_sum(self.padding_mask) - tf.reduce_sum(1 - self.answer_mask[:, :-1]) - 1  # no_unpad - no_qa - START token

                ml_loss = tf.reduce_sum(ml_loss, axis=1)  # reduce over dialogue dimension
                ml_loss = tf.reduce_sum(ml_loss, axis=0)  # reduce over minibatch dimension
                self.ml_loss = ml_loss / count  # Normalize

                self.loss = self.ml_loss

            # Compute policy gradient
            if policy_gradient:

                with tf.variable_scope('rl_baseline'):
                    decoder_out = tf.stop_gradient(self.decoder_output)  # take the LSTM output (and stop the gradient!)

                    flat_decoder_output = tf.reshape(decoder_out, [-1, decoder_lstm_cell.output_size])  #
                    flat_h1 = utils.fully_connected(flat_decoder_output, n_out=100, activation='relu', scope='baseline_hidden')
                    flat_baseline = utils.fully_connected(flat_h1, 1, activation='relu', scope='baseline_out')

                    self.baseline = tf.reshape(flat_baseline, [tf.shape(self.seq_length)[0], max_sequence-1])
                    self.baseline *= self.answer_mask[:, 1:]
                    self.baseline *= self.padding_mask[:, 1:]


                with tf.variable_scope('policy_gradient_loss'):

                    # Compute log_prob
                    self.log_of_policy = tf.identity(self.cross_entropy_loss)
                    self.log_of_policy *= self.answer_mask[:, 1:]  # remove answers (<=> predicted answer has maximum reward) (ignore the START token in the mask)
                    # No need to use padding mask as the discounted_reward is already zero once the episode terminated

                    # Policy gradient loss
                    rewards *= self.answer_mask[:, 1:]
                    self.score_function = tf.multiply(self.log_of_policy, rewards - self.baseline)  # score function

                    self.baseline_loss = tf.reduce_sum(tf.square(rewards - self.baseline))

                    self.policy_gradient_loss = tf.reduce_sum(self.score_function, axis=1)  # sum over the dialogue trajectory
                    self.policy_gradient_loss = tf.reduce_mean(self.policy_gradient_loss, axis=0)  # reduce over minibatch dimension

                    self.loss = self.policy_gradient_loss
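
A minimal usage sketch for the graph above (not part of the original): the class name QGenNetwork, the config values and the dummy batch below are assumptions, chosen only to show how the placeholders feed the maximum-likelihood loss.

import numpy as np
import tensorflow as tf

# Hypothetical config; the keys mirror the ones read in __init__ above.
config = {
    "image": {"dim": [1000]},          # assumed 1D (fc8-like) image features
    "num_lstm_units": 512,
    "picture_embedding_size": 256,
    "word_embedding_size": 100,
}

qgen = QGenNetwork(config, num_words=5000, policy_gradient=False)  # hypothetical class name

batch, length = 4, 12
feed = {
    qgen.images: np.random.rand(batch, 1000),
    qgen.dialogues: np.random.randint(0, 5000, size=(batch, length)),
    qgen.answer_mask: np.ones((batch, length), dtype=np.float32),
    qgen.padding_mask: np.ones((batch, length), dtype=np.float32),
    qgen.seq_length: np.full(batch, length, dtype=np.int32),
}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(qgen.ml_loss, feed_dict=feed))
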
Example #2
def get_image_features(image, question, is_training, scope_name, config, dropout_keep=1., reuse=False, att=True):
	image_input_type = config["image_input"]

	# Extract features from 1D image features
	if image_input_type == "fc8" \
			or image_input_type == "fc7" \
			or image_input_type == "dummy":

		image_out = image
		if config.get('normalize', False):
			image_out = tf.nn.l2_normalize(image, dim=1, name="fc_normalization")

	elif image_input_type.startswith("conv") or image_input_type.startswith("raw"):

		# Extract feature from raw images
		if image_input_type.startswith("raw"):

			# Create CBN
			cbn = None

			if "cbn" in config and config["cbn"].get("use_cbn", False) and question is not None:
				cbn_factory = CBNfromLSTM(question, no_units=config['cbn']["cbn_embedding_size"])
				excluded_scopes = config["cbn"].get('excluded_scope_names', [])
				cbn = ConditionalBatchNorm(cbn_factory, excluded_scope_names=excluded_scopes,
										   is_training=is_training)

			# There is a bug with classic batch norm in slim networks
			# (https://github.com/tensorflow/tensorflow/issues/4887), so fall back to
			# CBN with every scope excluded: 'cbn': {'use_cbn': true, 'excluded_scope_names': ['*']}
			else:
				cbn_factory = CBNfromLSTM(question, no_units=config['cbn']["cbn_embedding_size"])
				excluded_scopes = ["*"]
				cbn = ConditionalBatchNorm(cbn_factory, excluded_scope_names=excluded_scopes, is_training=is_training)


			# Create ResNet
			resnet_version = config['resnet_version']
			image_feature_maps = create_resnet(image,
												 is_training=is_training,
												 scope=scope_name,
												 cbn=cbn,
												 resnet_version=resnet_version,
												 resnet_out=config.get('resnet_out', "block4"))

			if config.get('normalize', False):
				image_feature_maps = tf.nn.l2_normalize(image_feature_maps, dim=[1, 2, 3])

		# Extract feature from 3D-image features
		else:
			image_feature_maps = image

		# apply attention

		if att:
			image_out = get_attention(image_feature_maps, question,
								  config=config["attention"],
								  dropout_keep=dropout_keep,
								  reuse=reuse)
		else:
			image_out = image_feature_maps
	else:
		assert False, "Wrong input type for image"

	return image_out
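
A hedged call sketch for the fc-feature branch of the function above; the placeholder shape and config values are illustrative assumptions, not from the original.

import tensorflow as tf

image_fc = tf.placeholder(tf.float32, [None, 2048], name="image_fc")
config = {"image_input": "fc8", "normalize": True}

# question/attention are only used by the conv/raw branches, so they can be omitted here
image_out = get_image_features(image_fc, question=None,
                               is_training=tf.constant(False),
                               scope_name="resnet",
                               config=config,
                               att=False)
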
Example #3
    def __init__(self, config, num_words, num_answers, reuse=False, device=''):
        ResnetModel.__init__(self, "clevr", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse):
            batch_size = None
            self._is_training = tf.placeholder(tf.bool, name="is_training")

            dropout_keep_scalar = float(config["dropout_keep_prob"])
            dropout_keep = tf.cond(self._is_training,
                                   lambda: tf.constant(dropout_keep_scalar),
                                   lambda: tf.constant(1.0))

            #####################
            #   QUESTION
            #####################

            self._question = tf.placeholder(tf.int32, [batch_size, None], name='question')
            self._seq_length = tf.placeholder(tf.int32, [batch_size], name='seq_length')
            self._answer = tf.placeholder(tf.int64, [batch_size], name='answer')

            word_emb = tfc_layers.embed_sequence(
                ids=self._question,
                vocab_size=num_words,
                embed_dim=config["question"]["word_embedding_dim"],
                scope="word_embedding",
                reuse=reuse)

            if config["question"]['glove']:
                self._glove = tf.placeholder(tf.float32, [None, None, 300], name="glove")
                word_emb = tf.concat([word_emb, self._glove], axis=2)

            word_emb = tf.nn.dropout(word_emb, dropout_keep)

            _, last_rnn_state = rnn.rnn_factory(
                inputs=word_emb,
                seq_length=self._seq_length,
                cell=config["question"]["cell"],
                num_hidden=config["question"]["rnn_state_size"],
                bidirectional=config["question"]["bidirectional"],
                max_pool=config["question"]["max_pool"],
                layer_norm=config["question"]["layer_norm"],
                reuse=reuse)

            last_rnn_state = tf.nn.dropout(last_rnn_state, dropout_keep)

            #####################
            #   IMAGES
            #####################

            self._image = tf.placeholder(tf.float32, [batch_size] + config['image']["dim"], name='image')

            visual_features = get_image_features(image=self._image,
                                                 is_training=self._is_training,
                                                 config=config['image'])

            with tf.variable_scope("image_film_stack", reuse=reuse):
                film_stack = FiLM_Stack(image=visual_features,
                                        film_input=last_rnn_state,
                                        is_training=self._is_training,
                                        config=config["film_block"],
                                        reuse=reuse)

                visual_features = film_stack.get()

            # Pool Image Features
            with tf.variable_scope("image_pooling"):
                multimodal_features = get_attention(visual_features, last_rnn_state,
                                                    is_training=self._is_training,
                                                    config=config["pooling"],
                                                    dropout_keep=dropout_keep,
                                                    reuse=reuse)

            with tf.variable_scope("classifier"):
                self.hidden_state = tfc_layers.fully_connected(multimodal_features,
                                                               num_outputs=config["classifier"]["no_mlp_units"],
                                                               normalizer_fn=tfc_layers.batch_norm,
                                                               normalizer_params={"center": True, "scale": True,
                                                                                  "decay": 0.9,
                                                                                  "is_training": self._is_training,
                                                                                  "reuse": reuse},
                                                               activation_fn=tf.nn.relu,
                                                               reuse=reuse,
                                                               scope="classifier_hidden_layer")

                self.out = tfc_layers.fully_connected(self.hidden_state,
                                                      num_outputs=num_answers,
                                                      activation_fn=None,
                                                      reuse=reuse,
                                                      scope="classifier_softmax_layer")

            #####################
            #   Loss
            #####################

            self.cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.out, labels=self._answer, name='cross_entropy')
            self.loss = tf.reduce_mean(self.cross_entropy)

            self.softmax = tf.nn.softmax(self.out, name='answer_prob')
            self.prediction = tf.argmax(self.out, axis=1, name='predicted_answer')  # no need to compute the softmax

            with tf.variable_scope('accuracy'):
                self.accuracy = tf.equal(self.prediction, self._answer)
                self.accuracy = tf.reduce_mean(tf.cast(self.accuracy, tf.float32))

            tf.summary.scalar('accuracy', self.accuracy)

            print('Model... built!')
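
The tf.cond gate above is how these models switch dropout off at test time; a standalone sketch of the same pattern (the keep probability is an assumed value).

import tensorflow as tf

is_training = tf.placeholder(tf.bool, name="is_training")
dropout_keep = tf.cond(is_training,
                       lambda: tf.constant(0.8),   # keep prob while training (assumed)
                       lambda: tf.constant(1.0))   # no dropout at evaluation time

x = tf.ones([2, 4])
y = tf.nn.dropout(x, dropout_keep)

with tf.Session() as sess:
    print(sess.run(y, feed_dict={is_training: False}))  # returns x unchanged
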
Example #4
    def __init__(self, config, num_words, device='', reuse=False):
        AbstractNetwork.__init__(self, "guesser", device=device)

        batch_size = None

        with tf.variable_scope(self.scope_name, reuse=reuse):

            self._is_training = tf.placeholder(tf.bool, name="is_training")

            dropout_keep_scalar = float(config["dropout_keep_prob"])
            dropout_keep = tf.cond(self._is_training,
                                   lambda: tf.constant(dropout_keep_scalar),
                                   lambda: tf.constant(1.0))

            #####################
            #   DIALOGUE
            #####################

            self._dialogue = tf.placeholder(tf.int32, [batch_size, None],
                                            name='dialogue')
            self._seq_length = tf.placeholder(tf.int32, [batch_size],
                                              name='seq_length_dialogue')

            word_emb = tfc_layers.embed_sequence(
                ids=self._dialogue,
                vocab_size=num_words,
                embed_dim=config["question"]["word_embedding_dim"],
                scope="word_embedding",
                reuse=reuse)

            if config["question"]['glove']:
                self._glove = tf.placeholder(tf.float32, [None, None, 300],
                                             name="glove")
                word_emb = tf.concat([word_emb, self._glove], axis=2)

            _, self.dialogue_embedding = rnn.rnn_factory(
                inputs=word_emb,
                seq_length=self._seq_length,
                cell=config['question']["cell"],
                num_hidden=config['question']["rnn_units"],
                bidirectional=config["question"]["bidirectional"],
                max_pool=config["question"]["max_pool"],
                layer_norm=config["question"]["layer_norm"],
                reuse=reuse)

            #####################
            #   IMAGE
            #####################

            self.img_embedding = None
            if config['inputs']['image']:

                self._image = tf.placeholder(tf.float32, [batch_size] +
                                             config['image']["dim"],
                                             name='image')

                # get image
                self.img_embedding = get_image_features(
                    image=self._image,
                    is_training=self._is_training,
                    config=config['image'])

                # pool image feature if needed
                if len(self.img_embedding.get_shape()) > 2:
                    with tf.variable_scope("image_pooling"):
                        self.img_embedding = get_attention(
                            self.img_embedding,
                            self.dialogue_embedding,
                            is_training=self._is_training,
                            config=config["pooling"],
                            dropout_keep=dropout_keep,
                            reuse=reuse)

                # fuse vision/language
                self.visdiag_embedding = get_fusion_mechanism(
                    input1=self.dialogue_embedding,
                    input2=self.img_embedding,
                    config=config["fusion"],
                    dropout_keep=dropout_keep)
            else:
                self.visdiag_embedding = self.dialogue_embedding

            visdiag_dim = int(self.visdiag_embedding.get_shape()[-1])

            #####################
            #   OBJECTS
            #####################

            self._num_object = tf.placeholder(tf.int32, [batch_size],
                                              name='obj_seq_length')
            self._obj_cats = tf.placeholder(tf.int32, [batch_size, None],
                                            name='obj_cat')
            self._obj_spats = tf.placeholder(tf.float32, [batch_size, None, 8],
                                             name='obj_spat')

            cats_emb = tfc_layers.embed_sequence(
                ids=self._obj_cats,
                vocab_size=config['category']["n_categories"] +
                1,  # we add the unknown category
                embed_dim=config['category']["embedding_dim"],
                scope="cat_embedding",
                reuse=reuse)
            '''
            spatial_emb = tfc_layers.fully_connected(self._obj_spats,
                                                     num_outputs=config["spatial"]["no_mlp_units"],
                                                     activation_fn=tf.nn.relu,
                                                     reuse=reuse,
                                                     scope="spatial_upsampling")
            '''
            spatial_emb = self._obj_spats

            self.objects_input = tf.concat([cats_emb, spatial_emb], axis=2)
            # self.objects_input = tf.nn.dropout(self.objects_input, dropout_keep)

            with tf.variable_scope('obj_mlp'):
                h1 = tfc_layers.fully_connected(
                    self.objects_input,
                    num_outputs=config["object"]['no_mlp_units'],
                    activation_fn=tf.nn.relu,
                    scope='l1')
                # h1 = tf.nn.dropout(h1, dropout_keep)

                obj_embeddings = tfc_layers.fully_connected(
                    h1,
                    num_outputs=visdiag_dim,
                    activation_fn=tf.nn.relu,
                    scope='l2')

            #####################
            #   SCORES
            #####################

            self.scores = obj_embeddings * tf.expand_dims(
                self.visdiag_embedding, axis=1)
            self.scores = tf.reduce_sum(self.scores, axis=2)

            # remove max for stability (trick)
            self.scores -= tf.reduce_max(self.scores, axis=1, keep_dims=True)

            with tf.variable_scope('object_mask', reuse=reuse):

                object_mask = tf.sequence_mask(self._num_object)
                score_mask_values = float("-inf") * tf.ones_like(self.scores)

                self.score_masked = tf.where(object_mask, self.scores,
                                             score_mask_values)

            #####################
            #   LOSS
            #####################

            # Targets
            self._targets = tf.placeholder(tf.int32, [batch_size],
                                           name="target_index")

            self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self._targets, logits=self.score_masked)
            self.loss = tf.reduce_mean(self.loss)

            self.selected_object = tf.argmax(self.score_masked, axis=1)
            self.softmax = tf.nn.softmax(self.score_masked)

            with tf.variable_scope('accuracy'):
                self.accuracy = tf.equal(self.selected_object,
                                         tf.cast(self._targets, tf.int64))
                self.accuracy = tf.reduce_mean(
                    tf.cast(self.accuracy, tf.float32))
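
The object_mask block above pushes the scores of padded object slots to -inf so the softmax assigns them zero probability; a standalone sketch of the same trick with made-up values.

import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 0.5, 0.1]])   # raw scores for 4 object slots
num_object = tf.constant([3])                  # only the first 3 slots hold real objects

object_mask = tf.sequence_mask(num_object, maxlen=4)
score_masked = tf.where(object_mask, scores, float("-inf") * tf.ones_like(scores))

with tf.Session() as sess:
    print(sess.run(tf.nn.softmax(score_masked)))  # the padded slot gets probability 0
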
Example #5
    def __init__(self, config, no_words, no_answers, reuse=False, device=''):
        ResnetModel.__init__(self, "oracle", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse):

            self.batch_size = None
            self._is_training = tf.placeholder(tf.bool, name="is_training")

            dropout_keep_scalar = float(config["dropout_keep_prob"])
            dropout_keep = tf.cond(self._is_training,
                                   lambda: tf.constant(dropout_keep_scalar),
                                   lambda: tf.constant(1.0))

            #####################
            #   QUESTION
            #####################

            self._question = tf.placeholder(tf.int32, [self.batch_size, None],
                                            name='question')
            self._seq_length = tf.placeholder(tf.int32, [self.batch_size],
                                              name='seq_length')
            self._answer = tf.placeholder(tf.float32,
                                          [self.batch_size, no_answers],
                                          name='answer')

            word_emb = tfc_layers.embed_sequence(
                ids=self._question,
                vocab_size=no_words,
                embed_dim=config["question"]["word_embedding_dim"],
                scope="word_embedding",
                reuse=reuse)

            if config["question"]['glove']:
                self._glove = tf.placeholder(tf.float32, [None, None, 300],
                                             name="glove")
                word_emb = tf.concat([word_emb, self._glove], axis=2)

            word_emb = tf.nn.dropout(word_emb, dropout_keep)
            self.rnn_states, self.last_rnn_states = rnn.rnn_factory(
                inputs=word_emb,
                seq_length=self._seq_length,
                cell=config["question"]["cell"],
                num_hidden=config["question"]["rnn_state_size"],
                bidirectional=config["question"]["bidirectional"],
                max_pool=config["question"]["max_pool"],
                layer_norm=config["question"]["layer_norm"],
                reuse=reuse)

            #####################
            #   SIDE INPUTS
            #####################

            # Category
            if any(get_recursively(config, "category",
                                   no_field_recursive=True)):
                self._category = tf.placeholder(tf.int32, [self.batch_size],
                                                name='category')

                cat_emb = tfc_layers.embed_sequence(
                    ids=self._category,
                    vocab_size=config['category']["n_categories"] + 1,
                    embed_dim=config['category']["embedding_dim"],
                    scope="category_embedding",
                    reuse=reuse)
                cat_emb = tf.nn.dropout(cat_emb, dropout_keep)
            else:
                cat_emb = None

            # Spatial
            if any(get_recursively(config, "spatial",
                                   no_field_recursive=True)):
                self._spatial = tf.placeholder(tf.float32,
                                               [self.batch_size, 8],
                                               name='spatial')
                spatial_emb = tfc_layers.fully_connected(
                    self._spatial,
                    num_outputs=config["spatial"]["no_mlp_units"],
                    activation_fn=tf.nn.relu,
                    reuse=reuse,
                    scope="spatial_upsampling")
                spatial_emb = tf.nn.dropout(spatial_emb, dropout_keep)
            else:
                spatial_emb = None

            self.classifier_input = []

            #####################
            #   IMAGES / CROP
            #####################

            for visual_str in ["image", "crop"]:

                # Check whether to use the visual input
                if config["inputs"][visual_str]:

                    # Load Image Features
                    visual_features = tf.placeholder(tf.float32,
                                                     shape=[self.batch_size] +
                                                     config[visual_str]["dim"],
                                                     name=visual_str)
                    with tf.variable_scope(visual_str, reuse=reuse):
                        visual_features = get_image_features(
                            image=visual_features,
                            config=config[visual_str],
                            is_training=self._is_training)

                    # Modulate Image Features
                    if "film_input" in config:

                        # Retrieve configuration
                        film_config = config["film_input"]
                        block_config = config["film_block"]

                        # Load object mask
                        mask = tf.placeholder(
                            tf.float32,
                            visual_features.get_shape()[:3],
                            name='{}_mask'.format(visual_str))
                        mask = tf.expand_dims(mask, axis=-1)

                        # Perform the actual modulation
                        with tf.variable_scope(
                                "{}_modulation".format(visual_str)):

                            extra_context = []
                            with tf.variable_scope(
                                    "{}_film_input".format(visual_str),
                                    reuse=reuse):

                                if film_config["category"]:
                                    extra_context.append(cat_emb)

                                if film_config["spatial"]:
                                    extra_context.append(spatial_emb)

                                if film_config["mask"]:
                                    mask_dim = int(
                                        visual_features.get_shape()[1]) * int(
                                            visual_features.get_shape()[2])
                                    flat_mask = tf.reshape(
                                        mask, shape=[-1, mask_dim])
                                    extra_context.append(flat_mask)

                            with tf.variable_scope(
                                    "{}_reading_cell".format(visual_str)):

                                reading_unit = create_reading_unit(
                                    last_state=self.last_rnn_states,
                                    states=self.rnn_states,
                                    seq_length=self._seq_length,
                                    config=film_config["reading_unit"],
                                    reuse=reuse)

                                film_layer_fct = create_film_layer_with_reading_unit(
                                    reading_unit)

                            with tf.variable_scope(
                                    "{}_film_stack".format(visual_str),
                                    reuse=reuse):

                                def append_extra_features(features, config):
                                    if config[
                                            "spatial_location"]:  # add the pixel location as two additional feature map
                                        features = ft_utils.append_spatial_location(
                                            features)
                                    if config[
                                            "mask"]:  # add the mask on the object as one additional feature map
                                        features = tf.concat([features, mask],
                                                             axis=3)
                                    return features

                                film_stack = FiLM_Stack(
                                    image=visual_features,
                                    film_input=extra_context,
                                    film_layer_fct=film_layer_fct,
                                    is_training=self._is_training,
                                    config=block_config,
                                    append_extra_features=append_extra_features,
                                    reuse=reuse)

                                visual_features = film_stack.get()

                    # Pool Image Features
                    if len(visual_features.get_shape()) > 2:
                        with tf.variable_scope(
                                "{}_pooling".format(visual_str)):
                            visual_features = get_attention(
                                visual_features,
                                self.last_rnn_states,
                                is_training=self._is_training,
                                config=config["pooling"],
                                dropout_keep=dropout_keep,
                                reuse=reuse)

                    self.classifier_input.append(visual_features)

            #####################
            #   FINAL LAYER
            #####################

            with tf.variable_scope("classifier", reuse=reuse):

                if config["classifier"]["inputs"]["question"]:
                    self.classifier_input.append(self.last_rnn_states)

                if config["classifier"]["inputs"]["category"]:
                    self.classifier_input.append(cat_emb)

                if config["classifier"]["inputs"]["spatial"]:
                    self.classifier_input.append(spatial_emb)

                assert len(
                    self.classifier_input
                ) > 0, "Please provide some inputs for the classifier!!!"
                self.classifier_input = tf.concat(self.classifier_input,
                                                  axis=1)

                self.hidden_state = tfc_layers.fully_connected(
                    self.classifier_input,
                    num_outputs=config["classifier"]["no_mlp_units"],
                    activation_fn=tf.nn.relu,
                    reuse=reuse,
                    scope="classifier_hidden_layer")

                self.hidden_state = tf.nn.dropout(self.hidden_state,
                                                  dropout_keep)
                self.out = tfc_layers.fully_connected(
                    self.hidden_state,
                    num_outputs=no_answers,
                    activation_fn=None,
                    reuse=reuse,
                    scope="classifier_softmax_layer")

            #####################
            #   Loss
            #####################

            self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=self.out, labels=self._answer, name='cross_entropy')
            self.loss = tf.reduce_mean(self.cross_entropy)

            self.softmax = tf.nn.softmax(self.out, name='answer_prob')
            self.prediction = tf.argmax(
                self.out, axis=1,
                name='predicted_answer')  # no need to compute the softmax

            self.success = tf.equal(
                self.prediction,
                tf.argmax(self._answer,
                          axis=1))  # no need to compute the softmax

            with tf.variable_scope('accuracy'):
                self.accuracy = tf.equal(self.prediction,
                                         tf.argmax(self._answer, axis=1))
                self.accuracy = tf.reduce_mean(
                    tf.cast(self.accuracy, tf.float32))

            tf.summary.scalar('accuracy', self.accuracy)

            print('Model... built!')
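
The oracle's answer placeholder above is a dense (one-hot style) target for softmax_cross_entropy_with_logits_v2; a small sketch, with assumed labels, of how such a target could be built from integer answers.

import numpy as np

no_answers = 3                               # e.g. yes / no / n-a (assumed)
labels = np.array([0, 2, 1])                 # integer answers for a batch of 3
one_hot = np.eye(no_answers, dtype=np.float32)[labels]
# one_hot -> [[1,0,0], [0,0,1], [0,1,0]], suitable for feeding self._answer
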
Example #6
    def __init__(self, config, num_words, num_answers, device='', reuse=False):
        ResnetModel.__init__(self, "clevr", device=device)

        with tf.variable_scope(self.scope_name, reuse=reuse):

            batch_size = None
            self._is_training = tf.placeholder(tf.bool, name="is_training")

            dropout_keep_scalar = float(config["dropout_keep_prob"])
            dropout_keep = tf.cond(self._is_training,
                                   lambda: tf.constant(dropout_keep_scalar),
                                   lambda: tf.constant(1.0))

            #####################
            #   QUESTION
            #####################

            self._question = tf.placeholder(tf.int32, [batch_size, None],
                                            name='question')
            self._seq_length = tf.placeholder(tf.int32, [batch_size],
                                              name='seq_length')
            self._answer = tf.placeholder(tf.float32, [batch_size, num_answers],
                                          name='answer')

            word_emb = tfc_layers.embed_sequence(
                ids=self._question,
                vocab_size=num_words,
                embed_dim=config["question"]["word_embedding_dim"],
                scope="word_embedding",
                reuse=reuse)

            if config["question"]['glove']:
                self._glove = tf.placeholder(tf.float32, [None, None, 300],
                                             name="glove")
                word_emb = tf.concat([word_emb, self._glove], axis=2)

            word_emb = tf.nn.dropout(word_emb, dropout_keep)
            _, last_rnn_state = rnn.rnn_factory(
                inputs=word_emb,
                seq_length=self._seq_length,
                cell=config["question"]["cell"],
                num_hidden=config["question"]["rnn_state_size"],
                bidirectional=config["question"]["bidirectional"],
                max_pool=config["question"]["max_pool"],
                layer_norm=config["question"]["layer_norm"],
                reuse=reuse)

            #####################
            #   IMAGES
            #####################

            self._image = tf.placeholder(tf.float32,
                                         [batch_size] + config['image']["dim"],
                                         name='image')

            cbn = None
            if "cbn" in config:
                cbn = get_cbn(config["cbn"], last_rnn_state, dropout_keep,
                              self._is_training)

            self.image_out = get_image_features(image=self._image,
                                                is_training=self._is_training,
                                                config=config['image'],
                                                cbn=cbn)

            if len(self.image_out.get_shape()) > 2:
                with tf.variable_scope("image_pooling"):
                    self.image_out = get_attention(
                        self.image_out,
                        last_rnn_state,
                        is_training=self._is_training,
                        config=config["pooling"],
                        dropout_keep=dropout_keep,
                        reuse=reuse)

            #####################
            #   FUSION
            #####################

            self.visdiag_embedding = get_fusion_mechanism(
                input1=self.image_out,
                input2=last_rnn_state,
                config=config["fusion"],
                dropout_keep=dropout_keep)

            #####################
            #   CLASSIFIER
            #####################

            with tf.variable_scope('mlp'):
                num_hiddens = config['classifier']['no_mlp_units']

                self.out = tfc_layers.fully_connected(self.visdiag_embedding,
                                                      num_hiddens,
                                                      activation_fn=tf.nn.relu)
                self.out = tfc_layers.fully_connected(self.out,
                                                      num_answers,
                                                      activation_fn=None)

                self.cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.out, labels=self._answer)
                self.loss = tf.reduce_mean(self.cross_entropy)

                self.softmax = tf.nn.softmax(self.out, name='answer_prob')
                self.prediction = tf.argmax(
                    self.out, axis=1,
                    name='predicted_answer')  # no need to compute the softmax

                self.success = tf.equal(
                    self.prediction,
                    tf.argmax(self._answer,
                              axis=1))  # no need to compute the softmax

            with tf.variable_scope('accuracy'):
                self.accuracy = tf.equal(self.prediction,
                                         tf.argmax(self._answer, axis=1))
                self.accuracy = tf.reduce_mean(tf.cast(self.accuracy, tf.float32))

            print('Model... CLEVR (baseline) built!')
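
The "# no need to compute the softmax" comments above rely on softmax being monotonic, so the argmax over logits equals the argmax over probabilities; a quick numeric check with illustrative values.

import numpy as np

logits = np.array([[1.0, 3.0, 2.0]])
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
assert (np.argmax(logits, axis=1) == np.argmax(probs, axis=1)).all()   # both select index 1
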
Example #7
def get_image_features(image,
                       question,
                       is_training,
                       scope_name,
                       scope_feature,
                       config,
                       dropout_keep=1.,
                       reuse=False,
                       co_attention=True):
    image_input_type = config["image_input"]

    # Extract features from 1D image features
    if image_input_type == "fc8" \
            or image_input_type == "fc7" \
            or image_input_type == "dummy":

        image_out = image
        if config.get('normalize', False):
            image_out = tf.nn.l2_normalize(image,
                                           dim=1,
                                           name="fc_normalization")

    elif image_input_type.startswith("conv") or image_input_type.startswith(
            "raw"):

        print("---------------------------------- Before IF")
        # Extract feature from raw images
        if image_input_type.startswith("raw"):

            # Create CBN
            cbn = None
            if config["cbn"].get("use_cbn", False):
                cbn_factory = CBNfromLSTM(
                    question, no_units=config['cbn']["cbn_embedding_size"])
                excluded_scopes = config["cbn"].get('excluded_scope_names', [])

                cbn = ConditionalBatchNorm(
                    cbn_factory,
                    excluded_scope_names=excluded_scopes,
                    is_training=is_training)

                print("Image = {} ".format(image))
                print("cbn_factory = {} ".format(cbn_factory))
                print("excluded_scopes = {} ".format(excluded_scopes))
                print("cbn = {} ".format(cbn))

                # exit()

            # print("---------------------------------- Before resnet_version")
            # Create ResNet
            resnet_version = config['resnet_version']

            image_feature_maps, _ = create_resnet(
                image,
                is_training=is_training,
                scope=scope_name,
                scope_feature=scope_feature,
                cbn=cbn,
                resnet_version=resnet_version,
                resnet_out=config.get('resnet_out', "block4"))

            print("-- image_feature_maps = {}".format(image_feature_maps))

            print("---------------------------------- After resnet_version")

            image_feature_maps = image_feature_maps
            if config.get('normalize', False):
                image_feature_maps = tf.nn.l2_normalize(image_feature_maps,
                                                        dim=[1, 2, 3])

        # Extract feature from 3D-image features
        else:
            image_feature_maps = image

        # apply attention
        image_out = image_feature_maps
        print("image_out 1= {}".format(image_out))

        # exit()

        # print("before im")
        if not co_attention:
            image_out = get_attention(image_feature_maps,
                                      question,
                                      config=config["attention"],
                                      dropout_keep=dropout_keep,
                                      reuse=reuse)
        # print("-------- image_out = ",image_out)
        # exit()

    else:
        assert False, "Wrong input type for image"

    print("---------------------------------- Finish image_out")

    return image_out