def FindSamePropertyModule(self, input_0, time_idx, batch_idx, map_dim=250,
        scope='FindSamePropertyModule', reuse=True):
        """Map an attention grid to a new attention grid using image and text.

        Mapping: att_grid x image_feat_grid x text_param -> att_grid

        Implementation:
          1. Extract visual features using the (softmax-normalized) input
             attention map, and linear transform to map_dim
          2. Linear transform language features to map_dim
          3. Convolve image features to map_dim
          4. Element-wise multiplication of the three, l2_normalize,
             linear transform.

        Args:
            input_0: input attention grid, [N, H, W, 1].
            time_idx: [N_batch, 1] tensor (TF Fold); used to slice the word
                vectors and to read the batch size N.
            batch_idx: [N_batch, 1] tensor (TF Fold); used to slice the image
                features and the word vectors.
            map_dim: dimension of the common space all three inputs are
                mapped to.
            scope: variable scope name, nested under
                `self.module_variable_scope`.
            reuse: `reuse` flag passed to the inner `tf.variable_scope`.

        Returns:
            att_grid: [N, H, W, 1]; its static shape is set to
            `self.att_shape`.
        """
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # image_feat_grid: [N, H, W, D_im], text_param: [N, D_txt]
        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)

        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                # N, H and W are dynamic (graph-time) sizes
                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]

                # image_feat_mapped has shape [N, H, W, map_dim]
                image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                              output_dim=map_dim)

                # reshaped to [N, 1, 1, map_dim] so that it broadcasts over
                # the spatial dimensions
                text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

                # softmax over all H*W locations of the input attention
                att_softmax = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H*W]))),
                    to_T([N, H, W, 1]))
                # att_feat has shape [N, D_im]: attention-weighted sum of the
                # image features over the spatial dimensions
                att_feat = tf.reduce_sum(image_feat_grid * att_softmax, axis=[1, 2])
                att_feat_mapped = tf.reshape(
                    fc('fc_att', att_feat, output_dim=map_dim), to_T([N, 1, 1, map_dim]))

                # l2-normalize along the channel axis (axis 3)
                eltwise_mult = tf.nn.l2_normalize(
                    image_feat_mapped * text_param_mapped * att_feat_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Exemple #2
0
    def FindModule(self,
                   time_idx,
                   batch_idx,
                   map_dim=1024,
                   scope='FindModule',
                   reuse=True):
        """Produce an attention grid from image features and a text parameter.

        Mapping: image_feat_grid x text_param -> att_grid

        Implementation:
          1. Elementwise multiplication between image_feat_grid and
             text_param (both first mapped to map_dim)
          2. L2-normalization
          3. Linear classification

        Args:
            time_idx: [N_batch, 1] tensor (TF Fold); used to slice the word
                vectors and to read the batch size N.
            batch_idx: [N_batch, 1] tensor (TF Fold); used to slice the image
                features and the word vectors.
            map_dim: dimension of the common space both inputs are mapped to.
            scope: variable scope name, nested under
                `self.module_variable_scope`.
            reuse: `reuse` flag passed to the inner `tf.variable_scope`.

        Returns:
            att_grid: [N, H, W, 1]; its static shape is set to
            `self.att_shape`.
        """
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # image_feat_grid: [N, H, W, D_im], text_param: [N, D_txt]
        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)

        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                # dynamic batch size
                N = tf.shape(time_idx)[0]

                # image_feat_mapped has shape [N, H, W, map_dim]
                image_feat_mapped = _1x1_conv('conv_image',
                                              image_feat_grid,
                                              output_dim=map_dim)

                # reshaped to [N, 1, 1, map_dim] so that it broadcasts over
                # the spatial dimensions
                text_param_mapped = fc('fc_text',
                                       text_param,
                                       output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped,
                                               to_T([N, 1, 1, map_dim]))

                # l2-normalize along the channel axis (axis 3)
                eltwise_mult = tf.nn.l2_normalize(
                    image_feat_mapped * text_param_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise',
                                     eltwise_mult,
                                     output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Exemple #3
0
    def instantiate_batch(self, inputs):
        """Build this module's attention computation for a batch.

        Args:
            inputs: a sequence of exactly three tensors, unpacked as
                (vis_att, img_feat, text_att):
                  vis_att: output from the previous modules; multiplied
                      with img_feat before the spatial sum
                  img_feat: image feature for the example; N is read from
                      its dynamic dim 0, H and W from its static dims 1, 2
                  text_att: text attention for all modules for the example

        Returns:
            A single-element list holding att_grid of shape [N, H, W, 1],
            softmax-normalized over the H * W locations.
        """
        vis_att, img_feat, text_att = inputs

        # intermediate mapping dimension
        map_dim = self._params['map_dim']
        # batch size (dynamic), image feature height and width (static)
        N = tf.shape(img_feat)[0]
        H, W = img_feat.shape.as_list()[1:3]

        with tf.variable_scope(self._module_scope):
            with tf.variable_scope(self._scope, reuse=self._reuse):
                # image_feat_mapped has shape [N, H, W, map_dim]
                img_map = _1x1_conv('conv_image', img_feat, output_dim=map_dim)
                # nonlinearity
                img_map = tf.nn.relu(img_map)

                # reshaped to [N, 1, 1, map_dim] so that it broadcasts over
                # the spatial dimensions
                text_map = fc('fc_text', text_att, output_dim=map_dim)
                text_map = tf.reshape(text_map, [-1, 1, 1, map_dim])
                # nonlinearity
                text_map = tf.nn.relu(text_map)

                # attention-weighted sum of image features over H and W
                att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
                att_map = tf.reshape(
                    fc('fc_att', att_feats, output_dim=map_dim),
                    [N, 1, 1, map_dim])

                # interact via element wise map, l2-normalized along the
                # channel axis (axis 3)
                eltwise_mult = tf.nn.l2_normalize(img_map * text_map * att_map,
                                                  3)
                att_grid = _1x1_conv('conv_eltwise',
                                     eltwise_mult,
                                     output_dim=1)

                # softmax over all H * W locations, then back to a grid
                att_grid_soft = tf.nn.softmax(tf.reshape(
                    att_grid, [-1, H * W]))
                att_grid = tf.reshape(att_grid_soft, [-1, H, W, 1])

        return [att_grid]
    def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=5,
        map_dim=250, scope='TransformModule', reuse=True):
        """Transform an attention grid, modulated by a text parameter.

        Mapping: att_grid x text_param -> att_grid

        Implementation:
          Convolutional layer that also involves text_param:
          a 'soft' convolutional kernel that is modulated by text_param.

        Args:
            input_0: input attention grid, [N, H, W, 1].
            time_idx: [N_batch, 1] tensor (TF Fold); used to slice the word
                vectors.
            batch_idx: [N_batch, 1] tensor (TF Fold); used to slice the word
                vectors.
            kernel_size: kernel size of the convolution applied to input_0.
            map_dim: number of output channels of that convolution and
                dimension the text parameter is mapped to.
            scope: variable scope name, nested under
                `self.module_variable_scope`.
            reuse: `reuse` flag passed to the inner `tf.variable_scope`.

        Returns:
            att_grid: [N, H, W, 1]; its static shape is set to
            `self.att_shape`.
        """
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # text_param: [N, D_txt]
        text_param = self._slice_word_vecs(time_idx, batch_idx)

        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                # dynamic batch size, read from the input attention grid
                N = tf.shape(input_0)[0]
                att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,
                    stride=1, output_dim=map_dim)

                # reshaped to [N, 1, 1, map_dim] so that it broadcasts over
                # the spatial dimensions
                text_param_mapped = fc('text_fc', text_param, output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

                # l2-normalize along the channel axis (axis 3)
                eltwise_mult = tf.nn.l2_normalize(att_maps * text_param_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Exemple #5
0
    def CountModule(self,
                    input_0,
                    time_idx,
                    batch_idx,
                    map_dim=1024,
                    scope='CountModule',
                    reuse=True):
        """Map an attention grid to answer scores by fusing two paths.

        Mapping: att_grid -> answer scores

        Implementation:
          Two paths whose scores are summed.
          1. Same as Describe: attention-weighted image features combined
             with the text parameter (and the encoder states, when
             `self._slice_encoder_states` returns something other than
             None).
          2. Counting features: the raw attention values with their min and
             max (text agnostic), concatenated with the same statistics of
             a text-modulated attention grid (text aware, similar to Find).

        Args:
            input_0: input attention grid, [N, H, W, 1].
            time_idx: [N_batch, 1] tensor (TF Fold); used to slice the word
                vectors and to read the batch size N.
            batch_idx: [N_batch, 1] tensor (TF Fold); used to slice the image
                features, word vectors and encoder states.
            map_dim: dimension of the intermediate mappings.
            scope: variable scope name, nested under
                `self.module_variable_scope`.
            reuse: `reuse` flag passed to the inner `tf.variable_scope`.

        Returns:
            scores: [N, self.num_choices]
        """
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        encoder_states = self._slice_encoder_states(batch_idx)

        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                # The first path, same as Describe
                N = tf.shape(time_idx)[0]
                # H and W come from the attention shape stored on self
                H, W = self.att_shape[1:3]

                text_param_mapped_0 = fc('fc_text_0',
                                         text_param,
                                         output_dim=map_dim)

                # softmax over all H * W locations of the input attention
                att_softmax_0 = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                    to_T([N, H, W, 1]))

                # att_feat_0 has shape [N, D_vis]: attention-weighted sum of
                # the image features over the spatial dimensions
                att_feat_0 = tf.reduce_sum(image_feat_grid * att_softmax_0,
                                           axis=[1, 2])
                att_feat_mapped_0 = tf.reshape(
                    fc('fc_att_0', att_feat_0, output_dim=map_dim),
                    to_T([N, map_dim]))

                if encoder_states is not None:
                    # Add in encoder states in the elementwise multiplication
                    encoder_states_mapped = fc('fc_encoder_states',
                                               encoder_states,
                                               output_dim=map_dim)
                    eltwise_mult_0 = tf.nn.l2_normalize(
                        text_param_mapped_0 * att_feat_mapped_0 *
                        encoder_states_mapped, 1)
                else:
                    eltwise_mult_0 = tf.nn.l2_normalize(
                        text_param_mapped_0 * att_feat_mapped_0, 1)
                scores_0 = fc('fc_eltwise_0',
                              eltwise_mult_0,
                              output_dim=self.num_choices)

                # the second path
                # text agnostic counting, same as Count in CLEVR v0 modules
                att_all_1 = tf.reshape(input_0, to_T([-1, H * W]))
                att_min_1 = tf.reduce_min(input_0, axis=[1, 2])
                att_max_1 = tf.reduce_max(input_0, axis=[1, 2])
                # text aware counting, similar to Find
                att_mapped_2 = _conv('conv_att_2',
                                     input_0,
                                     kernel_size=3,
                                     stride=1,
                                     output_dim=map_dim)
                text_param_mapped_2 = fc('fc_text_2',
                                         text_param,
                                         output_dim=map_dim)
                # reshaped to [N, 1, 1, map_dim] so that it broadcasts over
                # the spatial dimensions
                text_param_mapped_2 = tf.reshape(text_param_mapped_2,
                                                 to_T([N, 1, 1, map_dim]))

                # l2-normalize along the channel axis (axis 3)
                eltwise_mult_2 = tf.nn.l2_normalize(
                    att_mapped_2 * text_param_mapped_2, 3)
                att_grid_2 = _1x1_conv('conv_eltwise_2',
                                       eltwise_mult_2,
                                       output_dim=1)
                # give the new grid the same static shape as the input grid
                att_grid_2.set_shape(input_0.get_shape())
                att_all_2 = tf.reshape(att_grid_2, to_T([-1, H * W]))
                att_min_2 = tf.reduce_min(att_grid_2, axis=[1, 2])
                att_max_2 = tf.reduce_max(att_grid_2, axis=[1, 2])
                # all values plus min/max of both grids, joined per example
                att_concat_2 = tf.concat(
                    [
                        att_all_1, att_min_1, att_max_1,
                        att_all_2, att_min_2, att_max_2,
                    ],
                    axis=1)
                scores_2 = fc('fc_scores_2',
                              att_concat_2,
                              output_dim=self.num_choices)

                # Fuse the score from both paths
                scores = scores_0 + scores_2

        return scores