from collections import OrderedDict
import cPickle as pkl
import crc_input_data_seq

from util import log, override
from models.base import ModelBase, BaseModelConfig
from models.saliency_shallownet import SaliencyModel

from models.model_util import tf_normalize_map, normalize_probability_map
from models.model_util import tf_softmax_2d, tf_softmax_cross_entropy_with_logits_2d
from evaluation_metrics import saliency_score, AVAILABLE_METRICS

from easydict import EasyDict as E

# Fixed spatial sizes (in pixels / cells) shared by the model builders below.
CONSTANTS = E(
    image_width=98,
    image_height=98,
    gazemap_width=7,
    gazemap_height=7,
    saliencymap_width=49,
    saliencymap_height=49,
)


# config : to be changed into parameters later
class GRUModelConfig(BaseModelConfig):
    """Configuration for the GRU gaze model; overrides on top of BaseModelConfig."""

    def __init__(self):
        # Let the base class populate its own defaults first.
        super(GRUModelConfig, self).__init__()

        # Presumably the number of frames per sequence the RNN is unrolled
        # over (the T dimension of the network inputs) -- TODO confirm.
        self.n_lstm_steps = 35
        self.batch_size = 7  # XXX XXX XXX XXX
Beispiel #2
0
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      gt_gazemap,
                                      dropout_keep_prob,
                                      net=None):
        '''
        Build the RGP (Recurrent Gaze Prediction) graph together with its
        L2 supervision loss and debugging image summaries.

        Pipeline: per-frame ShallowNet saliency -> linear projection of the
        C3D feature maps -> bottom GRU-RCN -> transposed-convolution
        upsampling -> top GRU-RCN -> two-layer maxout head emitting one
        gaze map per frame.

        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images)
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            gt_gazemap : a [B x T x GH x GW] tensor of ground truth per-frame gaze maps
            dropout_keep_prob : keep probability passed to tf.nn.dropout after
                the first head layer; pass None to build the graph without
                dropout.
            (optional) net : a dictionary to get intra-layer activations or tensors.
                It is filled in place with the keys 'frame_images', 'frm_sal',
                'frm_sal_cubic', 'c3d_input', 'c3d_input_reshape',
                'c3d_embedded', 'gt_gazemap', 'rcn_outputs',
                'rcn_upsampled_outputs', 'rcn_gaze_outputs' and
                'predicted_gazemaps'.

        Outputs:
            [predicted_gazemaps, loss, image_summary] where

            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame
            loss: a scalar (float) tensor of RNN supervision loss
                (sum of per-frame L2 losses divided by B * T).
            image_summary: merged summary op over the 'IMAGE_SUMMARIES'
                collection.

        Notes:
            * B and T must be statically known (they are read from the static
              shape of frame_images).
            * The maxout head emits 4802 / 2 = 2401 values per frame which are
              reshaped to [B, GH, GW], so the graph can only be built when
              GH * GW == 2401 (i.e. 49 x 49 gaze maps).
            * Scalar summaries 'loss/train' and 'loss/val' are registered as a
              side effect.
        '''

        if net is None:
            net = {}
        else:
            assert isinstance(net, dict)

        vars = E()

        # (0) input sanity check
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])
        gt_gazemap.get_shape().assert_is_compatible_with([B, T, GH, GW])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # FIXME: not a proper name -- this is the number of feature channels
        # of the bottom RCN state (a filter count), not a flat state size.
        rnn_state_size = 256

        # The RGP (Recurrent Gaze Prediction) model.

        # (1) Input frame saliency
        # ------------------------

        # Input.
        net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

        # All frames of all sequences are pushed through ShallowNet at once.
        net['frm_sal'] = SaliencyModel.create_shallownet(
            tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
            scope='ShallowNet',
            dropout=False)  # [(B*T) x GH x GW]
        net['frm_sal'] = tf.reshape(net['frm_sal'],
                                    [B, T, GH, GW])  # [B x T x GH x GW]

        # [B x T x GH x GW] --> [B x T x GH x GW x 1]
        net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
                                          name='frame_saliency_cubic')

        # (2) C3D
        # -------
        # a. reduce channels [7 x 7 x 1024] -> [7 x 7 x dim_cnn_proj] via FC
        # b. apply RCN, and get [7 x 7 x rnn_state_size] outputs per step

        # c3d input.
        net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
        # move channels last: [B x T x 7 x 7 x 1024]
        net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                perm=[0, 1, 3, 4, 2],
                                                name='c3d_input_reshape')
        log.info('c3d_input_reshape shape : %s',
                 net['c3d_input_reshape'].get_shape().as_list())
        net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, 1024])

        # c3d_embedded: project every 1024-d feature vector (one per cell of
        # the 7x7 C3D map) down to dim_cnn_proj dimensions.
        vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj],
                                                        -0.1, 0.1),
                                      name="proj_c3d_W")
        vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1,
                                                        0.1),
                                      name="proj_c3d_b")

        # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]
        net['c3d_embedded'] = tf.nn.xw_plus_b(
            tf.reshape(net['c3d_input_reshape'],
                       [-1, 1024]), vars.proj_c3d_W, vars.proj_c3d_b)

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                         [B, T, 7, 7, dim_cnn_proj])
        log.info('c3d_embedded shape : %s',
                 net['c3d_embedded'].get_shape().as_list())
        net['c3d_embedded'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, dim_cnn_proj])

        # The RNN Part.
        # -------------

        # Batch size x (gaze map size), per frame
        net['gt_gazemap'] = gt_gazemap  # [B x T x GH x GW]
        log.info('gt_gazemap shape : %s',
                 net['gt_gazemap'].get_shape().as_list())

        with tf.variable_scope('RCNBottom'):
            vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

            state_u = vars.lstm_u.zero_state(B, tf.float32)
            log.info('RNN state shape : %s', state_u.get_shape().as_list())

            # Unroll over the T time steps (e.g. 35); weights are shared
            # across steps by switching the scope to reuse after step 0.
            net['rcn_outputs'] = rcn_outputs = []
            for i in range(T):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # The projected C3D map of step i is the only RNN input; the
                # single-element concat is kept so more inputs can be added.
                rnn_input = tf.concat(
                    concat_dim=3,  # channel axis of [B, 7, 7, C]
                    values=[
                        net['c3d_embedded'][:, i, :, :, :],
                    ],
                    name='rnn_input' + str(i))

                output_u, state_u = vars.lstm_u(rnn_input, state_u)

                # at time i: [B x 7 x 7 x rnn_state_size]
                output_u.get_shape().assert_is_compatible_with(
                    [B, 7, 7, rnn_state_size])
                rcn_outputs.append(output_u)

        # (3) RCN output unpooling to gaze-map size
        # each [7 x 7 x rnn_state_size] map is up-sampled (stride 7) to
        # [GH x GW x upsampling_output_channel]
        upsampling_filter_size = 11
        upsampling_output_channel = 64
        # conv2d_transpose filters are laid out as
        # [height, width, output_channels, input_channels].
        vars.upsampling_filter = tf.get_variable(
            'Upsampling/weight',
            [
                upsampling_filter_size, upsampling_filter_size,
                upsampling_output_channel, rnn_state_size
            ],
            initializer=initializers.xavier_initializer_conv2d(uniform=True))

        net['rcn_upsampled_outputs'] = rcn_upsampled_outputs = []
        for i in range(T):
            rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x rnn_state_size]

            rcn_upsampled_output = tf.nn.conv2d_transpose(
                rcn_output_map,
                vars.upsampling_filter,
                output_shape=[B, GH, GW, upsampling_output_channel],
                strides=[1, 7, 7, 1],
                padding='SAME',
                name='upsampled_rcn_output_' + str(i))
            rcn_upsampled_output.get_shape().assert_is_compatible_with(
                [B, GH, GW, upsampling_output_channel])
            rcn_upsampled_outputs.append(rcn_upsampled_output)

            if i == 0:
                log.info('RCN input map size : %s',
                         rcn_output_map.get_shape().as_list())
                log.info('RCN upsampled size : %s',
                         rcn_upsampled_output.get_shape().as_list())

        # (4) The upper layer of GRCN to emit gaze map
        # --------------------------------------------
        with tf.variable_scope('RCNGaze'):

            # NOTE(review): dim_feature is upsampling_output_channel + 1, but
            # the cell is fed the upsampled map alone (64 channels) because
            # the frame-saliency concat is disabled -- confirm this matches
            # what GRU_RCN_Cell expects.
            vars.lstm_g = GRU_RCN_Cell(
                num_units=3,
                dim_feature=upsampling_output_channel + 1,
                spatial_shape=[GH, GW],
                kernel_spatial_shape=[5, 5])

            state_g = vars.lstm_g.zero_state(B, tf.float32)

            # Raw outputs of the gaze-level RCN, kept apart from the bottom
            # layer's 'rcn_outputs' (the original code appended the bottom
            # list to itself here, creating circular references).
            net['rcn_gaze_outputs'] = rcn_gaze_outputs = []
            predicted_gazemaps = []
            for i in range(T):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # FIXME decoder should be spun off here.
                # Feeding the frame saliency and the previous (ground-truth)
                # gaze map alongside the upsampled RCN map is currently
                # disabled; only the upsampled RCN map is used.
                output_g, state_g = vars.lstm_g(rcn_upsampled_outputs[i],
                                                state_g)

                output_g.get_shape().assert_is_compatible_with([B, GH, GW, 3])
                rcn_gaze_outputs.append(output_g)
                output_g = tf.reshape(output_g, [B, -1])  # [B x (GH*GW*3)]

                # Two fully connected maxout layers turn the flattened RCN
                # output into one value per gaze-map cell.
                with tf.variable_scope('LastProjection'):
                    if i > 0:
                        tf.get_variable_scope().reuse_variables()

                    fc1 = fully_connected(
                        output_g,
                        4802,
                        activation_fn=None,  # relu is applied explicitly below
                        weight_init=initializers.xavier_initializer(
                            uniform=True),
                        bias_init=tf.constant_initializer(0.0),
                        weight_collections=['MODEL_VARS'],
                        bias_collections=['MODEL_VARS'],
                        name='fc1')
                    fc1 = tf.nn.relu(fc1)

                    if dropout_keep_prob is not None:
                        fc1 = tf.nn.dropout(fc1, dropout_keep_prob)

                    # maxout: split the 4802 units in two halves of 2401 and
                    # keep the element-wise maximum.
                    fc1_slice1, fc1_slice2 = tf.split(1,
                                                      2,
                                                      fc1,
                                                      name='fc1_slice')
                    max_out = tf.maximum(fc1_slice1,
                                         fc1_slice2,
                                         name='fc1_maxout')

                    fc2 = fully_connected(
                        max_out,
                        4802,
                        activation_fn=None,  # relu is applied explicitly below
                        weight_init=initializers.xavier_initializer(
                            uniform=True),
                        bias_init=tf.constant_initializer(0.0),
                        weight_collections=['MODEL_VARS'],
                        bias_collections=['MODEL_VARS'],
                        name='fc2')
                    fc2 = tf.nn.relu(fc2)

                    fc2_slice1, fc2_slice2 = tf.split(1,
                                                      2,
                                                      fc2,
                                                      name='fc2_slice')
                    max_out2 = tf.maximum(fc2_slice1,
                                          fc2_slice2,
                                          name='fc2_maxout')

                # [B x 2401] -> [B x GH x GW]
                predicted_gazemap = tf.reshape(max_out2, [B, GH, GW])
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap?

        # (5) Finally, calculate the loss
        loss = 0.0

        for i in range(T):
            predicted_gazemap = predicted_gazemaps[i]

            # Plain L2 loss per frame (cross entropy + softmax is an open
            # question).
            l2loss = tf.nn.l2_loss(predicted_gazemap -
                                   gt_gazemap[:, i, :, :])  # on [B x GH x GW]
            current_gaze_loss = tf.reduce_sum(l2loss)

            current_loss = current_gaze_loss
            loss += current_loss

        # loss: take average over batch and time
        loss = tf.div(loss, float(B * T), name='loss_avg')

        # FIXME may be duplicates?
        tf.scalar_summary('loss/train', loss)
        tf.scalar_summary('loss/val', loss, collections=['TEST_SUMMARIES'])

        # pack as a tensor
        # T-list of [B x GH x GW] --> [T x B x GH x GW] --> [B x T x GH x GW]
        net['predicted_gazemaps'] = tf.transpose(tf.pack(predicted_gazemaps),
                                                 [1, 0, 2, 3],
                                                 name='predicted_gazemaps')
        net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
            [B, T, GH, GW])

        # Debugging Informations
        # ----------------------

        # OPTIONAL: for debugging and visualization
        # XXX only the last time step is shown as of now
        # XXX rename saliency -> gaze (to avoid confusion)
        def _add_image_summary(tag, tensor):
            return tf.image_summary(tag,
                                    tensor,
                                    max_images=2,
                                    collections=['IMAGE_SUMMARIES'])

        # Explicit index of the last RNN step instead of relying on the loop
        # variable leaking out of the loss loop above.
        last_step = T - 1

        _input_image = frame_images[:, last_step, :, :, :]
        _saliency_output = tf.reshape(predicted_gazemaps[last_step],
                                      [-1, GH, GW, 1])
        _saliency_gt = tf.reshape(gt_gazemap[:, last_step, :, :],
                                  [-1, GH, GW, 1])
        _saliency_shallow = tf.reshape(net['frm_sal'][:, last_step, :, :],
                                       [-1, GH, GW, 1])

        _add_image_summary('inputimage', _input_image)
        _add_image_summary('saliency_maps_gt', _saliency_gt)
        _add_image_summary('saliency_maps_pred_original', _saliency_output)
        _add_image_summary('saliency_maps_pred_norm',
                           tf_normalize_map(_saliency_output))
        _add_image_summary('saliency_zshallownet', _saliency_shallow)

        image_summaries = tf.merge_summary(
            inputs=tf.get_collection('IMAGE_SUMMARIES'),
            collections=[],
            name='merged_image_summary',
        )

        return net['predicted_gazemaps'], loss, image_summaries
    def create_gazeprediction_network(frame_images, c3d_input,
                                      dropout_keep_prob = 1.0,
                                      net=None):
        '''
        Build the "C3D + vanilla GRU" gaze prediction graph (no loss).

        The C3D feature maps are linearly projected, flattened per time step
        and fed to a GRU cell; a linear layer maps each GRU output to a
        GH x GW gaze map. The ShallowNet frame saliency is computed and
        stored in `net`, but it is not fed to the RNN.

        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images)
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            dropout_keep_prob : keep probability for dropout on the projected
                C3D features. Dropout is only added to the graph when this
                value compares unequal to 1.0 at graph construction time.
            (optional) net : a dictionary that is filled in place with
                intermediate tensors ('frame_images', 'frm_sal',
                'frm_sal_cubic', 'c3d_input', 'c3d_input_reshape',
                'c3d_embedded', 'predicted_gazemaps').

        Outputs:
            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame
        '''


        if net is None: net = {}
        else: assert isinstance(net, dict)

        vars = E()

        # (0) input sanity check
        # B and T are read from the static shape, so both must be known at
        # graph construction time.
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])


        dim_cnn_proj = 32 # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # GRU hidden size: 7*7*dim_cnn_proj + 7*7 = 1617 with dim_cnn_proj = 32.
        rnn_state_size = 7 * 7 * dim_cnn_proj     # flatten with this dimension order
        rnn_state_size += 7 * 7 * 1          # C3D projected PLUS saliency map (49) # FIXME

        ''' C3D + Image Saliency + Vanila RNN '''

        # (1) Input frame saliency
        # ------------------------

        # Input.
        net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

        # All B*T frames go through ShallowNet in a single batch.
        net['frm_sal'] = SaliencyModel.create_shallownet(
            tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
            scope='ShallowNet',
            dropout=False
        ) # [-1, 49, 49]

        if (GH, GW) == (7, 7):
            log.warn('Downsampling 49x49 saliency to 7x7 ...')
            # downsampling 49,49 -> 7,7 with non-overlapping 7x7 average pooling
            net['frm_sal'] = tf.nn.avg_pool(
                tf.expand_dims(net['frm_sal'], 3), # [(B*T), 49, 49, 1]
                [1, 7, 7, 1], [1, 7, 7, 1],
                padding='VALID'
            )

        net['frm_sal'] = tf.reshape(net['frm_sal'], [B, T, GH, GW]) # [B x T x GH x GW]

        # [B x T x GH x GW] --> [B x T x GH x GW x 1]
        net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
                                          name='frame_saliency_cubic')


        # (2) C3D
        # -------
        # a. reduce channels [7 x 7 x 1024] -> [7 x 7 x dim_cnn_proj] via FC
        # b. feed the flattened projection to the GRU, step by step

        # c3d input.
        net['c3d_input'] = c3d_input   # [B x T x 1024 x 7 x 7]
        # move channels last: [B x T x 7 x 7 x 1024]
        net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                         perm=[0,1,3,4,2],
                                         name='c3d_input_reshape')
        log.info('c3d_input_reshape shape : %s', net['c3d_input_reshape'].get_shape().as_list())
        net['c3d_input_reshape'].get_shape().assert_is_compatible_with([B, T, 7, 7, 1024])


        # c3d_embedded: project each 1024-d feature (one per cell of the 7x7
        # C3D conv-feature map) down to dim_cnn_proj dimensions
        vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1), name="proj_c3d_W")
        vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1, 0.1), name="proj_c3d_b")

        net['c3d_embedded'] = tf.nn.xw_plus_b(
            tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
            vars.proj_c3d_W, vars.proj_c3d_b
        ) # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

        # Python-level comparison, evaluated once while building the graph:
        # the default 1.0 leaves the dropout op out entirely.
        if dropout_keep_prob != 1.0:
            net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'], dropout_keep_prob)

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        net['c3d_embedded'] = tf.reshape(net['c3d_embedded'], [B, T, 7, 7, dim_cnn_proj])
        log.info('c3d_embedded shape : %s', net['c3d_embedded'].get_shape().as_list())
        net['c3d_embedded'].get_shape().assert_is_compatible_with([B, T, 7, 7, dim_cnn_proj])


        # The RNN Part.
        # -------------

        with tf.variable_scope("RNN") as scope:
            vars.lstm_u = rnn_cell.GRUCell(rnn_state_size, kernel_initializer="orthogonal")

            state_u = vars.lstm_u.zero_state(B, tf.float32)

            # Linear read-out from the GRU output to a flattened gaze map.
            vars.proj_out_W = tf.Variable( tf.random_uniform( [rnn_state_size, GH*GW], -0.1, 0.1), name = "proj_out_W")
            # NOTE(review): name="proj_out_b" is passed to tf.zeros, not to
            # tf.Variable, so it names the initial-value op rather than the
            # variable -- presumably unintended; moving it would rename the
            # variable in existing checkpoints.
            vars.proj_out_b = tf.Variable( tf.zeros( [GH*GW], name = "proj_out_b"))

            log.info('RNN state shape : %s', state_u.get_shape().as_list())

            predicted_gazemaps = []
            # Unroll over the T time steps (n_lstm_step, for example 35);
            # cell weights are shared by enabling reuse after step 0.
            for i in range(T):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # The projected C3D map of step i is the only RNN input
                # (the frame-saliency entry is disabled).
                rnn_input = tf.concat(axis=3,  # channel axis of [B, 7, 7, C]
                                    values=[
                                        net['c3d_embedded'][:, i, :, :, :],      # (i) C3D map [B x 7 x 7 x dim_cnn_proj]
#                                      self.frm_sal_771[:, i, :, :, :],  # (ii) frame saliency
                                    ],
                                    name='rnn_input')

                # flatten rnn_input[i] into rank 2, i.e. [B x (7*7*dim_cnn_proj)]
                # TODO this part should be seamlessly done in RNN cell
                rnn_input_flatten = tf.reshape(rnn_input, [B, -1],
                                            name='rnn_input_flatten')

                output_u, state_u = vars.lstm_u(rnn_input_flatten, state_u)

                # at time i: [B x rnn_state_size] -> [B x (GH*GW)] -> [B x GH x GW]
                predicted_gazemap = tf.nn.xw_plus_b(output_u,
                                                    vars.proj_out_W, vars.proj_out_b)
                predicted_gazemap = tf.reshape(predicted_gazemap, [-1, GH, GW])
                predicted_gazemap.get_shape().assert_is_compatible_with([B, GH, GW])

                predicted_gazemaps.append(predicted_gazemap)


        # pack as a tensor
        # T-list of [B x GH x GW] --> [T x B x GH x GW] --> [B x T x GH x GW]
        net['predicted_gazemaps'] = tf.transpose(tf.stack(predicted_gazemaps), [1, 0, 2, 3], name='predicted_gazemaps')
        net['predicted_gazemaps'].get_shape().assert_is_compatible_with([B, T, GH, GW])


        return net['predicted_gazemaps']
Beispiel #4
0
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      dropout_keep_prob=1.0,
                                      net=None):
        '''
        Build the deconvolution-only gaze prediction graph (no RNN, no loss).

        The C3D feature maps are linearly projected and then, independently
        for every time step, up-sampled by three transposed convolutions
        (7x7 -> 23x23 -> 49x49 -> 49x49) and reduced to a single channel.

        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images);
                only its static shape is used here (to obtain B and T).
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            dropout_keep_prob : keep probability. Applied to the projected C3D
                features only when it compares unequal to 1.0 at graph
                construction time, and always applied to the per-pixel output.
            (optional) net : a dictionary to get intra-layer activations or tensors.

        Outputs:
            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame

        Note:
            The up-sampling path always produces 49 x 49 maps, which are then
            reshaped to [B, GH, GW]; the graph can therefore only be built
            when GH * GW == 49 * 49.
        '''

        if net is None: net = {}
        else: assert isinstance(net, dict)

        vars = E()

        # (0) input sanity check
        # B and T come from the static shape and must be known up front.
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # Not referenced anywhere below in this variant (there is no RNN).
        rnn_state_size = 128  #dim_cnn_proj # filter size is more correct name
        ''' The RGP (Recurrent Gaze Prediction) model. '''

        with tf.variable_scope("RGP"):

            # (2) C3D
            # -------
            # a. reduce channels [7 x 7 x 1024] -> [7 x 7 x dim_cnn_proj] via FC
            # b. up-sample each projected map with transposed convolutions

            # c3d input.
            net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
            # move channels last: [B x T x 7 x 7 x 1024]
            net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                    perm=[0, 1, 3, 4, 2],
                                                    name='c3d_input_reshape')
            log.info('c3d_input_reshape shape : %s',
                     net['c3d_input_reshape'].get_shape().as_list())
            net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, 1024])

            # c3d_embedded: project each 1024-d feature (one per cell of the
            # 7x7 C3D conv-feature map) down to dim_cnn_proj dimensions
            vars.proj_c3d_W = tf.Variable(tf.random_uniform(
                [1024, dim_cnn_proj], -0.1, 0.1),
                                          name="proj_c3d_W")
            vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj],
                                                            -0.1, 0.1),
                                          name="proj_c3d_b")

            net['c3d_embedded'] = tf.nn.xw_plus_b(
                tf.reshape(net['c3d_input_reshape'],
                           [-1, 1024]), vars.proj_c3d_W, vars.proj_c3d_b
            )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

            # Python-level comparison evaluated while building the graph: the
            # default 1.0 leaves this dropout op out entirely.
            if dropout_keep_prob != 1.0:
                net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'],
                                                    dropout_keep_prob)

            # --> [B x T x 7 x 7 x dim_cnn_proj]
            net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                             [B, T, 7, 7, dim_cnn_proj])
            log.info('c3d_embedded shape : %s',
                     net['c3d_embedded'].get_shape().as_list())
            net['c3d_embedded'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, dim_cnn_proj])

            # Instead of RNN part, we have deconvolution
            # -------------

            # Per-step slices of the projected C3D maps (the name is kept
            # from the recurrent variants; no RCN is applied here).
            rcn_outputs = [None] * T
            for i in range(T):
                rcn_outputs[i] = net['c3d_embedded'][:, i, :, :, :]
                # B x 7 x 7 x 512(dim_cnn_proj)

            # (3) Up-sampling filters, shared by all time steps.
            # conv2d_transpose filter layout:
            # [height, width, output_channels, input_channels]
            vars.upsampling_filter1 = tf.get_variable(
                'Upsampling/weight1',
                [
                    5,
                    5,
                    64,
                    dim_cnn_proj,  # directly project 512->64
                    #rnn_state_size
                ],  # 5x5 kernel, dim_cnn_proj -> 64 channels
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            vars.upsampling_filter2 = tf.get_variable(
                'Upsampling/weight2',
                [5, 5, 32, 64
                 ],  # 5x5 kernel, 64 -> 32 channels
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))

            vars.upsampling_filter3 = tf.get_variable(
                'Upsampling/weight3',
                [7, 7, 12, 32
                 ],  # 7x7 kernel, 32 -> 12 channels
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            # Final per-pixel linear map from 12 channels to 1 value.
            vars.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1),
                                     name="out_W")
            vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1),
                                     name="out_b")

            predicted_gazemaps = []
            for i in range(T):
                rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x dim_cnn_proj]

                # 7x7 -> 23x23: (7 - 1) * 3 + 5 = 23 with VALID padding
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_output_map,
                    vars.upsampling_filter1,
                    output_shape=[B, 23, 23, 64],
                    strides=[1, 3, 3, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))
                # 23x23 -> 49x49: (23 - 1) * 2 + 5 = 49 with VALID padding
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_upsampled_output,
                    vars.upsampling_filter2,
                    output_shape=[B, 49, 49, 32],
                    strides=[1, 2, 2, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))
                # Single-element concat: the frame-saliency and previous
                # gaze-map inputs are disabled, so this only forwards the
                # up-sampled map.
                input_concat = tf.concat(
                    concat_dim=3,  # the last dimension
                    values=[
                        rcn_upsampled_output,  # [B x 49 x 49 x 32]
                        #                                            net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
                        # last_output_gazemap                   # [B x 49 x 49 x 1]
                    ])

                # 49x49 -> 49x49 (stride 1, SAME padding), 32 -> 12 channels
                output = tf.nn.conv2d_transpose(input_concat,
                                                vars.upsampling_filter3,
                                                output_shape=[B, 49, 49, 12],
                                                strides=[1, 1, 1, 1],
                                                padding='SAME',
                                                name='upsampled_rcn_output_' +
                                                str(i))

                # per-pixel projection: [(B*49*49) x 12] -> [(B*49*49) x 1]
                output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                         vars.out_W, vars.out_b)
                # Dropout is applied unconditionally to the final per-pixel
                # values here (unlike the guarded dropout above).
                output = tf.nn.dropout(output, dropout_keep_prob)

                predicted_gazemap = tf.reshape(
                    output,
                    [B, GH, GW])  # [(B*49*49) x 1] -> [B x GH x GW]
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap?

            # pack as a tensor
            # T-list of [B x GH x GW] --> [T x B x GH x GW] --> [B x T x GH x GW]
            net['predicted_gazemaps'] = tf.transpose(
                tf.pack(predicted_gazemaps), [1, 0, 2, 3],
                name='predicted_gazemaps')
            net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
                [B, T, GH, GW])

        return net['predicted_gazemaps']
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      dropout_keep_prob=1.0,
                                      net=None):
        '''
        Build the RGP (Recurrent Gaze Prediction) graph:
        C3D feature projection -> GRU-RCN over time -> transposed-convolution
        upsampling -> per-pixel linear read-out.

        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images).
                Only its static shape is used here (to obtain B and T).
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            dropout_keep_prob : float or float tensor, the keep probability
                handed to tf.nn.dropout.
            (optional) net : a dictionary to get intra-layer activations or tensors.
                It is filled in place; a fresh dict is used when omitted.

        Outputs:
            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame (no normalization is applied here).
        '''

        if net is None:
            net = {}
        else:
            assert isinstance(net, dict)

        # Local namespace for the variables / cells created below
        # (named so that it does not shadow the builtin `vars`).
        model_vars = E()

        # (0) input sanity check
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        # B and T have to be statically known: they are used below as plain
        # Python ints (loop bound, reshape and output_shape arguments).
        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # FIXME: not a proper name; this is the channel (filter) size of the
        # GRU-RCN state/output maps rather than a flat state size.
        rnn_state_size = 128

        # The RGP (Recurrent Gaze Prediction) model.
        with tf.variable_scope("RGP"):

            # (2) C3D
            # -------
            # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x dim_cnn_proj]
            #    with a shared linear projection
            # b. apply RCN, and get one [7 x 7 x rnn_state_size] output per step

            # c3d input.
            net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
            # move the channel axis last: [B x T x 7 x 7 x 1024]
            net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                    perm=[0, 1, 3, 4, 2],
                                                    name='c3d_input_reshape')
            log.info('c3d_input_reshape shape : %s',
                     net['c3d_input_reshape'].get_shape().as_list())
            net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, 1024])

            # c3d_embedded: project each 1024-d feature (one per 7x7 location)
            # down to dim_cnn_proj channels.
            model_vars.proj_c3d_W = tf.Variable(
                tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
                name="proj_c3d_W")
            model_vars.proj_c3d_b = tf.Variable(
                tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
                name="proj_c3d_b")

            # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]
            net['c3d_embedded'] = tf.nn.xw_plus_b(
                tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
                model_vars.proj_c3d_W, model_vars.proj_c3d_b)

            # Skip the dropout op entirely for the literal default of 1.0.
            if dropout_keep_prob != 1.0:
                net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'],
                                                    dropout_keep_prob)

            # --> [B x T x 7 x 7 x dim_cnn_proj]
            net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                             [B, T, 7, 7, dim_cnn_proj])
            log.info('c3d_embedded shape : %s',
                     net['c3d_embedded'].get_shape().as_list())
            net['c3d_embedded'].get_shape().assert_is_compatible_with(
                [B, T, 7, 7, dim_cnn_proj])

            # The RNN Part.
            # -------------

            with tf.variable_scope('RCNBottom'):
                model_vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

                state_u = model_vars.lstm_u.zero_state(B, tf.float32)
                log.info('RNN state shape : %s', state_u.get_shape().as_list())

                # Per-timestep cell outputs, also exposed through `net`.
                net['rcn_outputs'] = rcn_outputs = []

                # n_lstm_step for example, 35. -> 42 has highest performance
                for i in range(T):  # T = number of timesteps
                    if i > 0:
                        # Share the cell's weights across timesteps.
                        tf.get_variable_scope().reuse_variables()

                    # The RNN input is a channel-wise concatenation of feature
                    # maps; at the moment the embedded C3D map is the only one.
                    rnn_input = tf.concat(
                        values=[
                            # (i) C3D map, [B x 7 x 7 x dim_cnn_proj]
                            net['c3d_embedded'][:, i, :, :, :],
                        ],
                        axis=3,  # channel axis of the per-step slice
                        name='rnn_input' + str(i))

                    output_u, state_u = model_vars.lstm_u(rnn_input, state_u)

                    # at time i: [B x 7 x 7 x rnn_state_size]
                    output_u.get_shape().assert_is_compatible_with(
                        [B, 7, 7, rnn_state_size])
                    rcn_outputs.append(output_u)

            # (3) RCN output upsampling to 49x49 size.
            # conv2d_transpose filters are [height, width, out_ch, in_ch]:
            #   filter1: rnn_state_size -> 64 channels
            #   filter2: 64 -> 32 channels
            #   filter3: 32 -> 12 channels
            model_vars.upsampling_filter1 = tf.get_variable(
                'Upsampling/weight1',
                [5, 5, 64, rnn_state_size],
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            model_vars.upsampling_filter2 = tf.get_variable(
                'Upsampling/weight2',
                [5, 5, 32, 64],
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))
            model_vars.upsampling_filter3 = tf.get_variable(
                'Upsampling/weight3',
                [7, 7, 12, 32],
                initializer=initializers.xavier_initializer_conv2d(
                    uniform=True))

            # Final per-pixel linear read-out: 12 channels -> 1 score.
            model_vars.out_W = tf.Variable(
                tf.random_uniform([12, 1], -0.1, 0.1), name="out_W")
            model_vars.out_b = tf.Variable(
                tf.random_uniform([1], -0.1, 0.1), name="out_b")

            predicted_gazemaps = []
            for i in range(T):
                rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x rnn_state_size]

                # NOTE(review): batch normalization is called without
                # `training=` and without a shared name/reuse, so each timestep
                # presumably gets its own BN variables and runs in inference
                # mode -- confirm this is intended.
                rcn_output_map = tf.layers.batch_normalization(rcn_output_map)

                # NOTE: the same op name is passed to all three transposed
                # convolutions of a timestep.

                # 7x7 -> 23x23  ((7 - 1) * 3 + 5 with VALID padding)
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_output_map,
                    model_vars.upsampling_filter1,
                    output_shape=[B, 23, 23, 64],
                    strides=[1, 3, 3, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))

                # 23x23 -> 49x49  ((23 - 1) * 2 + 5 with VALID padding)
                rcn_upsampled_output = tf.nn.conv2d_transpose(
                    rcn_upsampled_output,
                    model_vars.upsampling_filter2,
                    output_shape=[B, 49, 49, 32],
                    strides=[1, 2, 2, 1],
                    padding='VALID',
                    name='upsampled_rcn_output_' + str(i))

                # Placeholder for concatenating further [B x 49 x 49 x C] maps
                # (e.g. frame saliency or the previous gaze map); only the
                # upsampled RCN output is used for now.
                input_concat = tf.concat(
                    axis=3,  # the last dimension
                    values=[
                        rcn_upsampled_output,  # [B x 49 x 49 x 32]
                    ])

                # 49x49 -> 49x49 (stride 1, SAME padding), 32 -> 12 channels
                output = tf.nn.conv2d_transpose(
                    input_concat,
                    model_vars.upsampling_filter3,
                    output_shape=[B, 49, 49, 12],
                    strides=[1, 1, 1, 1],
                    padding='SAME',
                    name='upsampled_rcn_output_' + str(i))

                # [(B*49*49) x 12] --> [(B*49*49) x 1]
                output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                         model_vars.out_W, model_vars.out_b)
                output = tf.nn.dropout(output, dropout_keep_prob)

                # NOTE(review): `output` holds B*49*49 values here, so this
                # reshape only succeeds when GH * GW == 49 * 49 -- confirm the
                # CONSTANTS.gazemap_* values used with this model.
                predicted_gazemap = tf.reshape(output, [B, GH, GW])
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap?

            # pack as a tensor
            # T-list of [B x GH x GW] --stack--> [T x B x GH x GW]
            #                         --transpose--> [B x T x GH x GW]
            net['predicted_gazemaps'] = tf.transpose(
                tf.stack(predicted_gazemaps), [1, 0, 2, 3],
                name='predicted_gazemaps')
            net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
                [B, T, GH, GW])

        return net['predicted_gazemaps']
Beispiel #6
0
    def create_gazeprediction_network(frame_images,
                                      c3d_input,
                                      dropout_keep_prob=1.0,
                                      net=None):
        '''
        Build the RGP (Recurrent Gaze Prediction) graph:
        C3D feature projection -> GRU-RCN over time -> per-location linear
        read-out on the 7 x 7 RCN output grid.

        Args:
            frame_images: a [B x T x IH x IW x 3] tensor (frame images).
                It is stored in `net` and its static shape provides B and T;
                the frame-saliency branch that would consume it is disabled.
            c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
            dropout_keep_prob : float or float tensor, the keep probability
                handed to tf.nn.dropout.
            (optional) net : a dictionary to get intra-layer activations or tensors.
                It is filled in place; a fresh dict is used when omitted.

        Outputs:
            predicted_gazemaps : a [B x T x GH x GW] tensor,
                predicted gaze maps per frame (raw scores, no normalization).
        '''

        if net is None:
            net = {}
        else:
            assert isinstance(net, dict)

        # Local namespace for the variables / cells created below
        # (named so that it does not shadow the builtin `vars`).
        model_vars = E()

        # (0) input sanity check
        GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
        IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
        B, T = frame_images.get_shape().as_list()[:2]

        # B and T have to be statically known: they are used below as plain
        # Python ints (loop bound and reshape arguments).
        assert B > 0 and T > 0
        frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
        c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

        dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

        # some variables
        # --------------
        # FIXME: not a proper name; this is the channel (filter) size of the
        # GRU-RCN state/output maps rather than a flat state size.
        rnn_state_size = 128

        # The RGP (Recurrent Gaze Prediction) model.

        # (1) Input frame saliency
        # ------------------------

        # Input.
        net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

        # Disabled frame-saliency branch, kept for reference:
        #net['frm_sal'] = SaliencyModel.create_shallownet(
        #    tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
        #    scope='ShallowNet',
        #    dropout=False
        #) # [-1, 49, 49]
        #net['frm_sal'] = tf.reshape(net['frm_sal'], [B, T, GH, GW]) # [B x T x 49 x 49]

        # # [B x T x 49 x 49] --> [B x T x 49 x 49 x 1]
        # net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
        # name='frame_saliency_cubic')

        # (2) C3D
        # -------
        # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x dim_cnn_proj]
        #    with a shared linear projection
        # b. apply RCN, and get one [7 x 7 x rnn_state_size] output per step

        # c3d input.
        net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
        # move the channel axis last: [B x T x 7 x 7 x 1024]
        net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                perm=[0, 1, 3, 4, 2],
                                                name='c3d_input_reshape')
        log.info('c3d_input_reshape shape : %s',
                 net['c3d_input_reshape'].get_shape().as_list())
        net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, 1024])

        # c3d_embedded: project each 1024-d feature (one per 7x7 location)
        # down to dim_cnn_proj channels.
        model_vars.proj_c3d_W = tf.Variable(
            tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
            name="proj_c3d_W")
        model_vars.proj_c3d_b = tf.Variable(
            tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
            name="proj_c3d_b")

        # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]
        net['c3d_embedded'] = tf.nn.xw_plus_b(
            tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
            model_vars.proj_c3d_W, model_vars.proj_c3d_b)

        # Skip the dropout op entirely for the literal default of 1.0.
        if dropout_keep_prob != 1.0:
            net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'],
                                                dropout_keep_prob)

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                         [B, T, 7, 7, dim_cnn_proj])
        log.info('c3d_embedded shape : %s',
                 net['c3d_embedded'].get_shape().as_list())
        net['c3d_embedded'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, dim_cnn_proj])

        # The RNN Part.
        # -------------

        with tf.variable_scope('RCNBottom'):
            model_vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

            state_u = model_vars.lstm_u.zero_state(B, tf.float32)
            log.info('RNN state shape : %s', state_u.get_shape().as_list())

            predicted_gazemaps = []
            # Per-timestep cell outputs, also exposed through `net`.
            net['rcn_outputs'] = rcn_outputs = []

            # Per-location linear read-out: rnn_state_size channels -> 1 score.
            model_vars.out_W = tf.Variable(
                tf.random_uniform([rnn_state_size, 1], -0.1, 0.1),
                name="out_W")
            model_vars.out_b = tf.Variable(
                tf.random_uniform([1], -0.1, 0.1), name="out_b")

            # n_lstm_step for example, 35.
            for i in range(T):
                if i > 0:
                    # Share the cell's weights across timesteps.
                    tf.get_variable_scope().reuse_variables()

                # The RNN input is a channel-wise concatenation of feature
                # maps; at the moment the embedded C3D map is the only one.
                # (TF >= 1.0 signature: values first, `axis` keyword.)
                rnn_input = tf.concat(
                    values=[
                        # (i) C3D map, [B x 7 x 7 x dim_cnn_proj]
                        net['c3d_embedded'][:, i, :, :, :],
                    ],
                    axis=3,  # channel axis of the per-step slice
                    name='rnn_input' + str(i))

                output_u, state_u = model_vars.lstm_u(rnn_input, state_u)

                # at time i: [B x 7 x 7 x rnn_state_size]
                output_u.get_shape().assert_is_compatible_with(
                    [B, 7, 7, rnn_state_size])
                rcn_outputs.append(output_u)

                # a FC layer follows:
                # [(B*7*7) x rnn_state_size] --> [(B*7*7) x 1]
                output = tf.nn.xw_plus_b(
                    tf.reshape(output_u, [-1, rnn_state_size]),
                    model_vars.out_W, model_vars.out_b)
                output = tf.nn.dropout(output, dropout_keep_prob)

                # 7x7 softmax logit. `output` holds B*7*7 values, so this
                # reshape requires GH * GW == 7 * 7.
                predicted_gazemap = tf.reshape(output, [B, GH, GW])
                predicted_gazemaps.append(predicted_gazemap)

        # pack as a tensor
        # T-list of [B x GH x GW] --stack--> [T x B x GH x GW]
        #                         --transpose--> [B x T x GH x GW]
        # (tf.stack is the TF >= 1.0 name of the former tf.pack.)
        net['predicted_gazemaps'] = tf.transpose(tf.stack(predicted_gazemaps),
                                                 [1, 0, 2, 3],
                                                 name='predicted_gazemaps')
        net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
            [B, T, GH, GW])
        return net['predicted_gazemaps']