from collections import OrderedDict

import cPickle as pkl

import tensorflow as tf

# NOTE (assumed imports): the helpers below are used throughout this file but
# their import lines are missing in this revision; the paths follow the
# TF-contrib layout of the era. Parts of this file use TF 0.x APIs (tf.pack,
# tf.scalar_summary) and parts use TF 1.x APIs (tf.stack, tf.layers),
# reflecting different revisions.
from tensorflow.contrib.layers.python.layers import initializers
from tensorflow.contrib.layers import fully_connected
rnn_cell = tf.nn.rnn_cell
from models.rcn_cell import GRU_RCN_Cell  # custom convolutional GRU cell (assumed path)

import crc_input_data_seq

from util import log, override
from models.base import ModelBase, BaseModelConfig

from models.saliency_shallownet import SaliencyModel
from models.model_util import tf_normalize_map, normalize_probability_map
from models.model_util import tf_softmax_2d, tf_softmax_cross_entropy_with_logits_2d

from evaluation_metrics import saliency_score, AVAILABLE_METRICS

from easydict import EasyDict as E

CONSTANTS = E()
CONSTANTS.image_width = 98
CONSTANTS.image_height = 98
CONSTANTS.gazemap_width = 7
CONSTANTS.gazemap_height = 7
CONSTANTS.saliencymap_width = 49
CONSTANTS.saliencymap_height = 49


# config: changed to a parameter later
class GRUModelConfig(BaseModelConfig):

    def __init__(self):
        super(GRUModelConfig, self).__init__()

        self.n_lstm_steps = 35
        self.batch_size = 7  # XXX XXX XXX XXX
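
# A minimal usage sketch (hypothetical; not part of the original module):
# instantiate the config and override fields per experiment. The helper name
# and field choices here are illustrative assumptions.
def _example_config():
    config = GRUModelConfig()
    config.n_lstm_steps = 35  # T: number of RNN steps per clip
    config.batch_size = 7     # B: clips per batch
    return config
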
def create_gazeprediction_network(frame_images, c3d_input,
                                  gt_gazemap, dropout_keep_prob,
                                  net=None):
    '''
    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images)
        c3d_input: a [B x T x 1024 x 7 x 7] tensor of C3D convmap features
        gt_gazemap: a [B x T x GH x GW] tensor of ground-truth per-frame gaze maps
        dropout_keep_prob: float tensor (optional)
        net: a dictionary to collect intra-layer activations or tensors.

    Outputs:
        [predicted_gazemaps, loss, image_summary] where
            predicted_gazemaps: a [B x T x GH x GW] tensor of predicted gaze maps per frame
            loss: a scalar (float) tensor of the RNN supervision loss
            image_summary: merged image summaries for visualization
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)
    vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    B, T = frame_images.get_shape().as_list()[:2]

    assert B > 0 and T > 0
    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])
    gt_gazemap.get_shape().assert_is_compatible_with([B, T, GH, GW])

    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # not a proper name: it should rather be rnn_state_feature_size
    # ("filter size" would be the more accurate term) FIXME
    rnn_state_size = 256  # dim_cnn_proj

    '''
    The RGP (Recurrent Gaze Prediction) model.
    '''

    # (1) Input frame saliency
    # ------------------------

    # Input.
    net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

    net['frm_sal'] = SaliencyModel.create_shallownet(
        tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
        scope='ShallowNet',
        dropout=False
    )  # [-1, 49, 49]
    net['frm_sal'] = tf.reshape(net['frm_sal'], [B, T, GH, GW])  # [B x T x 49 x 49]

    # [B x T x 49 x 49] --> [B x T x 49 x 49 x 1]
    net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
                                      name='frame_saliency_cubic')

    # (2) C3D
    # -------
    # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 512] via FC or CONV
    # b. apply RCN, and take the [7 x 7 x 256] outputs from the RNN

    # c3d input.
    net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]

    # change axis order and reshape to [B x T x 7 x 7 x 1024]
    net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                            perm=[0, 1, 3, 4, 2],
                                            name='c3d_input_reshape')
    log.info('c3d_input_reshape shape : %s',
             net['c3d_input_reshape'].get_shape().as_list())
    net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, 1024])

    # c3d_embedded: project each 1024-d feature (per cell of the 7x7 c3d
    # conv-map) down to dim_cnn_proj dimensions
    vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_W")
    vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_b")

    net['c3d_embedded'] = tf.nn.xw_plus_b(
        tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
        vars.proj_c3d_W, vars.proj_c3d_b
    )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj] by applying W: 1024 -> dim_cnn_proj

    # --> [B x T x 7 x 7 x dim_cnn_proj]
    net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                     [B, T, 7, 7, dim_cnn_proj])
    log.info('c3d_embedded shape : %s',
             net['c3d_embedded'].get_shape().as_list())
    net['c3d_embedded'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, dim_cnn_proj])

    # The RNN part.
    # -------------

    # Batch size x (gaze map size), per frame
    net['gt_gazemap'] = gt_gazemap  # [B x T x GH x GW]
    log.info('gt_gazemap shape : %s', net['gt_gazemap'].get_shape().as_list())

    with tf.variable_scope('RCNBottom') as scope:
        vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

        state_u = vars.lstm_u.zero_state(B, tf.float32)
        log.info('RNN state shape : %s', state_u.get_shape().as_list())

        # n_lstm_steps is, for example, 35.
        net['rcn_outputs'] = rcn_outputs = []
        for i in range(T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # We use the cnn embedding + ... as RNN input (as a flattened/concatenated vector)
            rnn_input = tf.concat(
                concat_dim=3,  # [:, i, 7, 7, HERE]
                values=[
                    net['c3d_embedded'][:, i, :, :, :],  # (i) C3D map (embedded into 7x7xdim_cnn_proj)
                ],
                name='rnn_input' + str(i))

            output_u, state_u = vars.lstm_u(rnn_input, state_u)  # at time t

            output_u.get_shape().assert_is_compatible_with(
                [B, 7, 7, rnn_state_size])  # B x {time} x 7 x 7 x rnn_state_size
            rcn_outputs.append(output_u)

    # (3) RCN output unpooling to 49x49 size
    # each (7 x 7 x rnn_state_size) map is up-sampled to (49 x 49 x upsampling_output_channel)
    upsampling_filter_size = 11
    upsampling_output_channel = 64
    vars.upsampling_filter = tf.get_variable(
        'Upsampling/weight',
        [upsampling_filter_size, upsampling_filter_size,
         upsampling_output_channel, rnn_state_size],  # rnn_state_size is a misnomer here (it is in fact a channel count)
        initializer=initializers.xavier_initializer_conv2d(uniform=True))

    net['rcn_upsampled_outputs'] = rcn_upsampled_outputs = []
    for i in range(T):
        rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x rnn_state_size]
        rcn_upsampled_output = tf.nn.conv2d_transpose(
            rcn_output_map,
            vars.upsampling_filter,
            output_shape=[B, GH, GW, upsampling_output_channel],
            strides=[1, 7, 7, 1],
            padding='SAME',
            name='upsampled_rcn_output_' + str(i))
        rcn_upsampled_output.get_shape().assert_is_compatible_with(
            [B, GH, GW, upsampling_output_channel])
        rcn_upsampled_outputs.append(rcn_upsampled_output)

        if i == 0:
            log.info('RCN input map size : %s', rcn_output_map.get_shape().as_list())
            log.info('RCN upsampled size : %s', rcn_upsampled_output.get_shape().as_list())

    # (4) The upper layer of GRCN to emit the gaze map
    # ------------------------------------------------
    with tf.variable_scope('RCNGaze') as scope:
        vars.lstm_g = GRU_RCN_Cell(
            num_units=3,
            # the input below is the 64-channel upsampled map; add +1 to
            # dim_feature if the saliency map is concatenated as well
            dim_feature=upsampling_output_channel,
            spatial_shape=[GH, GW],
            kernel_spatial_shape=[5, 5])
        state_g = vars.lstm_g.zero_state(B, tf.float32)
        # last_output_gazemap = tf.zeros([B, GH, GW, 1])

        predicted_gazemaps = []
        for i in range(T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # try RNN supervision with the GT gazemap.
            # FIXME: the decoder should be spun off here
            # if i > 0:
            #     last_output_gazemap = tf.expand_dims(gt_gazemap[:, i - 1, :, :], 3)

            # now, combine image saliency, the rcn map from the bottom layer,
            # and the previous output:
            '''
            rcn_input_concat = tf.concat(concat_dim=3,  # the last dimension
                                         values=[
                                             rcn_upsampled_outputs[i],             # [B x 49 x 49 x 64]
                                             net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
                                             # last_output_gazemap                 # [B x 49 x 49 x 1]
                                         ])
            '''

            output_g, state_g = vars.lstm_g(rcn_upsampled_outputs[i], state_g)
            output_g.get_shape().assert_is_compatible_with([B, GH, GW, 3])
            rcn_outputs.append(output_g)

            output_g = tf.reshape(output_g, [B, -1])

            # apply another convolutional layer (in fact a FC layer) to the gaze map
            # [B x 49 x 49 x 3] -> [B x 49 x 49 x 1]
            with tf.variable_scope('LastProjection') as scope_proj:
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # fc1/fc2 are maxout layers: 4802 = 2 * 49 * 49 units, and the
                # split + maximum below halves them to 49 * 49 = 2401.
                fc1 = fully_connected(
                    output_g, 4802,
                    activation_fn=None,  # tf.nn.relu,
                    weight_init=initializers.xavier_initializer(uniform=True),
                    bias_init=tf.constant_initializer(0.0),
                    weight_collections=['MODEL_VARS'],
                    bias_collections=['MODEL_VARS'],
                    name='fc1')
                fc1 = tf.nn.relu(fc1)
                if dropout_keep_prob is not None:
                    fc1 = tf.nn.dropout(fc1, dropout_keep_prob)
                fc1_slice1, fc1_slice2 = tf.split(1, 2, fc1, name='fc1_slice')
                max_out = tf.maximum(fc1_slice1, fc1_slice2, name='fc1_maxout')

                fc2 = fully_connected(
                    max_out, 4802,
                    activation_fn=None,  # no relu here
                    weight_init=initializers.xavier_initializer(uniform=True),
                    bias_init=tf.constant_initializer(0.0),
                    weight_collections=['MODEL_VARS'],
                    bias_collections=['MODEL_VARS'],
                    name='fc2')
                fc2 = tf.nn.relu(fc2)
                fc2_slice1, fc2_slice2 = tf.split(1, 2, fc2, name='fc2_slice')
                max_out2 = tf.maximum(fc2_slice1, fc2_slice2, name='fc2_maxout')

                # [B x 2401] -> [B x 49 x 49]
                predicted_gazemap = tf.reshape(max_out2, [B, GH, GW])

            predicted_gazemaps.append(predicted_gazemap)
            # TODO should we normalize predicted_gazemap?

    # (5) Finally, calculate the loss
    loss = 0.0
    for i in range(T):
        predicted_gazemap = predicted_gazemaps[i]

        # Cross entropy and softmax??
        l2loss = tf.nn.l2_loss(predicted_gazemap - gt_gazemap[:, i, :, :])  # scalar, over B x 49 x 49
        loss += l2loss

    # loss: take the average over the B * T frames
    loss = tf.div(loss, float(B * T), name='loss_avg')

    # FIXME may be duplicates?
    tf.scalar_summary('loss/train', loss)
    tf.scalar_summary('loss/val', loss, collections=['TEST_SUMMARIES'])

    # pack as a tensor
    # T-list of [B x 49 x 49] --> [B x T x 49 x 49]
    net['predicted_gazemaps'] = tf.transpose(tf.pack(predicted_gazemaps),
                                             [1, 0, 2, 3],
                                             name='predicted_gazemaps')
    net['predicted_gazemaps'].get_shape().assert_is_compatible_with([B, T, GH, GW])

    # Debugging information
    # ---------------------
    # OPTIONAL: for debugging and visualization
    # XXX only the last predicted_gazemap is shown as of now :( T^T
    # XXX rename saliency -> gaze (to avoid confusion)
    def _add_image_summary(tag, tensor):
        return tf.image_summary(tag, tensor, max_images=2,
                                collections=['IMAGE_SUMMARIES'])

    _input_image = frame_images[:, i, :, :, :]  # the last rnn step (i == T - 1 after the loop)
    _saliency_output = tf.reshape(predicted_gazemap, [-1, GH, GW, 1])
    _saliency_gt = tf.reshape(gt_gazemap[:, i, :, :], [-1, GH, GW, 1])
    _saliency_shallow = tf.reshape(net['frm_sal'][:, i, :, :], [-1, GH, GW, 1])

    _add_image_summary('inputimage', _input_image)
    _add_image_summary('saliency_maps_gt', _saliency_gt)
    _add_image_summary('saliency_maps_pred_original', _saliency_output)
    _add_image_summary('saliency_maps_pred_norm', tf_normalize_map(_saliency_output))
    _add_image_summary('saliency_zshallownet', _saliency_shallow)

    image_summaries = tf.merge_summary(
        inputs=tf.get_collection('IMAGE_SUMMARIES'),
        collections=[],
        name='merged_image_summary',
    )

    return net['predicted_gazemaps'], loss, image_summaries
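
# NOTE: this module re-defines `create_gazeprediction_network` several times
# below, so at import time the *last* definition wins. The alias and driver
# here are hypothetical helpers (not in the original) that pin down the
# two-layer GRCN variant above and sketch its usage with the docstring shapes.
# Caveat: this variant was written for 49x49 gaze maps (see its comments),
# while CONSTANTS currently specifies a 7x7 gazemap.
create_gazeprediction_network_grcn2 = create_gazeprediction_network


def _example_grcn2_forward(config):
    B, T = config.batch_size, config.n_lstm_steps
    frame_images = tf.placeholder(
        tf.float32, [B, T, CONSTANTS.image_height, CONSTANTS.image_width, 3])
    c3d_input = tf.placeholder(tf.float32, [B, T, 1024, 7, 7])
    gt_gazemap = tf.placeholder(
        tf.float32, [B, T, CONSTANTS.gazemap_height, CONSTANTS.gazemap_width])
    dropout_keep_prob = tf.placeholder(tf.float32, [])
    return create_gazeprediction_network_grcn2(
        frame_images, c3d_input, gt_gazemap, dropout_keep_prob)
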
def create_gazeprediction_network(frame_images, c3d_input,
                                  dropout_keep_prob=1.0,
                                  net=None):
    '''
    Variant: C3D + image saliency + vanilla (flattened) GRU.

    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images)
        c3d_input: a [B x T x 1024 x 7 x 7] tensor of C3D convmap features

    Outputs:
        predicted_gazemaps: a [B x T x GH x GW] tensor of predicted gaze maps per frame
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)
    vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    B, T = frame_images.get_shape().as_list()[:2]

    assert B > 0 and T > 0
    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

    dim_cnn_proj = 32  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # flattened RNN state: the 7x7 C3D embedding plus a 7x7 slot reserved for
    # the saliency map (currently unused; see the FIXME below)
    rnn_state_size = 7 * 7 * dim_cnn_proj  # flatten with this dimension order
    rnn_state_size += 7 * 7 * 1            # C3D projection PLUS saliency map (49)  # FIXME

    '''
    C3D + Image Saliency + Vanilla RNN
    '''

    # (1) Input frame saliency
    # ------------------------

    # Input.
    net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

    net['frm_sal'] = SaliencyModel.create_shallownet(
        tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
        scope='ShallowNet',
        dropout=False
    )  # [-1, 49, 49]

    if (GH, GW) == (7, 7):
        log.warn('Downsampling 49x49 saliency to 7x7 ...')
        # downsample 49x49 -> 7x7
        net['frm_sal'] = tf.nn.avg_pool(
            tf.expand_dims(net['frm_sal'], 3),  # [B*T, 49, 49, 1]
            [1, 7, 7, 1], [1, 7, 7, 1],
            padding='VALID')

    net['frm_sal'] = tf.reshape(net['frm_sal'], [B, T, GH, GW])  # [B x T x GH x GW]

    # [B x T x GH x GW] --> [B x T x GH x GW x 1]
    net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
                                      name='frame_saliency_cubic')

    # (2) C3D
    # -------
    # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 32] via FC or CONV
    # b. feed the flattened map to a GRU and project its output to GH x GW

    # c3d input.
    net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]

    # change axis order and reshape to [B x T x 7 x 7 x 1024]
    net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                            perm=[0, 1, 3, 4, 2],
                                            name='c3d_input_reshape')
    log.info('c3d_input_reshape shape : %s',
             net['c3d_input_reshape'].get_shape().as_list())
    net['c3d_input_reshape'].get_shape().assert_is_compatible_with([B, T, 7, 7, 1024])

    # c3d_embedded: project each 1024-d feature (per cell of the 7x7 c3d
    # conv-map) down to dim_cnn_proj dimensions
    vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_W")
    vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_b")

    net['c3d_embedded'] = tf.nn.xw_plus_b(
        tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
        vars.proj_c3d_W, vars.proj_c3d_b
    )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

    if dropout_keep_prob != 1.0:
        net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'], dropout_keep_prob)

    # --> [B x T x 7 x 7 x dim_cnn_proj]
    net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                     [B, T, 7, 7, dim_cnn_proj])
    log.info('c3d_embedded shape : %s', net['c3d_embedded'].get_shape().as_list())
    net['c3d_embedded'].get_shape().assert_is_compatible_with([B, T, 7, 7, dim_cnn_proj])

    # The RNN part.
    # -------------
    with tf.variable_scope("RNN") as scope:
        # NOTE: kernel_initializer requires a TF 1.x-style GRUCell and must be
        # an initializer object (the string "orthogonal" is not accepted).
        vars.lstm_u = rnn_cell.GRUCell(rnn_state_size,
                                       kernel_initializer=tf.orthogonal_initializer())

        state_u = vars.lstm_u.zero_state(B, tf.float32)

        vars.proj_out_W = tf.Variable(
            tf.random_uniform([rnn_state_size, GH * GW], -0.1, 0.1),
            name="proj_out_W")
        vars.proj_out_b = tf.Variable(tf.zeros([GH * GW]), name="proj_out_b")

        log.info('RNN state shape : %s', state_u.get_shape().as_list())

        predicted_gazemaps = []

        # n_lstm_steps is, for example, 35.
        for i in range(T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # We use the cnn embedding + ... as RNN input (as a flattened/concatenated vector)
            rnn_input = tf.concat(
                axis=3,  # [:, i, 7, 7, HERE]
                values=[
                    net['c3d_embedded'][:, i, :, :, :],  # (i) C3D map (embedded into 7x7x32)
                    # self.frm_sal_771[:, i, :, :, :],   # (ii) frame saliency
                ],
                name='rnn_input')

            # flatten rnn_input into rank 2, i.e. [B x -1]
            # TODO this should be done seamlessly inside the RNN cell
            rnn_input_flatten = tf.reshape(rnn_input, [B, -1],
                                           name='rnn_input_flatten')

            output_u, state_u = vars.lstm_u(rnn_input_flatten, state_u)  # at time t

            predicted_gazemap = tf.nn.xw_plus_b(output_u, vars.proj_out_W, vars.proj_out_b)
            predicted_gazemap = tf.reshape(predicted_gazemap, [-1, GH, GW])

            predicted_gazemap.get_shape().assert_is_compatible_with([B, GH, GW])
            predicted_gazemaps.append(predicted_gazemap)

    # pack as a tensor
    # T-list of [B x GH x GW] --> [B x T x GH x GW]
    net['predicted_gazemaps'] = tf.transpose(tf.stack(predicted_gazemaps),
                                             [1, 0, 2, 3],
                                             name='predicted_gazemaps')
    net['predicted_gazemaps'].get_shape().assert_is_compatible_with([B, T, GH, GW])

    return net['predicted_gazemaps']
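
# Hypothetical alias (not in the original): keep a handle to the plain-GRU
# variant above before the name is redefined by the variants below.
create_gazeprediction_network_gru = create_gazeprediction_network
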
def create_gazeprediction_network(frame_images, c3d_input,
                                  dropout_keep_prob=1.0,
                                  net=None):
    '''
    Variant: C3D embedding followed by deconvolution (no RNN).

    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images)
        c3d_input: a [B x T x 1024 x 7 x 7] tensor of C3D convmap features
        dropout_keep_prob: float tensor (optional)
        net: a dictionary to collect intra-layer activations or tensors.

    Outputs:
        predicted_gazemaps: a [B x T x GH x GW] tensor of predicted gaze maps per frame
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)
    vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    B, T = frame_images.get_shape().as_list()[:2]

    assert B > 0 and T > 0
    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # not a proper name: it should rather be rnn_state_feature_size
    # ("filter size" would be the more accurate term) FIXME
    rnn_state_size = 128  # dim_cnn_proj

    '''
    The RGP (Recurrent Gaze Prediction) model.
    '''
    with tf.variable_scope("RGP"):
        # (2) C3D
        # -------
        # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 512] via FC or CONV
        # b. up-sample via deconvolution (replaces the RCN of the other variants)

        # c3d input.
        net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]

        # change axis order and reshape to [B x T x 7 x 7 x 1024]
        net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                perm=[0, 1, 3, 4, 2],
                                                name='c3d_input_reshape')
        log.info('c3d_input_reshape shape : %s',
                 net['c3d_input_reshape'].get_shape().as_list())
        net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, 1024])

        # c3d_embedded: project each 1024-d feature (per cell of the 7x7 c3d
        # conv-map) down to dim_cnn_proj dimensions
        vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
                                      name="proj_c3d_W")
        vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
                                      name="proj_c3d_b")

        net['c3d_embedded'] = tf.nn.xw_plus_b(
            tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
            vars.proj_c3d_W, vars.proj_c3d_b
        )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

        if dropout_keep_prob != 1.0:
            net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'], dropout_keep_prob)

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                         [B, T, 7, 7, dim_cnn_proj])
        log.info('c3d_embedded shape : %s',
                 net['c3d_embedded'].get_shape().as_list())
        net['c3d_embedded'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, dim_cnn_proj])

        # Instead of an RNN, we apply deconvolution directly
        # --------------------------------------------------
        rcn_outputs = [None] * T
        for i in range(T):
            rcn_outputs[i] = net['c3d_embedded'][:, i, :, :, :]  # [B x 7 x 7 x dim_cnn_proj]

        # (3) output unpooling to 49x49 size
        # each (7 x 7 x 512) map is up-sampled to (49 x 49 x 32)
        vars.upsampling_filter1 = tf.get_variable(
            'Upsampling/weight1',
            [5, 5, 64, dim_cnn_proj],  # directly project 512 -> 64
            initializer=initializers.xavier_initializer_conv2d(uniform=True))
        vars.upsampling_filter2 = tf.get_variable(
            'Upsampling/weight2',
            [5, 5, 32, 64],
            initializer=initializers.xavier_initializer_conv2d(uniform=True))
        vars.upsampling_filter3 = tf.get_variable(
            'Upsampling/weight3',
            [7, 7, 12, 32],
            initializer=initializers.xavier_initializer_conv2d(uniform=True))

        vars.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1), name="out_W")
        vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1), name="out_b")

        predicted_gazemaps = []
        for i in range(T):
            rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x dim_cnn_proj]

            # 7x7 -> 23x23 ('VALID': (7 - 1) * 3 + 5 = 23)
            rcn_upsampled_output = tf.nn.conv2d_transpose(
                rcn_output_map,
                vars.upsampling_filter1,
                output_shape=[B, 23, 23, 64],
                strides=[1, 3, 3, 1],
                padding='VALID',
                name='upsampled_rcn_output1_' + str(i))

            # 23x23 -> 49x49 ('VALID': (23 - 1) * 2 + 5 = 49)
            rcn_upsampled_output = tf.nn.conv2d_transpose(
                rcn_upsampled_output,
                vars.upsampling_filter2,
                output_shape=[B, 49, 49, 32],
                strides=[1, 2, 2, 1],
                padding='VALID',
                name='upsampled_rcn_output2_' + str(i))

            input_concat = tf.concat(
                concat_dim=3,  # the last dimension
                values=[
                    rcn_upsampled_output,                   # [B x 49 x 49 x 32]
                    # net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
                    # last_output_gazemap                   # [B x 49 x 49 x 1]
                ])

            output = tf.nn.conv2d_transpose(input_concat,
                                            vars.upsampling_filter3,
                                            output_shape=[B, 49, 49, 12],
                                            strides=[1, 1, 1, 1],
                                            padding='SAME',
                                            name='upsampled_rcn_output3_' + str(i))

            output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                     vars.out_W, vars.out_b)
            output = tf.nn.dropout(output, dropout_keep_prob)

            # [B*49*49 x 1] -> [B x 49 x 49]
            predicted_gazemap = tf.reshape(output, [B, GH, GW])
            predicted_gazemaps.append(predicted_gazemap)
            # TODO should we normalize predicted_gazemap?

    # pack as a tensor
    # T-list of [B x 49 x 49] --> [B x T x 49 x 49]
    net['predicted_gazemaps'] = tf.transpose(tf.pack(predicted_gazemaps),
                                             [1, 0, 2, 3],
                                             name='predicted_gazemaps')
    net['predicted_gazemaps'].get_shape().assert_is_compatible_with([B, T, GH, GW])

    return net['predicted_gazemaps']
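
# Worked check (a sketch, not in the original) of the transposed-convolution
# sizes used above: with 'VALID' padding, out = (in - 1) * stride + kernel.
def _deconv_out_size(in_size, stride, kernel):
    return (in_size - 1) * stride + kernel

assert _deconv_out_size(7, 3, 5) == 23    # Upsampling/weight1: 7x7 -> 23x23
assert _deconv_out_size(23, 2, 5) == 49   # Upsampling/weight2: 23x23 -> 49x49

# Hypothetical alias for the deconvolution-only variant above (the name is
# redefined again below).
create_gazeprediction_network_deconv = create_gazeprediction_network
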
def create_gazeprediction_network(frame_images, c3d_input,
                                  dropout_keep_prob=1.0,
                                  net=None):
    '''
    Variant: GRCN bottom layer plus deconvolution up-sampling (with batch
    normalization).

    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images)
        c3d_input: a [B x T x 1024 x 7 x 7] tensor of C3D convmap features
        dropout_keep_prob: float tensor (optional)
        net: a dictionary to collect intra-layer activations or tensors.

    Outputs:
        predicted_gazemaps: a [B x T x GH x GW] tensor of predicted gaze maps per frame
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)
    vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    B, T = frame_images.get_shape().as_list()[:2]

    assert B > 0 and T > 0
    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # not a proper name: it should rather be rnn_state_feature_size
    # ("filter size" would be the more accurate term) FIXME
    rnn_state_size = 128  # dim_cnn_proj

    '''
    The RGP (Recurrent Gaze Prediction) model.
    '''

    # (1) Input frame saliency
    # ------------------------

    # Input.
    net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

    # net['frm_sal'] = SaliencyModel.create_shallownet(
    #     tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
    #     scope='ShallowNet',
    #     dropout=False
    # )  # [-1, 49, 49]
    # net['frm_sal'] = tf.reshape(net['frm_sal'], [B, T, GH, GW])  # [B x T x 49 x 49]
    #
    # # [B x T x 49 x 49] --> [B x T x 49 x 49 x 1]
    # net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
    #                                   name='frame_saliency_cubic')

    # (2) C3D
    # -------
    # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 512] via FC or CONV
    # b. apply RCN, and take the [7 x 7 x 128] outputs from the RNN

    # c3d input.
    net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]

    # change axis order and reshape to [B x T x 7 x 7 x 1024]
    net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                            perm=[0, 1, 3, 4, 2],
                                            name='c3d_input_reshape')
    log.info('c3d_input_reshape shape : %s',
             net['c3d_input_reshape'].get_shape().as_list())
    net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, 1024])

    # c3d_embedded: project each 1024-d feature (per cell of the 7x7 c3d
    # conv-map) down to dim_cnn_proj dimensions
    vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_W")
    vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_b")

    net['c3d_embedded'] = tf.nn.xw_plus_b(
        tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
        vars.proj_c3d_W, vars.proj_c3d_b
    )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

    if dropout_keep_prob != 1.0:
        net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'], dropout_keep_prob)

    # --> [B x T x 7 x 7 x dim_cnn_proj]
    net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                     [B, T, 7, 7, dim_cnn_proj])
    log.info('c3d_embedded shape : %s',
             net['c3d_embedded'].get_shape().as_list())
    net['c3d_embedded'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, dim_cnn_proj])

    # The RNN part.
    # -------------
    with tf.variable_scope('RCNBottom') as scope:
        vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

        state_u = vars.lstm_u.zero_state(B, tf.float32)
        log.info('RNN state shape : %s', state_u.get_shape().as_list())

        predicted_gazemaps = []
        net['rcn_outputs'] = rcn_outputs = []

        # n_lstm_steps is, for example, 35 (42 gave the highest performance).
        for i in range(T):  # T = number of timesteps
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # We use the cnn embedding + ... as RNN input (as a flattened/concatenated vector)
            rnn_input = tf.concat(
                values=[
                    net['c3d_embedded'][:, i, :, :, :],  # (i) C3D map (embedded into 7x7xdim_cnn_proj)
                ],
                axis=3,  # [:, i, 7, 7, HERE]
                name='rnn_input' + str(i))

            output_u, state_u = vars.lstm_u(rnn_input, state_u)  # at time t

            output_u.get_shape().assert_is_compatible_with(
                [B, 7, 7, rnn_state_size])  # B x {time} x 7 x 7 x rnn_state_size
            rcn_outputs.append(output_u)

    # (3) RCN output unpooling to 49x49 size
    # each (7 x 7 x 128) map is up-sampled to (49 x 49 x 32)
    vars.upsampling_filter1 = tf.get_variable(
        'Upsampling/weight1',
        [5, 5, 64, rnn_state_size],  # rnn_state_size is a misnomer here (it is in fact a channel count)
        initializer=initializers.xavier_initializer_conv2d(uniform=True))
    vars.upsampling_filter2 = tf.get_variable(
        'Upsampling/weight2',
        [5, 5, 32, 64],
        initializer=initializers.xavier_initializer_conv2d(uniform=True))
    vars.upsampling_filter3 = tf.get_variable(
        'Upsampling/weight3',
        [7, 7, 12, 32],
        initializer=initializers.xavier_initializer_conv2d(uniform=True))

    vars.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1), name="out_W")
    vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1), name="out_b")

    predicted_gazemaps = []
    # Batch-normalization assumption (fix if wrong): apply it before each
    # convolutional layer.
    for i in range(T):
        rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x rnn_state_size]

        # for now in here - later move into the base model:
        # batch_mean, batch_var = tf.nn.moments(rcn_output_map, axes=[0, 1, 2])  # global normalization for conv filters
        # what to do with offset and scale?
        rcn_output_map = tf.layers.batch_normalization(rcn_output_map)

        # 7x7 -> 23x23 ('VALID': (7 - 1) * 3 + 5 = 23)
        rcn_upsampled_output = tf.nn.conv2d_transpose(
            rcn_output_map,
            vars.upsampling_filter1,
            output_shape=[B, 23, 23, 64],
            strides=[1, 3, 3, 1],
            padding='VALID',
            name='upsampled_rcn_output1_' + str(i))

        # 23x23 -> 49x49 ('VALID': (23 - 1) * 2 + 5 = 49)
        rcn_upsampled_output = tf.nn.conv2d_transpose(
            rcn_upsampled_output,
            vars.upsampling_filter2,
            output_shape=[B, 49, 49, 32],
            strides=[1, 2, 2, 1],
            padding='VALID',
            name='upsampled_rcn_output2_' + str(i))

        input_concat = tf.concat(
            axis=3,  # the last dimension
            values=[
                rcn_upsampled_output,                   # [B x 49 x 49 x 32]
                # net['frm_sal_cubic'][:, i, :, :, :],  # [B x 49 x 49 x 1]
                # last_output_gazemap                   # [B x 49 x 49 x 1]
            ])

        output = tf.nn.conv2d_transpose(input_concat,
                                        vars.upsampling_filter3,
                                        output_shape=[B, 49, 49, 12],
                                        strides=[1, 1, 1, 1],
                                        padding='SAME',
                                        name='upsampled_rcn_output3_' + str(i))

        output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                 vars.out_W, vars.out_b)
        output = tf.nn.dropout(output, dropout_keep_prob)

        # [B*49*49 x 1] -> [B x 49 x 49]
        predicted_gazemap = tf.reshape(output, [B, GH, GW])
        predicted_gazemaps.append(predicted_gazemap)
        # TODO should we normalize predicted_gazemap?

    # pack as a tensor
    # T-list of [B x 49 x 49] --> [B x T x 49 x 49]
    net['predicted_gazemaps'] = tf.transpose(tf.stack(predicted_gazemaps),
                                             [1, 0, 2, 3],
                                             name='predicted_gazemaps')
    net['predicted_gazemaps'].get_shape().assert_is_compatible_with([B, T, GH, GW])

    return net['predicted_gazemaps']
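
# Note on the batch_normalization call above (an assumption sketch, not the
# original author's code): tf.layers.batch_normalization learns offset (beta)
# and scale (gamma) by default, and for training it needs `training=True`
# plus the moving-average update ops, e.g.:
#
#     bn = tf.layers.batch_normalization(x, training=is_training)
#     update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
#     with tf.control_dependencies(update_ops):
#         train_op = optimizer.minimize(loss)

# Hypothetical alias for the GRCN + deconvolution variant above.
create_gazeprediction_network_grcn_deconv = create_gazeprediction_network
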
def create_gazeprediction_network(frame_images, c3d_input,
                                  dropout_keep_prob=1.0,
                                  net=None):
    '''
    Variant: GRCN bottom layer with a per-location FC projection to a 7x7
    logit map.

    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images)
        c3d_input: a [B x T x 1024 x 7 x 7] tensor of C3D convmap features
        dropout_keep_prob: float tensor (optional)
        net: a dictionary to collect intra-layer activations or tensors.

    Outputs:
        predicted_gazemaps: a [B x T x GH x GW] tensor of predicted gaze maps per frame
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)
    vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    B, T = frame_images.get_shape().as_list()[:2]

    assert B > 0 and T > 0
    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # not a proper name: it should rather be rnn_state_feature_size
    # ("filter size" would be the more accurate term) FIXME
    rnn_state_size = 128  # dim_cnn_proj

    '''
    The RGP (Recurrent Gaze Prediction) model.
    '''

    # (1) Input frame saliency
    # ------------------------

    # Input.
    net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

    # net['frm_sal'] = SaliencyModel.create_shallownet(
    #     tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
    #     scope='ShallowNet',
    #     dropout=False
    # )  # [-1, 49, 49]
    # net['frm_sal'] = tf.reshape(net['frm_sal'], [B, T, GH, GW])  # [B x T x 49 x 49]
    #
    # # [B x T x 49 x 49] --> [B x T x 49 x 49 x 1]
    # net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
    #                                   name='frame_saliency_cubic')

    # (2) C3D
    # -------
    # a. reduce filter size [7 x 7 x 1024] -> [7 x 7 x 512] via FC or CONV
    # b. apply RCN, and take the [7 x 7 x 128] outputs from the RNN

    # c3d input.
    net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]

    # change axis order and reshape to [B x T x 7 x 7 x 1024]
    net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                            perm=[0, 1, 3, 4, 2],
                                            name='c3d_input_reshape')
    log.info('c3d_input_reshape shape : %s',
             net['c3d_input_reshape'].get_shape().as_list())
    net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, 1024])

    # c3d_embedded: project each 1024-d feature (per cell of the 7x7 c3d
    # conv-map) down to dim_cnn_proj dimensions
    vars.proj_c3d_W = tf.Variable(tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_W")
    vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
                                  name="proj_c3d_b")

    net['c3d_embedded'] = tf.nn.xw_plus_b(
        tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
        vars.proj_c3d_W, vars.proj_c3d_b
    )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

    if dropout_keep_prob != 1.0:
        net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'], dropout_keep_prob)

    # --> [B x T x 7 x 7 x dim_cnn_proj]
    net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                     [B, T, 7, 7, dim_cnn_proj])
    log.info('c3d_embedded shape : %s',
             net['c3d_embedded'].get_shape().as_list())
    net['c3d_embedded'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, dim_cnn_proj])

    # The RNN part.
    # -------------
    with tf.variable_scope('RCNBottom') as scope:
        vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

        state_u = vars.lstm_u.zero_state(B, tf.float32)
        log.info('RNN state shape : %s', state_u.get_shape().as_list())

        predicted_gazemaps = []
        net['rcn_outputs'] = rcn_outputs = []

        vars.out_W = tf.Variable(tf.random_uniform([rnn_state_size, 1], -0.1, 0.1),
                                 name="out_W")
        vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1),
                                 name="out_b")

        # n_lstm_steps is, for example, 35.
        for i in range(T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # We use the cnn embedding + ... as RNN input (as a flattened/concatenated vector)
            rnn_input = tf.concat(
                concat_dim=3,  # [:, i, 7, 7, HERE]
                values=[
                    net['c3d_embedded'][:, i, :, :, :],  # (i) C3D map (embedded into 7x7xdim_cnn_proj)
                ],
                name='rnn_input' + str(i))

            output_u, state_u = vars.lstm_u(rnn_input, state_u)  # at time t

            output_u.get_shape().assert_is_compatible_with(
                [B, 7, 7, rnn_state_size])  # B x {time} x 7 x 7 x rnn_state_size
            rcn_outputs.append(output_u)

            # a per-location FC layer follows: [B*7*7 x rnn_state_size] -> [B*7*7 x 1]
            output = tf.nn.xw_plus_b(
                tf.reshape(output_u, [-1, rnn_state_size]),
                vars.out_W, vars.out_b)
            output = tf.nn.dropout(output, dropout_keep_prob)

            # 7x7 softmax logit
            predicted_gazemap = tf.reshape(output, [B, GH, GW])
            predicted_gazemaps.append(predicted_gazemap)

    # pack as a tensor
    # T-list of [B x GH x GW] --> [B x T x GH x GW]
    net['predicted_gazemaps'] = tf.transpose(tf.pack(predicted_gazemaps),
                                             [1, 0, 2, 3],
                                             name='predicted_gazemaps')
    net['predicted_gazemaps'].get_shape().assert_is_compatible_with([B, T, GH, GW])

    return net['predicted_gazemaps']
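
# Minimal end-to-end usage sketch (hypothetical; the real training driver
# lives elsewhere) for the final 7x7-logit variant above, which is what the
# module-level name resolves to after import.
def _example_forward():
    config = GRUModelConfig()
    B, T = config.batch_size, config.n_lstm_steps
    frame_images = tf.placeholder(
        tf.float32, [B, T, CONSTANTS.image_height, CONSTANTS.image_width, 3])
    c3d_input = tf.placeholder(tf.float32, [B, T, 1024, 7, 7])
    predicted = create_gazeprediction_network(frame_images, c3d_input,
                                              dropout_keep_prob=1.0)
    return predicted  # [B x T x GH x GW]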