def get_decision_net_simple(self, net, net_prob_mat):
    """Build the decision head from globally pooled features.

    The input map is reduced with global average pooling and global max
    pooling, the two results are concatenated along the channel axis and
    projected to a single output channel with a linear 1x1 convolution
    (variable scope ``decision6``).

    Args:
        net: Unused by this implementation; kept so the signature stays
            compatible with existing callers.
        net_prob_mat: Feature map to pool.
            # assumes a 4-D channels-last tensor [B, H, W, C] - TODO confirm

    Returns:
        The output of the 1x1 convolution (no activation, no normalizer).
    """

    def _as_1x1_map(pooled):
        # Keras global pooling layers return [B, C] by default.  The
        # concatenation on axis 3 and the conv2d below both need a rank-4
        # tensor, so restore the two singleton spatial axes -> [B, 1, 1, C].
        # Tensors that are already rank 4 are passed through untouched.
        if pooled.get_shape().ndims == 2:
            pooled = tf.expand_dims(tf.expand_dims(pooled, 1), 1)
        return pooled

    avg_output = _as_1x1_map(
        keras.layers.GlobalAveragePooling2D()(net_prob_mat))
    max_output = _as_1x1_map(
        keras.layers.GlobalMaxPooling2D()(net_prob_mat))

    decision_net = tf.concat([avg_output, max_output], 3)
    decision_net = layers.conv2d(
        decision_net,
        1, [1, 1],
        scope='decision6',
        normalizer_fn=None,
        weights_initializer=initializers.xavier_initializer_conv2d(False),
        biases_initializer=tf.constant_initializer(0),
        activation_fn=None)
    return decision_net
def squeezenet_arg_scope(is_training,
                         weight_decay=0.00001,
                         use_batch_norm=False,
                         batch_norm_decay=0.999):
    """Return the slim arg_scope holding the SqueezeNet layer defaults.

    Defaults installed, outermost first:
      * ReLU activation for conv2d, fully_connected and batch_activate;
      * L2 regularizer + Xavier initializer for fully_connected weights;
      * L2 regularizer + Xavier conv2d initializer for conv2d weights;
      * `is_training` and `decay` for batch_norm;
      * the normalizer (batch_norm or None) for conv2d and batch_activate.

    Args:
        is_training: Forwarded to `slim.batch_norm` as `is_training`.
        weight_decay: Scale passed to `slim.l2_regularizer`.
        use_batch_norm: If true, `slim.batch_norm` becomes the normalizer;
            otherwise no normalizer is set.
        batch_norm_decay: Forwarded to `slim.batch_norm` as `decay`.

    Returns:
        The scope object yielded by the innermost `slim.arg_scope`.
    """
    if use_batch_norm:
        normalizer_fn = slim.batch_norm
    else:
        normalizer_fn = None

    activated_ops = [slim.conv2d, slim.fully_connected, batch_activate]
    # slim.fully_connected is deliberately not normalized here.
    normalized_ops = [slim.conv2d, batch_activate]

    # One flat `with` enters the scopes in exactly the same order as the
    # equivalent nested form would.
    with slim.arg_scope(activated_ops, activation_fn=tf.nn.relu), \
            slim.arg_scope(
                [slim.fully_connected],
                weights_regularizer=slim.l2_regularizer(weight_decay),
                weights_initializer=initializers.xavier_initializer()), \
            slim.arg_scope(
                [slim.conv2d],
                weights_regularizer=slim.l2_regularizer(weight_decay),
                weights_initializer=initializers.xavier_initializer_conv2d()), \
            slim.arg_scope(
                [slim.batch_norm],
                is_training=is_training,
                decay=batch_norm_decay), \
            slim.arg_scope(
                normalized_ops,
                normalizer_fn=normalizer_fn) as final_scope:
        return final_scope
def legacy_convolution2d(x,
                         num_output_channels,
                         kernel_size,
                         activation_fn=None,
                         stride=(1, 1),
                         padding='SAME',
                         weight_init=initializers.xavier_initializer_conv2d(),
                         bias_init=standard_ops.zeros_initializer,
                         name=None,
                         weight_collections=(ops.GraphKeys.WEIGHTS,),
                         bias_collections=(ops.GraphKeys.BIASES,),
                         output_collections=(ops.GraphKeys.ACTIVATIONS,),
                         trainable=True,
                         weight_regularizer=None,
                         bias_regularizer=None):
    """Adds the parameters for a conv2d layer and returns the output.

    Computes `y = f(conv2d(w, x) + b)` where `f` is `activation_fn`,
    `conv2d` is `nn.conv2d`, and the number of input channels is read from
    dimension 3 of `x` (i.e. `x` is `[batch, height, width, channels]`).

    A `weights` variable of shape
    `[kernel_size[0], kernel_size[1], in_channels, num_output_channels]` is
    created and, unless `bias_init` is `None`, a `bias` variable of shape
    `[num_output_channels]`.  Both are obtained through
    `variable_scope.get_variable`, so they can be shared via variable scopes.
    `GraphKeys.VARIABLES` is always added to the collections of both.

    Args:
      x: A 4-D input `Tensor`.
      num_output_channels: The number of output channels (i.e. the size of
        the last dimension of the output).
      kernel_size: A length 2 `list` or `tuple` containing the kernel size.
      activation_fn: Passed to `_apply_activation` together with the linear
        output; `None` by default.
      stride: A length 2 `list` or `tuple` specifying the stride of the
        sliding window across the image.
      padding: A `string` from: "SAME", "VALID". The type of padding
        algorithm to use.
      weight_init: Initializer for the weights; Xavier conv2d by default.
      bias_init: Initializer for the bias, zeros by default. Set to `None`
        in order to disable the bias.
      name: Name of the variable/op scope; defaults to "convolution2d".
      weight_collections: Graph collections to which weights are added.
      bias_collections: Graph collections to which biases are added.
      output_collections: Graph collections forwarded to
        `_apply_activation` for the output.
      trainable: Forwarded to `get_variable` for both variables.
      weight_regularizer: Regularizer applied to the weights.
      bias_regularizer: Regularizer applied to the bias.

    Returns:
      The result of `_apply_activation` on the (optionally biased)
      convolution output.

    Raises:
      ValueError: If `kernel_size` or `stride` are not length 2.
    """
    with variable_scope.variable_op_scope([x], name, 'convolution2d'):
        num_input_channels = x.get_shape().dims[3].value

        # The offending value is a sequence, so it must be formatted with
        # %s and wrapped in a 1-tuple.  '%d' % sequence raises TypeError and
        # would mask the ValueError this function documents.
        if len(kernel_size) != 2:
            raise ValueError(
                'kernel_size must be length 2: %s' % (kernel_size,))
        if len(stride) != 2:
            raise ValueError('stride must be length 2: %s' % (stride,))

        stride = [1, stride[0], stride[1], 1]
        shape = [kernel_size[0], kernel_size[1], num_input_channels,
                 num_output_channels]
        dtype = x.dtype.base_dtype

        # Every variable is also registered in the global VARIABLES
        # collection, whatever the caller asked for.
        weight_collections = set(
            list(weight_collections or []) + [ops.GraphKeys.VARIABLES])
        w = variable_scope.get_variable('weights',
                                        shape=shape,
                                        dtype=dtype,
                                        initializer=weight_init,
                                        collections=weight_collections,
                                        regularizer=weight_regularizer,
                                        trainable=trainable)

        y = nn.conv2d(x, w, stride, padding)

        if bias_init is not None:
            bias_collections = set(
                list(bias_collections or []) + [ops.GraphKeys.VARIABLES])
            b = variable_scope.get_variable('bias',
                                            shape=[num_output_channels],
                                            dtype=dtype,
                                            initializer=bias_init,
                                            collections=bias_collections,
                                            regularizer=bias_regularizer,
                                            trainable=trainable)
            y = nn.bias_add(y, b)

        return _apply_activation(y, activation_fn, output_collections)
def convolution2d(x,
                  num_output_channels,
                  kernel_size,
                  activation_fn=None,
                  stride=(1, 1),
                  padding='SAME',
                  weight_init=initializers.xavier_initializer_conv2d(),
                  bias_init=standard_ops.constant_initializer(0.),
                  name=None,
                  weight_collections=None,
                  bias_collections=None,
                  output_collections=None,
                  weight_regularizer=None,
                  bias_regularizer=None):
    """Adds the parameters for a conv2d layer and returns the output.

    Computes `y = f(conv2d(w, x) + b)` where `f` is `activation_fn`,
    `conv2d` is `nn.conv2d`, and the number of input channels is read from
    dimension 3 of `x` (i.e. `x` is `[batch, height, width, channels]`).

    The weights (shape
    `[kernel_size[0], kernel_size[1], in_channels, num_output_channels]`)
    are created with `_weight_variable`; unless `bias_init` is `None`, a
    bias of shape `[num_output_channels]` is created with `_bias_variable`
    and added to the convolution output.

    Args:
      x: A 4-D input `Tensor`.
      num_output_channels: The number of output channels (i.e. the size of
        the last dimension of the output).
      kernel_size: A length 2 `list` or `tuple` containing the kernel size.
      activation_fn: Passed to `_apply_activation` together with the linear
        output; `None` by default.
      stride: A length 2 `list` or `tuple` specifying the stride of the
        sliding window across the image.
      padding: A `string` from: "SAME", "VALID". The type of padding
        algorithm to use.
      weight_init: Initializer for the weights; Xavier conv2d by default.
      bias_init: Initializer for the bias, constant 0 by default. Set to
        `None` in order to disable the bias.
      name: Name of the variable/op scope; defaults to "convolution2d".
      weight_collections: Collections forwarded to `_weight_variable`.
      bias_collections: Collections forwarded to `_bias_variable`.
      output_collections: Collections forwarded to `_apply_activation`.
      weight_regularizer: Regularizer forwarded to `_weight_variable`.
      bias_regularizer: Regularizer forwarded to `_bias_variable`.

    Returns:
      The result of `_apply_activation` on the (optionally biased)
      convolution output.

    Raises:
      ValueError: If `kernel_size` or `stride` are not length 2.
    """
    with variable_scope.variable_op_scope([x], name, 'convolution2d'):
        num_input_channels = x.get_shape().dims[3].value

        # The format strings need a conversion specifier and the value must
        # be wrapped in a 1-tuple.  Without them the % operator raises
        # TypeError for any non-empty sequence, hiding the intended
        # ValueError.  The stride message must also report `stride`.
        if len(kernel_size) != 2:
            raise ValueError(
                'kernel_size must be length 2: %s' % (kernel_size,))
        if len(stride) != 2:
            raise ValueError('stride must be length 2: %s' % (stride,))

        stride = [1, stride[0], stride[1], 1]
        shape = [
            kernel_size[0], kernel_size[1], num_input_channels,
            num_output_channels
        ]
        dtype = x.dtype.base_dtype

        w = _weight_variable(shape=shape,
                             dtype=dtype,
                             initializer=weight_init,
                             collections=weight_collections,
                             regularizer=weight_regularizer)

        y = nn.conv2d(x, w, stride, padding)

        if bias_init is not None:
            b = _bias_variable(shape=[num_output_channels],
                               dtype=dtype,
                               initializer=bias_init,
                               collections=bias_collections,
                               regularizer=bias_regularizer)
            y = nn.bias_add(y, b)

        return _apply_activation(y, activation_fn, output_collections)
def create_shallownet(images, scope=None, net=None, dropout=True):
    """Build the shallow saliency network.

    Three VALID conv -> ReLU -> max-pool stages are followed by two
    4802-unit fully connected layers.  Each FC layer is passed through ReLU
    and reduced to 2401 units by an element-wise maximum over its two
    halves (maxout).  The final 2401 activations are reshaped to 49 x 49.

    Args:
        images: a tensor of shape [B x H x W x C]
        scope: The variable scope for the subgraph, defaults to ShallowNet
        net: An optional dict object; intermediate tensors are stored in it
            under the keys 'conv1'..'conv3', 'pool1'..'pool3', 'fc1',
            'max_out', 'fc2', 'max_out2', 'saliency', plus the
            'dropout_keep_prob' placeholder.
        dropout: if true, dropout (keep probability taken from the
            'dropout_keep_prob' placeholder) is applied after fc1.

    Returns:
        saliency_output: a tensor of shape [B x 49 x 49]
    """
    assert len(images.get_shape()) == 4  # [B, H, W, C]

    if net is not None:
        assert isinstance(net, dict)
    else:
        net = {}

    net['dropout_keep_prob'] = tf.placeholder(tf.float32,
                                              name='dropout_keep_prob')

    def _shape_of(tensor):
        return tensor.get_shape().as_list()

    def _linear_conv(inputs, n_filters, kernel, name):
        # Linear convolution; ReLU is applied separately by the caller.
        return convolution2d(
            inputs,
            n_filters,
            kernel_size=kernel,
            stride=(1, 1),
            padding='VALID',
            activation_fn=None,
            weights_initializer=initializers.xavier_initializer_conv2d(
                uniform=True),
            biases_initializer=tf.constant_initializer(0.0),
            variables_collections=['MODEL_VARS'],
            scope=name)

    def _linear_dense(inputs, name):
        # Linear 4802-unit layer; ReLU is applied separately by the caller.
        return fully_connected(
            inputs,
            4802,
            activation_fn=None,
            weights_initializer=initializers.xavier_initializer(
                uniform=True),
            biases_initializer=tf.constant_initializer(0.0),
            variables_collections=['MODEL_VARS'],
            scope=name)

    # (stage index, output channels, kernel size, pooling window)
    conv_stages = (
        (1, 32, (5, 5), 2),
        (2, 64, (3, 3), 3),
        (3, 32, (3, 3), 3),
    )

    with tf.variable_scope(scope or 'ShallowNet'):
        # CONV
        stage_input = images
        for idx, n_filters, kernel, window in conv_stages:
            conv_key = 'conv%d' % idx
            pool_key = 'pool%d' % idx

            net[conv_key] = _linear_conv(stage_input, n_filters, kernel,
                                         conv_key)
            net[conv_key] = tf.nn.relu(net[conv_key])
            net[pool_key] = tf.nn.max_pool(net[conv_key],
                                           ksize=[1, window, window, 1],
                                           strides=[1, 2, 2, 1],
                                           padding='SAME',
                                           name=pool_key)

            # '%%s' survives the first interpolation, so the logger still
            # receives e.g. 'Conv1 size : %s'.
            log.info('Conv%d size : %%s' % idx, _shape_of(net[conv_key]))
            log.info('Pool%d size : %%s' % idx, _shape_of(net[pool_key]))
            stage_input = net[pool_key]

        # FC layer
        flat_dim = int(np.prod(_shape_of(net['pool3'])[1:]))
        flattened = tf.reshape(net['pool3'], [-1, flat_dim])

        net['fc1'] = _linear_dense(flattened, 'fc1')
        log.info('fc1 size : %s', _shape_of(net['fc1']))
        net['fc1'] = tf.nn.relu(net['fc1'])
        if dropout:
            net['fc1'] = tf.nn.dropout(net['fc1'], net['dropout_keep_prob'])

        fc1_halves = tf.split(net['fc1'],
                              num_or_size_splits=2,
                              axis=1,
                              name='fc1_slice')
        net['max_out'] = tf.maximum(fc1_halves[0],
                                    fc1_halves[1],
                                    name='fc1_maxout')
        log.info('maxout size : %s', _shape_of(net['max_out']))

        net['fc2'] = _linear_dense(net['max_out'], 'fc2')
        net['fc2'] = tf.nn.relu(net['fc2'])
        log.info('fc2 size : %s', _shape_of(net['fc2']))

        fc2_halves = tf.split(net['fc2'],
                              num_or_size_splits=2,
                              axis=1,
                              name='fc2_slice')
        net['max_out2'] = tf.maximum(fc2_halves[0],
                                     fc2_halves[1],
                                     name='fc2_maxout')

        # A third FC/maxout stage and batch normalization were tried here
        # at some point and are disabled.

        net['saliency'] = tf.reshape(net['max_out2'], [-1, 49, 49],
                                     name='saliency')

    return net['saliency']
def create_gazeprediction_network(frame_images,
                                  c3d_input,
                                  gt_gazemap,
                                  dropout_keep_prob,
                                  net=None):
    '''Build the RGP (Recurrent Gaze Prediction) graph with its L2 loss.

    Pipeline:
      (1) per-frame saliency from the shallow network (stored in `net`; it
          is only used for the image summaries, not fed to the RNNs);
      (2) C3D features linearly projected from 1024 to `dim_cnn_proj`
          channels and run through a bottom GRU-RCN cell over time;
      (3) each bottom RCN output up-sampled to the gaze-map size with a
          transposed convolution;
      (4) a top GRU-RCN cell followed by two maxout FC layers emitting one
          gaze map per frame;
      (5) L2 loss against the ground truth, averaged over B * T.

    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images)
        c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
        gt_gazemap : a [B x T x GH x GW] tensor of ground truth per-frame
            gaze maps
        dropout_keep_prob : keep probability for the dropout after the first
            FC layer of the last projection; None disables that dropout.
        net : a dictionary to get intra-layer activations or tensors.

    Outputs:
        [predicted_gazemaps, loss, image_summary] where

        predicted_gazemaps : a [B x T x GH x GW] tensor,
            predicted gaze maps per frame
        loss: a scalar (float) tensor of RNN supervision loss.
        image_summary: the merged 'IMAGE_SUMMARIES' collection.
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)

    # Container for the variables/cells created below (named so that it
    # does not shadow the builtin `vars`).
    model_vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    B, T = frame_images.get_shape().as_list()[:2]

    assert B > 0 and T > 0
    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])
    gt_gazemap.get_shape().assert_is_compatible_with([B, T, GH, GW])

    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # not a proper name, it should be rnn_state_feature_size in GRCN. FIXME
    # (this is the number of channels of the bottom RCN state)
    rnn_state_size = 256

    # (1) Input frame saliency
    # ------------------------

    # Input.
    net['frame_images'] = frame_images  # [B x T x IH x IW x 3]

    net['frm_sal'] = SaliencyModel.create_shallownet(
        tf.reshape(net['frame_images'], [-1, IH, IW, 3]),
        scope='ShallowNet',
        dropout=False)  # [-1, 49, 49]
    net['frm_sal'] = tf.reshape(net['frm_sal'],
                                [B, T, GH, GW])  # [B x T x GH x GW]

    # [B x T x GH x GW] --> [B x T x GH x GW x 1]
    net['frm_sal_cubic'] = tf.reshape(net['frm_sal'], [B, T, GH, GW, 1],
                                      name='frame_saliency_cubic')

    # (2) C3D
    # -------
    # a. reduce the channel size [7 x 7 x 1024] -> [7 x 7 x dim_cnn_proj]
    # b. apply RCN, and get [7 x 7 x rnn_state_size] outputs from the RNN

    # c3d input.
    net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
    # move the channel axis last: [B x T x 7 x 7 x 1024]
    net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                            perm=[0, 1, 3, 4, 2],
                                            name='c3d_input_reshape')
    log.info('c3d_input_reshape shape : %s',
             net['c3d_input_reshape'].get_shape().as_list())
    net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, 1024])

    # c3d_embedded: project each 1024-d feature (per cell of the 7x7 c3d
    # conv-feature map) into dim_cnn_proj dimensions.
    model_vars.proj_c3d_W = tf.Variable(
        tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
        name="proj_c3d_W")
    model_vars.proj_c3d_b = tf.Variable(
        tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
        name="proj_c3d_b")

    # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]
    net['c3d_embedded'] = tf.nn.xw_plus_b(
        tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
        model_vars.proj_c3d_W, model_vars.proj_c3d_b)

    # --> [B x T x 7 x 7 x dim_cnn_proj]
    net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                     [B, T, 7, 7, dim_cnn_proj])
    log.info('c3d_embedded shape : %s',
             net['c3d_embedded'].get_shape().as_list())
    net['c3d_embedded'].get_shape().assert_is_compatible_with(
        [B, T, 7, 7, dim_cnn_proj])

    # The RNN Part.
    # -------------

    # Batch size x (gaze map size), per frame
    net['gt_gazemap'] = gt_gazemap  # [B x T x GH x GW]
    log.info('gt_gazemap shape : %s', net['gt_gazemap'].get_shape().as_list())

    with tf.variable_scope('RCNBottom'):
        model_vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

        state_u = model_vars.lstm_u.zero_state(B, tf.float32)
        log.info('RNN state shape : %s', state_u.get_shape().as_list())

        # one entry per time step
        net['rcn_outputs'] = rcn_outputs = []
        for i in range(T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # The RNN input is the embedded C3D map of step i.  The concat
            # is kept although it currently has a single element.
            rnn_input = tf.concat(
                concat_dim=3,  # [:, i, 7, 7, HERE]
                values=[
                    net['c3d_embedded'][:, i, :, :, :],  # (i) C3D map
                ],
                name='rnn_input' + str(i))

            output_u, state_u = model_vars.lstm_u(rnn_input, state_u)

            # at time t
            output_u.get_shape().assert_is_compatible_with(
                [B, 7, 7, rnn_state_size])
            rcn_outputs.append(output_u)

    # (3) RCN output unpooling to the gaze-map size
    # each [7 x 7 x rnn_state_size] map is up-sampled to
    # [GH x GW x upsampling_output_channel]
    upsampling_filter_size = 11
    upsampling_output_channel = 64
    model_vars.upsampling_filter = tf.get_variable(
        'Upsampling/weight',
        [
            upsampling_filter_size, upsampling_filter_size,
            upsampling_output_channel, rnn_state_size
        ],  # rnn_state_size bad name (indeed a channel size)
        initializer=initializers.xavier_initializer_conv2d(uniform=True))

    net['rcn_upsampled_outputs'] = rcn_upsampled_outputs = []
    for i in range(T):
        rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x rnn_state_size]

        rcn_upsampled_output = tf.nn.conv2d_transpose(
            rcn_output_map,
            model_vars.upsampling_filter,
            output_shape=[B, GH, GW, upsampling_output_channel],
            strides=[1, 7, 7, 1],
            padding='SAME',
            name='upsampled_rcn_output_' + str(i))
        rcn_upsampled_output.get_shape().assert_is_compatible_with(
            [B, GH, GW, upsampling_output_channel])
        rcn_upsampled_outputs.append(rcn_upsampled_output)

        if i == 0:
            log.info('RCN input map size : %s',
                     rcn_output_map.get_shape().as_list())
            log.info('RCN upsampled size : %s',
                     rcn_upsampled_output.get_shape().as_list())

    # (4) The upper layer of GRCN to emit gaze map
    # --------------------------------------------
    with tf.variable_scope('RCNGaze'):
        model_vars.lstm_g = GRU_RCN_Cell(
            num_units=3,
            dim_feature=upsampling_output_channel + 1,
            spatial_shape=[GH, GW],
            kernel_spatial_shape=[5, 5])

        state_g = model_vars.lstm_g.zero_state(B, tf.float32)

        predicted_gazemaps = []
        for i in range(T):
            if i > 0:
                tf.get_variable_scope().reuse_variables()

            # FIXME decoder should be spun off here.
            # Only the up-sampled bottom RCN map is fed to the top cell;
            # feeding the frame saliency and/or the previous gaze map as
            # extra channels is currently disabled.
            output_g, state_g = model_vars.lstm_g(rcn_upsampled_outputs[i],
                                                  state_g)

            output_g.get_shape().assert_is_compatible_with([B, GH, GW, 3])
            # NOTE: the list of bottom RCN outputs used to be appended to
            # itself here, which left self-references in
            # net['rcn_outputs'].  That statement has been dropped.
            output_g = tf.reshape(output_g, [B, -1])

            # project the flattened top RCN output to one gaze map
            # [B x (GH*GW*3)] -> FC/maxout -> FC/maxout -> [B x GH x GW]
            with tf.variable_scope('LastProjection'):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                fc1 = fully_connected(
                    output_g,
                    4802,
                    activation_fn=None,
                    weight_init=initializers.xavier_initializer(
                        uniform=True),
                    bias_init=tf.constant_initializer(0.0),
                    weight_collections=['MODEL_VARS'],
                    bias_collections=['MODEL_VARS'],
                    name='fc1')
                fc1 = tf.nn.relu(fc1)

                if dropout_keep_prob is not None:
                    fc1 = tf.nn.dropout(fc1, dropout_keep_prob)

                fc1_slice1, fc1_slice2 = tf.split(1, 2, fc1,
                                                  name='fc1_slice')
                max_out = tf.maximum(fc1_slice1, fc1_slice2,
                                     name='fc1_maxout')

                fc2 = fully_connected(
                    max_out,
                    4802,
                    activation_fn=None,
                    weight_init=initializers.xavier_initializer(
                        uniform=True),
                    bias_init=tf.constant_initializer(0.0),
                    weight_collections=['MODEL_VARS'],
                    bias_collections=['MODEL_VARS'],
                    name='fc2')
                fc2 = tf.nn.relu(fc2)

                fc2_slice1, fc2_slice2 = tf.split(1, 2, fc2,
                                                  name='fc2_slice')
                max_out2 = tf.maximum(fc2_slice1, fc2_slice2,
                                      name='fc2_maxout')

                # [B x (GH*GW)] -> [B x GH x GW]
                predicted_gazemap = tf.reshape(max_out2, [B, GH, GW])
                predicted_gazemaps.append(predicted_gazemap)
                # TODO should we normalize predicted_gazemap?

    # (5) Finally, calculate the loss
    loss = 0.0

    for i in range(T):
        predicted_gazemap = predicted_gazemaps[i]

        # Cross entropy and softmax??
        l2loss = tf.nn.l2_loss(predicted_gazemap -
                               gt_gazemap[:, i, :, :])  # on B x GH x GW
        current_gaze_loss = tf.reduce_sum(l2loss)

        current_loss = current_gaze_loss
        loss += current_loss

    # loss: take average
    loss = tf.div(loss, float(B * T), name='loss_avg')

    # FIXME may be duplicates?
    tf.scalar_summary('loss/train', loss)
    tf.scalar_summary('loss/val', loss, collections=['TEST_SUMMARIES'])

    # pack as a tensor
    # T-list of [B x GH x GW] --> [T x B x GH x GW] --> [B x T x GH x GW]
    net['predicted_gazemaps'] = tf.transpose(tf.pack(predicted_gazemaps),
                                             [1, 0, 2, 3],
                                             name='predicted_gazemaps')
    net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
        [B, T, GH, GW])

    # Debugging Informations
    # ----------------------

    # OPTIONAL: for debugging and visualization
    # XXX only the last time step is shown as of now
    # XXX rename saliency -> gaze (to avoid confusion)
    def _add_image_summary(tag, tensor):
        return tf.image_summary(tag,
                                tensor,
                                max_images=2,
                                collections=['IMAGE_SUMMARIES'])

    # Explicit index of the last RNN step, instead of relying on the loop
    # variable left over from the loss loop above.
    last_step = T - 1
    _input_image = frame_images[:, last_step, :, :, :]
    _saliency_output = tf.reshape(predicted_gazemaps[last_step],
                                  [-1, GH, GW, 1])
    _saliency_gt = tf.reshape(gt_gazemap[:, last_step, :, :],
                              [-1, GH, GW, 1])
    _saliency_shallow = tf.reshape(net['frm_sal'][:, last_step, :, :],
                                   [-1, GH, GW, 1])

    _add_image_summary('inputimage', _input_image)
    _add_image_summary('saliency_maps_gt', _saliency_gt)
    _add_image_summary('saliency_maps_pred_original', _saliency_output)
    _add_image_summary('saliency_maps_pred_norm',
                       tf_normalize_map(_saliency_output))
    _add_image_summary('saliency_zshallownet', _saliency_shallow)

    image_summaries = tf.merge_summary(
        inputs=tf.get_collection('IMAGE_SUMMARIES'),
        collections=[],
        name='merged_image_summary',
    )

    return net['predicted_gazemaps'], loss, image_summaries
def _build_net(self):
    """Assemble the encoder / residual / decoder generator graph.

    Every stage output is appended to `self.layers` (which is reset here)
    and reported on stdout.  Stages, in order:
      * `encoder_in`: 1x1 conv on `self.nin`;
      * `encoder_<sp>x<sp>`: 4x4 stride-2 convs, channels doubling per
        level and capped at `self.bottleneck_ch`;
      * `residual_block_<i>`: 3x3 conv added to its own input;
      * `decoder_<sp>x<sp>`: bilinear 2x up-sampling, 3x3 conv, concat with
        the matching earlier layer and an area-resized copy of the input,
        then another 3x3 conv;
      * `decoder_out`: 1x1 conv with sigmoid to `self.nout_ch` channels.
    """
    print('Constructing generator with resolution of %dx%d' % (self.nin_sp,self.nin_sp))
    self.layers = []

    def _register(tensor, label):
        # Record a finished stage and report its index and static shape.
        self.layers.append(tensor)
        print('-- Layer %d: ' % len(self.layers), label,
              self.layers[-1].get_shape().as_list())

    def _width(level):
        # Channel count at a given depth, capped at the bottleneck width.
        return min(self.first_layer_ch*(2**level), self.bottleneck_ch)

    def _spatial_size():
        # Second-to-last static dimension of the most recent layer.
        return self.layers[-1].get_shape().as_list()[-2]

    def _conv(inputs, channels, kernel, stride, normalizer, activation,
              name):
        return slim.conv2d(
            inputs, channels, kernel,
            stride=stride,
            padding='SAME',
            weights_initializer=initializers.xavier_initializer_conv2d(),
            weights_regularizer=None,
            rate=1,
            normalizer_fn=normalizer,
            activation_fn=activation,
            scope=name)

    # --- encoder -------------------------------------------------------
    with tf.variable_scope('encoder_in'):
        net = _conv(self.nin, self.first_layer_ch, [1,1], 1,
                    slim.batch_norm, tf.nn.leaky_relu, 'conv0')
    _register(net, 'encoder_in ')

    for level in range(1, self.encoder_layer_num, 1):
        size = _spatial_size()
        with tf.variable_scope('encoder_%dx%d' % (size, size)):
            net = _conv(self.layers[-1], _width(level), [4,4], 2,
                        slim.batch_norm, tf.nn.leaky_relu, 'conv0')
        _register(net, 'encoder_%dx%d ' % (size, size))

    # --- residual blocks -------------------------------------------------
    for block in range(self.res_block_num):
        with tf.variable_scope('residual_block_%d' % block):
            net = _conv(self.layers[-1], self.bottleneck_ch, [3,3], 1,
                        None, tf.nn.leaky_relu, 'conv0')
            net = tf.add(net, self.layers[-1])
        _register(net, 'residual_block_%d ' % block)

    # --- decoder -------------------------------------------------------
    for level in reversed(range(1, self.decoder_layer_num)):
        doubled = _spatial_size() * 2
        with tf.variable_scope('decoder_%dx%d' % (doubled, doubled)):
            net = tf.image.resize_bilinear(self.layers[-1],
                                           (doubled, doubled),
                                           align_corners=True)
            net = _conv(net, _width(level), [3,3], 1,
                        slim.batch_norm, tf.nn.relu, 'conv0')
            # Skip connection plus a resized copy of the network input.
            skip_inputs = [
                net,
                self.layers[level-1],
                tf.image.resize_area(self.nin, (doubled, doubled),
                                     align_corners=False),
            ]
            net = tf.concat(skip_inputs, axis=3)
            net = _conv(net, _width(level), [3,3], 1,
                        slim.batch_norm, tf.nn.relu, 'conv1')
        _register(net, 'decoder_%dx%d ' % (doubled, doubled))

    with tf.variable_scope('decoder_out'):
        net = slim.conv2d(
            self.layers[-1], self.nout_ch, [1,1],
            stride=1,
            padding='SAME',
            weights_initializer=initializers.xavier_initializer_conv2d(),
            rate=1,
            activation_fn=tf.nn.sigmoid,
            scope='conv0')
    _register(net, 'decoder_out ')
def conv2d_tiny_complex(
        inputs,
        num_outputs,
        rate=1,
        padding='SAME',
        data_format=None,
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=initializers.xavier_initializer_conv2d(),
        weights_regularizer=None,
        biases_initializer=init_ops.zeros_initializer,
        biases_regularizer=None,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None,
):
    """Tiny Convolution 2d (four shifted 2x2 branches, concatenated).

    Four independent 2x2 VALID convolutions (dilation `rate`), each with
    `num_outputs // 4` output channels, are applied to `inputs`.  Each
    branch is zero-padded by `rate` on a different corner and the four
    padded results are concatenated along the last axis.

    Args:
        inputs: Input tensor (converted with `ops.convert_to_tensor`).
            The rank must be statically known and between 3 and 5.
            # NOTE(review): the paddings below are rank-4 only, so only 4-D
            # inputs appear usable in practice - confirm.
        num_outputs: Total number of output channels; each of the four
            branches produces `num_outputs // 4`.
        rate: Dilation rate of the 2x2 convolutions and padding amount.
        activation_fn, normalizer_fn, normalizer_params,
        weights_initializer, weights_regularizer, biases_initializer,
        biases_regularizer: Forwarded to every `slim.conv2d` branch.
        reuse: Forwarded to the enclosing variable scope.
        scope: Variable scope name; defaults to 'Conv'.
        padding, data_format, variables_collections, outputs_collections,
        trainable: Accepted but not used by this implementation.

    Returns:
        The concatenation of the four padded branch outputs.

    Raises:
        ValueError: If the rank of `inputs` is unknown or outside [3, 5].
    """
    with variable_scope.variable_scope(scope, 'Conv', [inputs], reuse=reuse):
        inputs = ops.convert_to_tensor(inputs)
        input_rank = inputs.get_shape().ndims
        if input_rank is None:
            raise ValueError('Rank of inputs must be known')
        if input_rank < 3 or input_rank > 5:
            raise ValueError(
                'Rank of inputs is %d, which is not >= 3 and <= 5' %
                input_rank)

        # First 2x2 convolution.
        num_outputs_inter = num_outputs // 4
        out_list = []
        # One padding per branch: the `rate` rows/columns removed by the
        # VALID convolution are put back on a different corner each time.
        paddings = [[[0, 0], [0, rate], [0, rate], [0, 0]],
                    [[0, 0], [0, rate], [rate, 0], [0, 0]],
                    [[0, 0], [rate, 0], [0, rate], [0, 0]],
                    [[0, 0], [rate, 0], [rate, 0], [0, 0]]]
        for i in range(4):
            output = slim.conv2d(
                inputs, num_outputs_inter, [2, 2],
                rate=rate,
                padding='VALID',
                activation_fn=activation_fn,
                normalizer_fn=normalizer_fn,
                normalizer_params=normalizer_params,
                weights_initializer=weights_initializer,
                weights_regularizer=weights_regularizer,
                # This used to receive `biases_regularizer`, so the
                # caller's initializer was ignored (and with the default
                # regularizer of None no bias was created at all).
                biases_initializer=biases_initializer,
                biases_regularizer=biases_regularizer,
                scope='conv_2x2_%i' % i)
            out_list.append(tf.pad(output, paddings[i], mode='CONSTANT'))
            print(out_list[-1].get_shape())

        # Concatenating outputs.
        # NOTE(review): (axis, values) is the pre-1.0 tf.concat argument
        # order; TF >= 1.0 expects tf.concat(values, axis) - confirm the
        # targeted TensorFlow version.
        output = tf.concat(input_rank - 1, out_list)
        return output
def conv2d_tiny(
        inputs,
        num_outputs,
        rate=1,
        padding='SAME',
        data_format=None,
        activation_fn=nn.relu,
        normalizer_fn=None,
        normalizer_params=None,
        weights_initializer=initializers.xavier_initializer_conv2d(),
        weights_regularizer=None,
        biases_initializer=init_ops.zeros_initializer,
        biases_regularizer=None,
        reuse=None,
        variables_collections=None,
        outputs_collections=None,
        trainable=True,
        scope=None,
):
    """Tiny Convolution 2d (two stacked 2x2 convolutions).

    A linear 2x2 VALID convolution (dilation `rate`, optional normalizer,
    no bias) is followed by zero padding of `rate` on both sides of axes 1
    and 2, and then by a second 2x2 VALID convolution (zero-initialized
    bias, no normalizer) to which `activation_fn` is applied.

    Args:
        inputs: Input tensor (converted with `ops.convert_to_tensor`).
            The rank must be statically known and between 3 and 5.
            # NOTE(review): the paddings below are rank-4 only, so only 4-D
            # inputs appear usable in practice - confirm.
        num_outputs: Number of output channels of both convolutions.
        rate: Dilation rate of both convolutions and padding amount.
        activation_fn: Activation of the second convolution (the first one
            is always linear).
        normalizer_fn, normalizer_params: Applied to the first convolution
            only.
        weights_initializer: Weight initializer of both convolutions.
        weights_regularizer, biases_regularizer: Forwarded to both
            convolutions.
        biases_initializer: Accepted but not used by this implementation:
            the first convolution has no bias and the second always uses
            `init_ops.zeros_initializer`.
        reuse: Forwarded to the enclosing variable scope.
        scope: Variable scope name; defaults to 'Conv'.
        padding, data_format, variables_collections, outputs_collections,
        trainable: Accepted but not used by this implementation.

    Returns:
        The output of the second convolution.

    Raises:
        ValueError: If the rank of `inputs` is unknown or outside [3, 5].
    """
    with variable_scope.variable_scope(scope, 'Conv', [inputs], reuse=reuse):
        inputs = ops.convert_to_tensor(inputs)
        input_rank = inputs.get_shape().ndims
        if input_rank is None:
            raise ValueError('Rank of inputs must be known')
        if input_rank < 3 or input_rank > 5:
            raise ValueError(
                'Rank of inputs is %d, which is not >= 3 and <= 5' %
                input_rank)

        # First 2x2 convolution: linear, normalized, bias-free.
        output = slim.conv2d(
            inputs, num_outputs, [2, 2],
            rate=rate,
            padding='VALID',
            activation_fn=None,
            normalizer_fn=normalizer_fn,
            normalizer_params=normalizer_params,
            # Honour the caller's initializer.  It used to be hard-coded to
            # xavier_initializer_conv2d(), which is still the default.
            weights_initializer=weights_initializer,
            weights_regularizer=weights_regularizer,
            biases_initializer=None,
            biases_regularizer=biases_regularizer,
            scope='conv_2x2')

        # Paddings + second convolution.
        paddings = [[0, 0], [rate, rate], [rate, rate], [0, 0]]
        output = tf.pad(output, paddings, mode='CONSTANT')

        output = slim.conv2d(
            output, num_outputs, [2, 2],
            rate=rate,
            padding='VALID',
            activation_fn=activation_fn,
            normalizer_fn=None,
            normalizer_params=None,
            weights_initializer=weights_initializer,
            weights_regularizer=weights_regularizer,
            biases_initializer=init_ops.zeros_initializer,
            biases_regularizer=biases_regularizer,
            scope='conv_concat')
        return output
def create_gazeprediction_network(frame_images,
                                  c3d_input,
                                  dropout_keep_prob=1.0,
                                  net=None):
    '''Build the gaze-map predictor that upsamples C3D features directly.

    The C3D feature vector at every spatial location is linearly projected
    to `dim_cnn_proj` channels. Each time step's projected 7x7 map is then
    passed through three transposed convolutions (7x7 -> 23x23 -> 49x49 ->
    49x49) and a final per-pixel linear layer down to one channel. No
    recurrent cell is involved in this variant.

    Args:
      frame_images: a [B x T x IH x IW x 3] tensor of frames. Only its static
        shape is used here (to read B and T and to validate the layout).
      c3d_input: a [B x T x 1024 x 7 x 7] tensor of C3D conv-map features.
      dropout_keep_prob: keep probability handed to `tf.nn.dropout`.
      net: optional dict that receives intermediate tensors under the keys
        'c3d_input', 'c3d_input_reshape', 'c3d_embedded' and
        'predicted_gazemaps'. A fresh dict is used when None.

    Returns:
      A [B x T x GH x GW] tensor of predicted gaze maps (the same tensor that
      is stored in `net['predicted_gazemaps']`).
    '''
    if net is not None:
        assert isinstance(net, dict)
    else:
        net = {}

    params = E()

    # (0) Static-shape sanity checks.
    gaze_h, gaze_w = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    img_h, img_w = CONSTANTS.image_height, CONSTANTS.image_width
    batch, steps = frame_images.get_shape().as_list()[:2]
    assert batch > 0 and steps > 0
    frame_images.get_shape().assert_is_compatible_with(
        [batch, steps, img_h, img_w, 3])
    c3d_input.get_shape().assert_is_compatible_with(
        [batch, steps, 1024, 7, 7])

    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # FIXME: badly named (it is really a channel count). It is not
    # referenced anywhere below in this function.
    rnn_state_size = 128

    # The RGP (Recurrent Gaze Prediction) model.
    # NOTE(review): everything down to the final transpose is kept inside
    # the "RGP" scope; confirm against the intended op/variable names.
    with tf.variable_scope("RGP"):
        # (2) C3D features: move channels last, then embed each location.
        net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
        c3d_channels_last = tf.transpose(net['c3d_input'],
                                         perm=[0, 1, 3, 4, 2],
                                         name='c3d_input_reshape')
        net['c3d_input_reshape'] = c3d_channels_last  # [B x T x 7 x 7 x 1024]
        log.info('c3d_input_reshape shape : %s',
                 c3d_channels_last.get_shape().as_list())
        c3d_channels_last.get_shape().assert_is_compatible_with(
            [batch, steps, 7, 7, 1024])

        # Linear projection 1024 -> dim_cnn_proj, shared by all locations.
        params.proj_c3d_W = tf.Variable(
            tf.random_uniform([1024, dim_cnn_proj], -0.1, 0.1),
            name="proj_c3d_W")
        params.proj_c3d_b = tf.Variable(
            tf.random_uniform([dim_cnn_proj], -0.1, 0.1),
            name="proj_c3d_b")

        # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]
        flat_features = tf.reshape(c3d_channels_last, [-1, 1024])
        embedded = tf.nn.xw_plus_b(flat_features,
                                   params.proj_c3d_W,
                                   params.proj_c3d_b)
        if dropout_keep_prob != 1.0:
            embedded = tf.nn.dropout(embedded, dropout_keep_prob)

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        embedded = tf.reshape(embedded, [batch, steps, 7, 7, dim_cnn_proj])
        net['c3d_embedded'] = embedded
        log.info('c3d_embedded shape : %s', embedded.get_shape().as_list())
        embedded.get_shape().assert_is_compatible_with(
            [batch, steps, 7, 7, dim_cnn_proj])

        # Instead of an RNN, each time step's embedded map goes straight
        # into the deconvolution stack: T tensors of [B x 7 x 7 x 512].
        rcn_outputs = [
            net['c3d_embedded'][:, i, :, :, :] for i in range(steps)
        ]

        # (3) Upsampling filters. `conv2d_transpose` filters are laid out as
        # [height, width, out_channels, in_channels].
        params.upsampling_filter1 = tf.get_variable(
            'Upsampling/weight1',
            [5, 5, 64, dim_cnn_proj],  # directly project 512 -> 64
            initializer=initializers.xavier_initializer_conv2d(uniform=True))
        params.upsampling_filter2 = tf.get_variable(
            'Upsampling/weight2',
            [5, 5, 32, 64],
            initializer=initializers.xavier_initializer_conv2d(uniform=True))
        params.upsampling_filter3 = tf.get_variable(
            'Upsampling/weight3',
            [7, 7, 12, 32],
            initializer=initializers.xavier_initializer_conv2d(uniform=True))

        # Per-pixel read-out 12 -> 1.
        params.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1),
                                   name="out_W")
        params.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1),
                                   name="out_b")

        predicted_gazemaps = []
        for i, feature_map in enumerate(rcn_outputs):
            # The same requested name is used for all three deconvolutions
            # of a time step, exactly as before.
            op_name = 'upsampled_rcn_output_' + str(i)

            # [B x 7 x 7 x 512] -> [B x 23 x 23 x 64]
            upsampled = tf.nn.conv2d_transpose(
                feature_map,
                params.upsampling_filter1,
                output_shape=[batch, 23, 23, 64],
                strides=[1, 3, 3, 1],
                padding='VALID',
                name=op_name)
            # [B x 23 x 23 x 64] -> [B x 49 x 49 x 32]
            upsampled = tf.nn.conv2d_transpose(
                upsampled,
                params.upsampling_filter2,
                output_shape=[batch, 49, 49, 32],
                strides=[1, 2, 2, 1],
                padding='VALID',
                name=op_name)

            merged = tf.concat(
                concat_dim=3,  # the last dimension
                values=[
                    upsampled,  # [B x 49 x 49 x 32]
                    # net['frm_sal_cubic'][:, i, :, :, :], # [B x 49 x 49 x 1]
                    # last_output_gazemap # [B x 49 x 49 x 1]
                ])

            # [B x 49 x 49 x 32] -> [B x 49 x 49 x 12]
            refined = tf.nn.conv2d_transpose(
                merged,
                params.upsampling_filter3,
                output_shape=[batch, 49, 49, 12],
                strides=[1, 1, 1, 1],
                padding='SAME',
                name=op_name)

            # Per-pixel linear read-out: [(B*49*49) x 12] -> [(B*49*49) x 1]
            logits = tf.nn.xw_plus_b(tf.reshape(refined, [-1, 12]),
                                     params.out_W,
                                     params.out_b)
            logits = tf.nn.dropout(logits, dropout_keep_prob)

            # [(B*49*49) x 1] -> [B x GH x GW]
            predicted_gazemaps.append(
                tf.reshape(logits, [batch, gaze_h, gaze_w]))
            # TODO should we normalize predicted_gazemap ?

        # Pack as one tensor: T-list of [B x GH x GW] -> [B x T x GH x GW].
        stacked = tf.pack(predicted_gazemaps)
        net['predicted_gazemaps'] = tf.transpose(stacked,
                                                 [1, 0, 2, 3],
                                                 name='predicted_gazemaps')
        net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
            [batch, steps, gaze_h, gaze_w])

    return net['predicted_gazemaps']
def create_gazeprediction_network(frame_images, c3d_input,
                                  dropout_keep_prob=1.0, net=None):
    '''Build the recurrent gaze-map prediction graph (GRU-RCN variant).

    C3D features are embedded per spatial location, run through a
    `GRU_RCN_Cell` over the T time steps, and each step's cell output is
    batch-normalized, upsampled by three transposed convolutions
    (7x7 -> 23x23 -> 49x49 -> 49x49) and reduced to one channel.

    Args:
        frame_images: a [B x T x IH x IW x 3] tensor (frame images). Only its
            static shape is used here, to read B and T and to validate the
            layout.
        c3d_input : a [B x T x 1024 x 7 x 7] tensor for C3D convmap features
        dropout_keep_prob : float tensor (optional), keep probability handed
            to `tf.nn.dropout`
        net : a dictionary to get intra-layer activations or tensors. The
            keys 'c3d_input', 'c3d_input_reshape', 'c3d_embedded',
            'rcn_outputs' and 'predicted_gazemaps' are written. A new dict
            is used when None.

    Outputs:
        predicted_gazemaps : a [B x T x GH x GW] tensor, predicted gaze maps
            per frame (also stored in net['predicted_gazemaps'])
    '''
    if net is None:
        net = {}
    else:
        assert isinstance(net, dict)

    # Plain attribute holder for the variables created below.
    # NOTE(review): `E` is defined elsewhere; also note that this local name
    # shadows the builtin `vars`.
    vars = E()

    # (0) input sanity check
    GH, GW = CONSTANTS.gazemap_height, CONSTANTS.gazemap_width
    IH, IW = CONSTANTS.image_height, CONSTANTS.image_width
    # B (batch) and T (time steps) are read from the static shape.
    B, T = frame_images.get_shape().as_list()[:2]
    assert B > 0 and T > 0

    frame_images.get_shape().assert_is_compatible_with([B, T, IH, IW, 3])
    c3d_input.get_shape().assert_is_compatible_with([B, T, 1024, 7, 7])

    # Channel count of the embedded C3D features.
    dim_cnn_proj = 512  # XXX FIXME (see __init__ in GazePredictionGRU)

    # some variables
    # --------------
    # not a proper name, it should be rnn_state_feature_size in
    # GRCN????????? FIXME
    # Used below as the channel count of the recurrent cell output.
    rnn_state_size = 128  # dim_cnn_proj # filter size is more correct name

    ''' The RGP (Recurrent Gaze Prediction) model. '''
    with tf.variable_scope("RGP"):

        # (2) C3D
        # -------
        # a. embed the 1024 channels of every 7x7 location into dim_cnn_proj
        #    channels with a shared linear projection
        # b. apply the RCN cell over time and collect its
        #    [B x 7 x 7 x rnn_state_size] output at every step

        # c3d input.
        net['c3d_input'] = c3d_input  # [B x T x 1024 x 7 x 7]
        # change axis order to [B x T x 7 x 7 x 1024]
        net['c3d_input_reshape'] = tf.transpose(net['c3d_input'],
                                                perm=[0, 1, 3, 4, 2],
                                                name='c3d_input_reshape')
        log.info('c3d_input_reshape shape : %s',
                 net['c3d_input_reshape'].get_shape().as_list())
        net['c3d_input_reshape'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, 1024])

        # c3d_embedded: project each 1024-d feature (per location of the 7x7
        # c3d conv-feature map) into dim_cnn_proj dimensions
        vars.proj_c3d_W = tf.Variable(tf.random_uniform(
            [1024, dim_cnn_proj], -0.1, 0.1), name="proj_c3d_W")
        vars.proj_c3d_b = tf.Variable(tf.random_uniform([dim_cnn_proj],
                                                        -0.1, 0.1),
                                      name="proj_c3d_b")

        net['c3d_embedded'] = tf.nn.xw_plus_b(
            tf.reshape(net['c3d_input_reshape'], [-1, 1024]),
            vars.proj_c3d_W, vars.proj_c3d_b
        )  # [(B*T*7*7) x 1024] --> [(B*T*7*7) x dim_cnn_proj]

        # Dropout on the embedding is skipped only when the default keep
        # probability of 1.0 is passed.
        if dropout_keep_prob != 1.0:
            net['c3d_embedded'] = tf.nn.dropout(net['c3d_embedded'],
                                                dropout_keep_prob)

        # --> [B x T x 7 x 7 x dim_cnn_proj]
        net['c3d_embedded'] = tf.reshape(net['c3d_embedded'],
                                         [B, T, 7, 7, dim_cnn_proj])
        log.info('c3d_embedded shape : %s',
                 net['c3d_embedded'].get_shape().as_list())
        net['c3d_embedded'].get_shape().assert_is_compatible_with(
            [B, T, 7, 7, dim_cnn_proj])

        # The RNN Part.
        # -------------
        # NOTE(review): `GRU_RCN_Cell` is defined elsewhere; the meaning of
        # its two constructor arguments is assumed to be (output channels,
        # input channels) -- confirm. `scope` is not used afterwards.
        with tf.variable_scope('RCNBottom') as scope:
            vars.lstm_u = GRU_RCN_Cell(rnn_state_size, dim_cnn_proj)

            state_u = vars.lstm_u.zero_state(B, tf.float32)
            log.info('RNN state shape : %s', state_u.get_shape().as_list())

            # This list is re-created before the upsampling loop below, so
            # this first assignment has no effect.
            predicted_gazemaps = []
            # The same list object is exposed through `net` and filled below.
            net['rcn_outputs'] = rcn_outputs = []

            # n_lstm_step for example, 35. -> 42 has highest performance
            for i in range(T):  # T = number of timesteps
                # From the second step on, switch the current variable scope
                # to reuse so the cell's variables are shared across time
                # steps. The loop is laid out inside 'RCNBottom' so the flag
                # does not leak into the enclosing "RGP" scope, where new
                # variables are still created with tf.get_variable further
                # down.
                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                # We use cnn embedding + ... as RNN input
                # (as a flatted/concatenated vector)
                # Only one tensor is listed, so the concat just forwards the
                # embedded map of step i.
                rnn_input = tf.concat(
                    values=[
                        # (i) C3D map (embedded into 7x7xdim_cnn_proj)
                        net['c3d_embedded'][:, i, :, :, :],
                    ],
                    axis=3,  # the channel axis of the [B x 7 x 7 x C] slice
                    name='rnn_input' + str(i))

                # with tf.variable_scope("RNN"):
                output_u, state_u = vars.lstm_u(rnn_input, state_u)

                # at time t
                output_u.get_shape().assert_is_compatible_with(
                    [B, 7, 7, rnn_state_size])  # [B x 7 x 7 x 128]
                rcn_outputs.append(output_u)

        # (3) RCN output unpooling to 49x49 size
        # each [7 x 7 x rnn_state_size] map is up-sampled to [49 x 49 x 12].
        # conv2d_transpose filters are [height, width, out_ch, in_ch].
        vars.upsampling_filter1 = tf.get_variable(
            'Upsampling/weight1',
            [5, 5, 64, rnn_state_size
             ],  # rnn_state_size bad name (indeed a channel size)
            initializer=initializers.xavier_initializer_conv2d(
                uniform=True))
        vars.upsampling_filter2 = tf.get_variable(
            'Upsampling/weight2',
            [5, 5, 32, 64
             ],
            initializer=initializers.xavier_initializer_conv2d(
                uniform=True))
        vars.upsampling_filter3 = tf.get_variable(
            'Upsampling/weight3',
            [7, 7, 12, 32
             ],
            initializer=initializers.xavier_initializer_conv2d(
                uniform=True))

        # Per-pixel linear read-out 12 -> 1.
        vars.out_W = tf.Variable(tf.random_uniform([12, 1], -0.1, 0.1),
                                 name="out_W")
        vars.out_b = tf.Variable(tf.random_uniform([1], -0.1, 0.1),
                                 name="out_b")

        predicted_gazemaps = []
        # Batch normalization assumption (if wrong fix): apply before each
        # convolutional layer
        for i in range(T):
            rcn_output_map = rcn_outputs[i]  # [B x 7 x 7 x 128]

            # for now in here - later will add to base:
            # batch_mean, batch_var = tf.nn.moments(rcn_output_map, axes = [0,1,2]) #global normalization for conv_filters
            # what to do with offset and scale?
            # NOTE(review): the layer is called with default arguments and
            # without a name inside the loop. Per the TF docs `training`
            # defaults to False, and an unnamed layer presumably gets its own
            # variables on every call (i.e. per time step) -- confirm that
            # both are intended.
            rcn_output_map = tf.layers.batch_normalization(rcn_output_map)

            # [B x 7 x 7 x 128] -> [B x 23 x 23 x 64]
            rcn_upsampled_output = tf.nn.conv2d_transpose(
                rcn_output_map,
                vars.upsampling_filter1,
                output_shape=[B, 23, 23, 64],
                strides=[1, 3, 3, 1],
                padding='VALID',
                name='upsampled_rcn_output_' + str(i))
            #rcn_upsampled_output.get_shape().assert_is_compatible_with([B, GH, GW, upsampling_output_channel])
            # [B x 23 x 23 x 64] -> [B x 49 x 49 x 32]
            rcn_upsampled_output = tf.nn.conv2d_transpose(
                rcn_upsampled_output,
                vars.upsampling_filter2,
                output_shape=[B, 49, 49, 32],
                strides=[1, 2, 2, 1],
                padding='VALID',
                name='upsampled_rcn_output_' + str(i))

            # Only one tensor is listed; the other inputs are commented out.
            input_concat = tf.concat(
                axis=3,  # the last dimension
                values=[
                    # [B x 49 x 49 x 32]
                    rcn_upsampled_output,
                    # net['frm_sal_cubic'][:, i, :, :, :], # [B x 49 x 49 x 1]
                    # last_output_gazemap # [B x 49 x 49 x 1]
                ])

            # [B x 49 x 49 x 32] -> [B x 49 x 49 x 12]
            output = tf.nn.conv2d_transpose(input_concat,
                                            vars.upsampling_filter3,
                                            output_shape=[B, 49, 49, 12],
                                            strides=[1, 1, 1, 1],
                                            padding='SAME',
                                            name='upsampled_rcn_output_' + str(i))

            # [(B*49*49) x 12] -> [(B*49*49) x 1]
            output = tf.nn.xw_plus_b(tf.reshape(output, [-1, 12]),
                                     vars.out_W, vars.out_b)
            # Unlike the embedding above, dropout is applied here
            # unconditionally.
            output = tf.nn.dropout(output, dropout_keep_prob)

            # [(B*49*49) x 1] -> [B x GH x GW]; this only works when
            # GH * GW == 49 * 49.
            predicted_gazemap = tf.reshape(output, [B, GH, GW])
            predicted_gazemaps.append(predicted_gazemap)
            # TODO should we normalize predicted_gazemap ????????????????????????????

        # pack as a tensor
        # T-list of [B x GH x GW] --> [T x B x GH x GW] --> [B x T x GH x GW]
        # NOTE(review): kept inside the "RGP" scope; confirm against the
        # intended op name.
        net['predicted_gazemaps'] = tf.transpose(
            tf.stack(predicted_gazemaps), [1, 0, 2, 3],
            name='predicted_gazemaps')
        net['predicted_gazemaps'].get_shape().assert_is_compatible_with(
            [B, T, GH, GW])

    return net['predicted_gazemaps']