def test_die_on_tensor_shape_with_rank_three(self):
  tensor_shape = tf.TensorShape(dims=[32, 299, 384])
  with self.assertRaises(ValueError):
    static_shape.get_batch_size(tensor_shape)
    static_shape.get_height(tensor_shape)
    static_shape.get_width(tensor_shape)
    static_shape.get_depth(tensor_shape)
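# The test above only exercises the accessors; for reference, a minimal sketch
# of what the static_shape helpers are assumed to look like. The rank-4
# assertion is what makes the rank-three TensorShape above raise ValueError.
# This is an illustration based on the tests, not a verbatim copy of the
# library code.
def get_batch_size(tensor_shape):
  """Returns the static batch size (dim 0) of a rank-4 TensorShape."""
  tensor_shape.assert_has_rank(rank=4)  # raises ValueError for other ranks
  return tensor_shape[0].value


def get_height(tensor_shape):
  """Returns the static height (dim 1) of a rank-4 TensorShape."""
  tensor_shape.assert_has_rank(rank=4)
  return tensor_shape[1].value


def get_width(tensor_shape):
  """Returns the static width (dim 2) of a rank-4 TensorShape."""
  tensor_shape.assert_has_rank(rank=4)
  return tensor_shape[2].value


def get_depth(tensor_shape):
  """Returns the static depth/channels (dim 3) of a rank-4 TensorShape."""
  tensor_shape.assert_has_rank(rank=4)
  return tensor_shape[3].value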
def build(self, input_shapes):
  """Creates the variables of the layer."""
  if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]):
    raise ValueError('This box predictor was constructed with %d heads, '
                     'but there are %d inputs.' %
                     (len(self._prediction_heads[BOX_ENCODINGS]),
                      len(input_shapes)))
  for stack_index, input_shape in enumerate(input_shapes):
    net = []

    # Add additional conv layers before the class predictor.
    features_depth = static_shape.get_depth(input_shape)
    depth = max(min(features_depth, self._max_depth), self._min_depth)
    tf.logging.info(
        'depth of additional conv before box predictor: {}'.format(depth))

    if depth > 0 and self._num_layers_before_predictor > 0:
      for i in range(self._num_layers_before_predictor):
        net.append(keras.Conv2D(depth, [1, 1],
                                name='SharedConvolutions_%d/Conv2d_%d_1x1_%d'
                                % (stack_index, i, depth),
                                padding='SAME',
                                **self._conv_hyperparams.params()))
        net.append(self._conv_hyperparams.build_batch_norm(
            training=(self._is_training and not self._freeze_batchnorm),
            name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm'
            % (stack_index, i, depth)))
        net.append(self._conv_hyperparams.build_activation_layer(
            name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation'
            % (stack_index, i, depth)))

    # Until certain bugs are fixed in checkpointable lists,
    # this net must be appended only once it's been filled with layers.
    self._shared_nets.append(net)
  self.built = True
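# A hedged sketch (not the library code) of how the per-feature-map layer
# lists built above are presumably consumed at prediction time: each feature
# map is pushed through its own shared conv/batch-norm/activation stack
# before being handed to the matching prediction heads. The method name
# `_predict` and the callable-head interface are assumptions for illustration.
def _predict(self, image_features, **kwargs):
  box_encodings = []
  class_predictions = []
  for index, image_feature in enumerate(image_features):
    net = image_feature
    # Apply the shared stack built for this feature map in build() above.
    for layer in self._shared_nets[index]:
      net = layer(net)
    box_encodings.append(
        self._prediction_heads[BOX_ENCODINGS][index](net))
    class_predictions.append(
        self._prediction_heads[CLASS_PREDICTIONS_WITH_BACKGROUND][index](net))
  return {BOX_ENCODINGS: box_encodings,
          CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions}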
def pad_to_multiple(tensor, multiple):
  """Returns the tensor zero padded to the specified multiple.

  Appends 0s to the end of the first and second dimension (height and width)
  of the tensor until both dimensions are a multiple of the input argument
  'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input
  multiple of 4, PadToMultiple will append 0s so that the resulting tensor
  will be of shape [1, 4, 8, 1].

  Args:
    tensor: rank 4 float32 tensor, where
      tensor -> [batch_size, height, width, channels].
    multiple: the multiple to pad to.

  Returns:
    padded_tensor: the tensor zero padded to the specified multiple.
  """
  tensor_shape = tensor.get_shape()
  batch_size = static_shape.get_batch_size(tensor_shape)
  tensor_height = static_shape.get_height(tensor_shape)
  tensor_width = static_shape.get_width(tensor_shape)
  tensor_depth = static_shape.get_depth(tensor_shape)

  if batch_size is None:
    batch_size = tf.shape(tensor)[0]

  if tensor_height is None:
    tensor_height = tf.shape(tensor)[1]
    padded_tensor_height = tf.to_int32(
        tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_height = int(
        math.ceil(float(tensor_height) / multiple) * multiple)

  if tensor_width is None:
    tensor_width = tf.shape(tensor)[2]
    padded_tensor_width = tf.to_int32(
        tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_width = int(
        math.ceil(float(tensor_width) / multiple) * multiple)

  if tensor_depth is None:
    tensor_depth = tf.shape(tensor)[3]

  # Use tf.concat instead of tf.pad to preserve static shape.
  if padded_tensor_height != tensor_height:
    height_pad = tf.zeros([
        batch_size, padded_tensor_height - tensor_height, tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, height_pad], 1)
  if padded_tensor_width != tensor_width:
    width_pad = tf.zeros([
        batch_size, padded_tensor_height, padded_tensor_width - tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, width_pad], 2)

  return tensor
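# A small usage sketch of pad_to_multiple, mirroring the example in the
# docstring above: a [1, 3, 5, 1] tensor padded to a multiple of 4 becomes
# [1, 4, 8, 1]. Assumes a TF1-style graph/session, matching the tf.to_int32 /
# slim era of the surrounding code; values are purely illustrative.
import numpy as np
import tensorflow as tf

image = tf.constant(np.ones((1, 3, 5, 1), dtype=np.float32))
padded = pad_to_multiple(image, multiple=4)
with tf.Session() as sess:
  out = sess.run(padded)
print(out.shape)  # expected: (1, 4, 8, 1); the appended rows/columns are 0.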
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor') if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) batch_size = static_shape.get_batch_size(image_features.get_shape()) if batch_size is None: features_height = static_shape.get_height(image_features.get_shape()) features_width = static_shape.get_width(image_features.get_shape()) flattened_predictions_size = (features_height * features_width * num_predictions_per_location) box_encodings = tf.reshape( box_encodings, [-1, flattened_predictions_size, 1, self._box_code_size]) class_predictions_with_background = tf.reshape( class_predictions_with_background, [-1, flattened_predictions_size, num_class_slots]) else: box_encodings = tf.reshape( box_encodings, [batch_size, -1, 1, self._box_code_size]) class_predictions_with_background = tf.reshape( class_predictions_with_background, [batch_size, -1, num_class_slots]) return {BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background}
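# Worked example of the reshape at the end of _predict above. The conv
# predictor emits a [batch, H, W, num_predictions_per_location * code_size]
# tensor; flattening the spatial grid gives
# num_anchors = feat_height * feat_width * num_predictions_per_location,
# as stated in the docstring. The numbers below are illustrative, not taken
# from any particular model config.
batch_size = 8
feat_height, feat_width = 19, 19
num_predictions_per_location = 6
box_code_size = 4

num_anchors = feat_height * feat_width * num_predictions_per_location
print(num_anchors)  # 19 * 19 * 6 = 2166

# Shapes before and after the reshape:
conv_output_shape = [batch_size, feat_height, feat_width,
                     num_predictions_per_location * box_code_size]
box_encodings_shape = [batch_size, num_anchors, 1, box_code_size]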
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) # Add a slot for the background class. num_class_slots = self.num_classes + 1 net_cls = image_features net_reg = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): if self._use_depthwise_before_predictor: net_cls = slim.separable_conv2d( net_cls, None, [3, 3], depth_multiplier=1, padding='SAME', reuse=tf.AUTO_REUSE, scope='Conv2d_cls_%d_3x3_%d_depthwise' % (i, depth)) net_cls = slim.conv2d(net_cls, depth, [1, 1], padding='SAME', reuse=tf.AUTO_REUSE, scope='Conv2d_cls_%d_1x1_%d' % (i, depth)) net_reg = slim.separable_conv2d( net_reg, None, [3, 3], depth_multiplier=1, padding='SAME', reuse=tf.AUTO_REUSE, scope='Conv2d_reg_%d_3x3_%d_depthwise' % (i, depth)) net_reg = slim.conv2d(net_reg, depth, [1, 1], padding='SAME', reuse=tf.AUTO_REUSE, scope='Conv2d_reg_%d_1x1_%d' % (i, depth)) else: net_cls = slim.conv2d(net_cls, depth, [3, 3], padding='SAME', reuse=tf.AUTO_REUSE, scope='Conv2d_cls_%d_3x3_%d' % (i, depth)) net_reg = slim.conv2d(net_reg, depth, [3, 3], padding='SAME', reuse=tf.AUTO_REUSE, scope='Conv2d_reg_%d_3x3_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): box_encodings = slim.conv2d( net_reg, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], reuse=tf.AUTO_REUSE, scope='BoxEncodingPredictor') if self._use_dropout: net_cls = slim.dropout(net_cls, keep_prob=self._dropout_keep_prob) class_predictions_with_background = slim.conv2d( net_cls, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], reuse=tf.AUTO_REUSE, scope='ClassPredictor') if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( image_features) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) return { BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background }
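# The reshapes above rely on shape_utils.combined_static_and_dynamic_shape.
# A minimal sketch of what that helper is assumed to do: prefer static
# (Python int) dimensions where they are known and fall back to dynamic
# tf.shape() slices otherwise, so the reshape keeps as much static shape
# information as possible. This is a paraphrase, not the library source.
def combined_static_and_dynamic_shape_sketch(tensor):
  static_shape_list = tensor.shape.as_list()
  dynamic_shape = tf.shape(tensor)
  combined = []
  for index, dim in enumerate(static_shape_list):
    combined.append(dim if dim is not None else dynamic_shape[index])
  return combined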
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ features_height = static_shape.get_height(image_features.get_shape()) features_width = static_shape.get_width(image_features.get_shape()) features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with tf.variable_scope('gradientconv2d'): box_encodings = gradient_conv2d411( inputs=net, inputs_hwc=[ features_height, features_width, features_depth ], num_outputs=num_predictions_per_location * self._box_code_size, scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) class_predictions_with_background = gradient_conv2d411( inputs=net, inputs_hwc=[ features_height, features_width, features_depth ], num_outputs=num_predictions_per_location * num_class_slots, scope='ClassPredictor') if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( image_features) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) return { BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background }
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, q, code_size] representing the location of the objects, where q is 1 or the number of classes. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. """ box_encodings_list = [] class_predictions_list = [] # TODO(rathodv): Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] for (image_feature, num_predictions_per_location, box_predictor_scope) in zip(image_features, num_predictions_per_location_list, box_predictor_scopes): with box_predictor_scope: # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_feature with slim.arg_scope(self._conv_hyperparams_fn()), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth( image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}'. 
format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): if self._use_depthwise: box_encodings = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='BoxEncodingPredictor_depthwise') box_encodings = slim.conv2d( box_encodings, num_predictions_per_location * self._box_code_size, [1, 1], scope='BoxEncodingPredictor') else: box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout( net, keep_prob=self._dropout_keep_prob) if self._use_depthwise: class_predictions_with_background = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='ClassPredictor_depthwise') class_predictions_with_background = slim.conv2d( class_predictions_with_background, num_predictions_per_location * num_class_slots, [1, 1], scope='ClassPredictor') else: class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor', biases_initializer=tf.constant_initializer( self._class_prediction_bias_init)) if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = ( shape_utils.combined_static_and_dynamic_shape( image_feature)) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) box_encodings_list.append(box_encodings) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) class_predictions_list.append( class_predictions_with_background) return { BOX_ENCODINGS: box_encodings_list, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list }
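# The scope handling above uses _NoopVariableScope so that a single feature
# map keeps the original (un-prefixed) variable names while multiple feature
# maps get BoxPredictor_{i} prefixes, staying compatible with existing
# checkpoints. A minimal sketch of such a no-op context manager, written here
# as an assumption about its behavior:
class _NoopVariableScope(object):
  """A context manager that pushes no variable scope."""

  def __enter__(self):
    return None

  def __exit__(self, exc_type, exc_value, traceback):
    return False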
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_features end_points_collection = self._scope.name + '_end_points' with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training), \ slim.arg_scope([slim.conv2d], trainable=self._is_training, outputs_collections=end_points_collection): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor') if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( image_features) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) # TODO: If TF's version is updated, just use clear_collection argument for # convert_collection_to_dict (current: 1.2.1). end_points = slim.utils.convert_collection_to_dict( end_points_collection) framework_ops.get_default_graph().clear_collection( end_points_collection) return { BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background }
def test_return_correct_depth(self):
  tensor_shape = tf.TensorShape(dims=[32, 299, 384, 3])
  self.assertEqual(3, static_shape.get_depth(tensor_shape))
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, q, code_size] representing the location of the objects, where q is 1 or the number of classes. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. """ box_encodings_list = [] class_predictions_list = [] # TODO(rathodv): Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] for (image_feature, num_predictions_per_location, box_predictor_scope) in zip( image_features, num_predictions_per_location_list, box_predictor_scopes): with box_predictor_scope: # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_feature with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth(image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info('depth of additional conv before box predictor: {}'. 
format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): if self._use_depthwise: box_encodings = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='BoxEncodingPredictor_depthwise') box_encodings = slim.conv2d( box_encodings, num_predictions_per_location * self._box_code_size, [1, 1], scope='BoxEncodingPredictor') else: box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) if self._use_depthwise: class_predictions_with_background = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='ClassPredictor_depthwise') class_predictions_with_background = slim.conv2d( class_predictions_with_background, num_predictions_per_location * num_class_slots, [1, 1], scope='ClassPredictor') else: class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor', biases_initializer=tf.constant_initializer( self._class_prediction_bias_init)) if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = (shape_utils. combined_static_and_dynamic_shape( image_feature)) box_encodings = tf.reshape( box_encodings, tf.stack([combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size])) box_encodings_list.append(box_encodings) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots])) class_predictions_list.append(class_predictions_with_background) return { BOX_ENCODINGS: box_encodings_list, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list }
def _predict_class(self, image_features, activation_fn=None, with_background=False, scope=None): """Computes encoded object classes (without background). Flattens image_features and applies fully connected ops (with no non-linearity) to predict class predictions. In this setting, anchors are not spatially arranged in any way and are assumed to have been folded into the batch dimension. Thus we output 1 for the anchors dimension. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. Returns: A dictionary containing the following tensors. class_predictions: A float tensor of shape [batch_size, 1, num_classes] representing the class predictions for the proposals. If predict_masks is True the dictionary also contains: instance_masks: A float tensor of shape [batch_size, 1, num_classes, image_height, image_width] If predict_keypoints is True the dictionary also contains: keypoints: [batch_size, 1, num_keypoints, 2] Raises: ValueError: if num_predictions_per_location is not 1. """ num_classes = self.num_classes if with_background: num_classes += 1 features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) roi_features = image_features if self._spatial_average: roi_features = tf.reduce_mean(roi_features, [1, 2], keep_dims=True, name='AvgPool') # net = slim.flatten(roi_features) n_batch = roi_features.get_shape().as_list()[0] if n_batch == None: is_empty = tf.equal(tf.size(roi_features), 0) h = roi_features.get_shape().as_list()[1] w = roi_features.get_shape().as_list()[2] c = roi_features.get_shape().as_list()[3] net = tf.cond(is_empty, lambda: tf.zeros([0, h * w * c], tf.float32), lambda: slim.flatten(roi_features)) else: net = slim.flatten(roi_features) end_points_collection = self._scope.name + '_end_points' with slim.arg_scope(self._fc_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training), \ slim.arg_scope([slim.fully_connected], trainable=self._is_training, outputs_collections=end_points_collection): # Add additional fc layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.fully_connected(net, depth, scope='FC_%d_%d' % (i, depth)) if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) class_predictions = slim.fully_connected( net, num_classes, activation_fn=activation_fn, scope='ClassPredictor') class_predictions = tf.reshape(class_predictions, [-1, 1, num_classes]) if with_background: predictions_dict = { CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions } else: predictions_dict = {CLASS_PREDICTIONS: class_predictions} end_points = slim.utils.convert_collection_to_dict( end_points_collection) framework_ops.get_default_graph().clear_collection( end_points_collection) return predictions_dict
def _predict(self, image_features, num_predictions_per_location_list, audio_features=None): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, q, code_size] representing the location of the objects, where q is 1 or the number of classes. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. """ predictions = { BOX_ENCODINGS: [], CLASS_PREDICTIONS_WITH_BACKGROUND: [], } for head_name in self._other_heads.keys(): predictions[head_name] = [] # TODO(rathodv): Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] if (audio_features != None): for (image_feature, num_predictions_per_location, box_predictor_scope) in zip( image_features, num_predictions_per_location_list, box_predictor_scopes): net = image_feature audio_feature = audio_features['fc5'] with box_predictor_scope: with slim.arg_scope(self._conv_hyperparams_fn()): with slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. 
features_depth = static_shape.get_depth( image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}' .format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range( self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], reuse=tf.AUTO_REUSE, scope='Conv2d_%d_1x1_%d' % (i, depth)) # M: video + audio shape = net.get_shape() print("before fusion: feature_map", net.get_shape()) print("before fusion: audio_feature", audio_feature.get_shape()) # for test #inputs = np.random.rand(4,1,1,256).astype(np.float32) #audio_feature = [tf.placeholder_with_default(v, v.shape) for v in inputs] extended_audio = tf.tile( audio_feature, [1, shape[1], shape[2], 1]) f_net = tf.concat([net, extended_audio], 3) print("after fusion: new feature_map", f_net.get_shape()) # do 1 x 1 convolution # check out the code in the old model #f_net = slim.conv2d(f_net, shape[3], [1, 1], scope='fusion') sorted_keys = sorted(self._other_heads.keys()) sorted_keys.append(BOX_ENCODINGS) sorted_keys.append( CLASS_PREDICTIONS_WITH_BACKGROUND) for head_name in sorted_keys: if head_name == BOX_ENCODINGS: head_obj = self._box_prediction_head prediction = head_obj.predict( features=net, num_predictions_per_location= num_predictions_per_location) elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND: head_obj = self._class_prediction_head prediction = head_obj.predict( features=f_net, num_predictions_per_location= num_predictions_per_location) else: head_obj = self._other_heads[head_name] prediction = head_obj.predict( features=net, num_predictions_per_location= num_predictions_per_location) predictions[head_name].append(prediction) else: for (image_feature, num_predictions_per_location, box_predictor_scope) in zip( image_features, num_predictions_per_location_list, box_predictor_scopes): net = image_feature with box_predictor_scope: with slim.arg_scope(self._conv_hyperparams_fn()): with slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth( image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}' .format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range( self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], reuse=tf.AUTO_REUSE, scope='Conv2d_%d_1x1_%d' % (i, depth)) sorted_keys = sorted(self._other_heads.keys()) sorted_keys.append(BOX_ENCODINGS) sorted_keys.append( CLASS_PREDICTIONS_WITH_BACKGROUND) for head_name in sorted_keys: if head_name == BOX_ENCODINGS: head_obj = self._box_prediction_head elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND: head_obj = self._class_prediction_head else: head_obj = self._other_heads[head_name] prediction = head_obj.predict( features=net, num_predictions_per_location= num_predictions_per_location) predictions[head_name].append(prediction) return predictions
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: A dictionary containing: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, q, code_size] representing the location of the objects, where q is 1 or the number of classes. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. (optional) Predictions from other heads. """ predictions = { BOX_ENCODINGS: [], CLASS_PREDICTIONS_WITH_BACKGROUND: [], } for head_name in self._other_heads.keys(): predictions[head_name] = [] # TODO(rathodv): Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] for (image_feature, num_predictions_per_location, box_predictor_scope) in zip( image_features, num_predictions_per_location_list, box_predictor_scopes): net = image_feature with box_predictor_scope: with slim.arg_scope(self._conv_hyperparams_fn()): with slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth(image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info('depth of additional conv before box predictor: {}'. format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], reuse=tf.AUTO_REUSE, scope='Conv2d_%d_1x1_%d' % (i, depth)) sorted_keys = sorted(self._other_heads.keys()) sorted_keys.append(BOX_ENCODINGS) sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND) for head_name in sorted_keys: if head_name == BOX_ENCODINGS: head_obj = self._box_prediction_head elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND: head_obj = self._class_prediction_head else: head_obj = self._other_heads[head_name] prediction = head_obj.predict( features=net, num_predictions_per_location=num_predictions_per_location) predictions[head_name].append(prediction) return predictions
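# The head-keyed predictor above calls
# head_obj.predict(features=..., num_predictions_per_location=...) for every
# key in a fixed, sorted order so variables are created deterministically.
# A minimal sketch of the head interface this assumes (illustrative names,
# not the library's head implementation; the real heads presumably also
# reshape their output to [batch, num_anchors, ...]):
class BoxEncodingHeadSketch(object):
  """Predicts box encodings from a shared feature map."""

  def __init__(self, box_code_size, kernel_size=3):
    self._box_code_size = box_code_size
    self._kernel_size = kernel_size

  def predict(self, features, num_predictions_per_location):
    # One conv whose output depth packs all per-location box encodings.
    return slim.conv2d(
        features,
        num_predictions_per_location * self._box_code_size,
        [self._kernel_size, self._kernel_size],
        activation_fn=None,
        scope='BoxEncodingPredictor')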
def _predict(self, image_features, audio_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}'.format( depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) # fusing with audio features print("before fusion: feature_map", net.get_shape()) shape = net.get_shape() extended_audio = tf.tile(audio_features, [1, shape[1], shape[2], 1]) net = tf.concat([net, extended_audio], 3) print("after fusion: new feature_map", net.get_shape()) # do 1 x 1 convolution net = slim.conv2d(net, shape[3], [1, 1], scope='fusion') class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor', biases_initializer=tf.constant_initializer( self._class_prediction_bias_init)) if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) print("num_predictions_per_location", num_predictions_per_location) print("class_predictions_with_background", class_predictions_with_background.get_shape()) combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( image_features) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) return { BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background }
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location class_predictions_with_background: A float tensor of shape [batch_size, num_anchors, num_classes + 1] representing the class predictions for the proposals. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d( net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor') if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape( image_features) box_encodings = tf.reshape( box_encodings, tf.stack([combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size])) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots])) return {BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location): """Computes encoded object locations and corresponding confidences. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, num_anchors, 1, code_size] representing the location of the objects, where num_anchors = feat_height * feat_width * num_predictions_per_location score_predictions: A float tensor of shape [batch_size, num_anchors, 1] representing the score predictions for the proposals. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) num_class_slots = 1 net = image_features with slim.arg_scope(self._conv_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the predictor. if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], activation_fn=tf.nn.sigmoid, scope='BoxEncodingPredictor') angle_encodings = slim.conv2d( net, num_predictions_per_location * 1, [self._kernel_size, self._kernel_size], scope='AngleEncodingPredictor') score_predictions = slim.conv2d( net, num_predictions_per_location * 1, [self._kernel_size, self._kernel_size], scope='ScorePredictor') #score_predictions = tf.sigmoid(score_predictions) batch_size = static_shape.get_batch_size(image_features.get_shape()) if batch_size is None: features_height = static_shape.get_height( image_features.get_shape()) features_width = static_shape.get_width(image_features.get_shape()) flattened_predictions_size = (features_height * features_width * num_predictions_per_location) box_encodings = tf.reshape( box_encodings, [-1, flattened_predictions_size, 1, self._box_code_size]) angle_encodings = tf.reshape( angle_encodings, [-1, flattened_predictions_size, 1, 1]) score_predictions = tf.reshape(score_predictions, [-1, flattened_predictions_size, 1]) else: box_encodings = tf.reshape( box_encodings, [batch_size, -1, 1, self._box_code_size]) angle_encodings = tf.reshape(angle_encodings, [batch_size, -1, 1, 1]) score_predictions = tf.reshape(score_predictions, [batch_size, -1, 1]) return { BOX_ENCODINGS: box_encodings, ANGLE_ENCODINGS: angle_encodings, SCORE_PREDICTIONS: score_predictions }
def _predict(self, image_features, num_predictions_per_location):
  """Computes encoded object locations and corresponding confidences.

  Args:
    image_features: A float tensor of shape [batch_size, height, width,
      channels] containing features for a batch of images.
    num_predictions_per_location: an integer representing the number of box
      predictions to be made per spatial location in the feature map.

  Returns:
    A dictionary containing the following tensors.
      box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
        code_size] representing the location of the objects, where
        num_anchors = feat_height * feat_width * num_predictions_per_location
      class_predictions_with_background: A float tensor of shape
        [batch_size, num_anchors, num_classes + 1] representing the class
        predictions for the proposals.
  """
  # Depth of the incoming feature map.
  features_depth = static_shape.get_depth(image_features.get_shape())
  # Depth used for the optional conv layers inserted before the predictor
  # (may be 0, in which case no additional layers are added).
  depth = max(min(features_depth, self._max_depth), self._min_depth)

  # Add a slot for the background class.
  num_class_slots = self.num_classes + 1
  net = image_features
  with slim.arg_scope(self._conv_hyperparams), \
       slim.arg_scope([slim.dropout], is_training=self._is_training):
    # Add additional 1x1 conv layers before the predictor.
    if depth > 0 and self._num_layers_before_predictor > 0:
      for i in range(self._num_layers_before_predictor):
        net = slim.conv2d(
            net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
    with slim.arg_scope([slim.conv2d], activation_fn=None,
                        normalizer_fn=None, normalizer_params=None):
      # Box encodings: one conv whose output depth is
      # num_predictions_per_location * box_code_size.
      box_encodings = slim.conv2d(
          net, num_predictions_per_location * self._box_code_size,
          [self._kernel_size, self._kernel_size],
          scope='BoxEncodingPredictor')
      # Dropout is only applied on the class-prediction branch.
      if self._use_dropout:
        net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
      class_predictions_with_background = slim.conv2d(
          net, num_predictions_per_location * num_class_slots,
          [self._kernel_size, self._kernel_size],
          scope='ClassPredictor')
      if self._apply_sigmoid_to_scores:
        class_predictions_with_background = tf.sigmoid(
            class_predictions_with_background)

  batch_size = static_shape.get_batch_size(image_features.get_shape())
  if batch_size is None:
    features_height = static_shape.get_height(image_features.get_shape())
    features_width = static_shape.get_width(image_features.get_shape())
    flattened_predictions_size = (features_height * features_width *
                                  num_predictions_per_location)
    box_encodings = tf.reshape(
        box_encodings,
        [-1, flattened_predictions_size, 1, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [-1, flattened_predictions_size, num_class_slots])
  else:
    box_encodings = tf.reshape(
        box_encodings, [batch_size, -1, 1, self._box_code_size])
    class_predictions_with_background = tf.reshape(
        class_predictions_with_background,
        [batch_size, -1, num_class_slots])
  return {BOX_ENCODINGS: box_encodings,
          CLASS_PREDICTIONS_WITH_BACKGROUND:
              class_predictions_with_background}
def _predict(self, image_features, num_predictions_per_location, boxes_normalized=None): """Computes encoded object locations and corresponding confidences. Flattens image_features and applies fully connected ops (with no non-linearity) to predict box encodings and class predictions. In this setting, anchors are not spatially arranged in any way and are assumed to have been folded into the batch dimension. Thus we output 1 for the anchors dimension. Args: image_features: A float tensor of shape [batch_size, height, width, channels] containing features for a batch of images. num_predictions_per_location: an integer representing the number of box predictions to be made per spatial location in the feature map. Currently, this must be set to 1, or an error will be raised. Returns: A dictionary containing the following tensors. box_encodings: A float tensor of shape [batch_size, 1, num_classes, code_size] representing the location of the objects. class_predictions_with_background: A float tensor of shape [batch_size, 1, num_classes + 1] representing the class predictions for the proposals. If predict_masks is True the dictionary also contains: instance_masks: A float tensor of shape [batch_size, 1, num_classes, image_height, image_width] If predict_keypoints is True the dictionary also contains: keypoints: [batch_size, 1, num_keypoints, 2] Raises: ValueError: if num_predictions_per_location is not 1. """ features_depth = static_shape.get_depth(image_features.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) if num_predictions_per_location != 1: raise ValueError( 'Currently FullyConnectedBoxPredictor only supports ' 'predicting a single box per class per location.') roi_features = image_features if self._spatial_average: roi_features = tf.reduce_mean(roi_features, [1, 2], keep_dims=True, name='AvgPool') net = slim.flatten(roi_features) end_points_collection = self._scope.name + '_end_points' with slim.arg_scope(self._fc_hyperparams), \ slim.arg_scope([slim.dropout], is_training=self._is_training), \ slim.arg_scope([slim.fully_connected], trainable=self._is_training, outputs_collections=end_points_collection): # Add additional fc layers before the predictor. 
if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.fully_connected(net, depth, scope='FC_%d_%d' % (i, depth)) if self._use_dropout: net = slim.dropout(net, keep_prob=self._dropout_keep_prob) box_encodings = slim.fully_connected( net, self._num_classes * self._box_code_size, activation_fn=None, weights_initializer=self._box_initializer, scope='BoxEncodingPredictor') class_predictions_with_background = slim.fully_connected( net, self._num_classes + 1, activation_fn=None, scope='ClassPredictor') box_encodings = tf.reshape( box_encodings, [-1, 1, self._num_classes, self._box_code_size]) class_predictions_with_background = tf.reshape( class_predictions_with_background, [-1, 1, self._num_classes + 1]) predictions_dict = { BOX_ENCODINGS: box_encodings, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background } if self._predict_instance_masks: with slim.arg_scope(self._conv_hyperparams): upsampled_features = slim.conv2d_transpose( image_features, num_outputs=self._mask_prediction_conv_depth, kernel_size=[2, 2], stride=2, trainable=self._is_training) mask_predictions = slim.conv2d(upsampled_features, num_outputs=self.num_classes, activation_fn=None, kernel_size=[1, 1], trainable=self._is_training) instance_masks = tf.expand_dims(tf.transpose(mask_predictions, perm=[0, 3, 1, 2]), axis=1, name='MaskPredictor') predictions_dict[MASK_PREDICTIONS] = instance_masks end_points = slim.utils.convert_collection_to_dict( end_points_collection) framework_ops.get_default_graph().clear_collection( end_points_collection) return predictions_dict