def test_unequal_static_shape_along_first_dim_raises_exception(self): shape_a = tf.constant(np.zeros([4, 2, 2, 1])) shape_b = tf.constant(np.zeros([6, 2, 3, 1])) with self.assertRaisesRegexp(ValueError, 'Unequal first dimension'): shape_utils.assert_shape_equal_along_first_dimension( shape_utils.combined_static_and_dynamic_shape(shape_a), shape_utils.combined_static_and_dynamic_shape(shape_b))
def test_equal_static_shape_along_first_dim_succeeds(self): shape_a = tf.constant(np.zeros([4, 2, 2, 1])) shape_b = tf.constant(np.zeros([4, 7, 2])) with self.test_session() as sess: op = shape_utils.assert_shape_equal_along_first_dimension( shape_utils.combined_static_and_dynamic_shape(shape_a), shape_utils.combined_static_and_dynamic_shape(shape_b)) sess.run(op)
def test_equal_dynamic_shape_along_first_dim_succeeds(self): tensor_a = tf.placeholder(tf.float32, shape=[None, None, None, 3]) tensor_b = tf.placeholder(tf.float32, shape=[None]) op = shape_utils.assert_shape_equal_along_first_dimension( shape_utils.combined_static_and_dynamic_shape(tensor_a), shape_utils.combined_static_and_dynamic_shape(tensor_b)) with self.test_session() as sess: sess.run(op, feed_dict={ tensor_a: np.zeros([5, 2, 2, 3]), tensor_b: np.zeros([5]) })
def test_unequal_dynamic_shape_along_first_dim_raises_tf_assert(self): tensor_a = tf.placeholder(tf.float32, shape=[None, None, None, 3]) tensor_b = tf.placeholder(tf.float32, shape=[None, None, 3]) op = shape_utils.assert_shape_equal_along_first_dimension( shape_utils.combined_static_and_dynamic_shape(tensor_a), shape_utils.combined_static_and_dynamic_shape(tensor_b)) with self.test_session() as sess: with self.assertRaises(tf.errors.InvalidArgumentError): sess.run(op, feed_dict={ tensor_a: np.zeros([1, 2, 2, 3]), tensor_b: np.zeros([2, 4, 3]) })
def _batch_decode(self, box_encodings): """Decodes a batch of box encodings with respect to the anchors. Args: box_encodings: A float32 tensor of shape [batch_size, num_anchors, box_code_size] containing box encodings. Returns: decoded_boxes: A float32 tensor of shape [batch_size, num_anchors, 4] containing the decoded boxes. decoded_keypoints: A float32 tensor of shape [batch_size, num_anchors, num_keypoints, 2] containing the decoded keypoints if present in the input `box_encodings`, None otherwise. """ combined_shape = shape_utils.combined_static_and_dynamic_shape( box_encodings) batch_size = combined_shape[0] tiled_anchor_boxes = tf.tile( tf.expand_dims(self.anchors.get(), 0), [batch_size, 1, 1]) tiled_anchors_boxlist = box_list.BoxList( tf.reshape(tiled_anchor_boxes, [-1, 4])) decoded_boxes = self._box_coder.decode( tf.reshape(box_encodings, [-1, self._box_coder.code_size]), tiled_anchors_boxlist) decoded_keypoints = None if decoded_boxes.has_field(fields.BoxListFields.keypoints): decoded_keypoints = decoded_boxes.get_field( fields.BoxListFields.keypoints) num_keypoints = decoded_keypoints.get_shape()[1] decoded_keypoints = tf.reshape( decoded_keypoints, tf.stack([combined_shape[0], combined_shape[1], num_keypoints, 2])) decoded_boxes = tf.reshape(decoded_boxes.get(), tf.stack( [combined_shape[0], combined_shape[1], 4])) return decoded_boxes, decoded_keypoints
def nearest_neighbor_upsampling(input_tensor, scale): """Nearest neighbor upsampling implementation. Nearest neighbor upsampling function that maps input tensor with shape [batch_size, height, width, channels] to [batch_size, height * scale , width * scale, channels]. This implementation only uses reshape and broadcasting to make it TPU compatible. Args: input_tensor: A float32 tensor of size [batch, height_in, width_in, channels]. scale: An integer multiple to scale resolution of input data. Returns: data_up: A float32 tensor of size [batch, height_in*scale, width_in*scale, channels]. """ with tf.name_scope('nearest_neighbor_upsampling'): (batch_size, height, width, channels ) = shape_utils.combined_static_and_dynamic_shape(input_tensor) output_tensor = tf.reshape(input_tensor, [ batch_size, height, 1, width, 1, channels ]) * tf.ones([1, 1, scale, 1, scale, 1], dtype=input_tensor.dtype) return tf.reshape( output_tensor, [batch_size, height * scale, width * scale, channels])
def select_random_box(boxlist, default_box=None, seed=None, scope=None): """Selects a random bounding box from a `BoxList`. Args: boxlist: A BoxList. default_box: A [1, 4] float32 tensor. If no boxes are present in `boxlist`, this default box will be returned. If None, will use a default box of [[-1., -1., -1., -1.]]. seed: Random seed. scope: Name scope. Returns: bbox: A [1, 4] tensor with a random bounding box. valid: A bool tensor indicating whether a valid bounding box is returned (True) or whether the default box is returned (False). """ with tf.name_scope(scope, 'SelectRandomBox'): bboxes = boxlist.get() combined_shape = shape_utils.combined_static_and_dynamic_shape(bboxes) number_of_boxes = combined_shape[0] default_box = default_box or tf.constant([[-1., -1., -1., -1.]]) def select_box(): random_index = tf.random_uniform([], maxval=number_of_boxes, dtype=tf.int32, seed=seed) return tf.expand_dims(bboxes[random_index], axis=0), tf.constant(True) return tf.cond(tf.greater_equal(number_of_boxes, 1), true_fn=select_box, false_fn=lambda: (default_box, tf.constant(False)))
def _match_when_rows_are_empty(): """Performs matching when the rows of similarity matrix are empty. When the rows are empty, all detections are false positives. So we return a tensor of -1's to indicate that the columns do not match to any rows. Returns: matches: int32 tensor indicating the row each column matches to. """ similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape( similarity_matrix) return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)
def matmul_gather_on_zeroth_axis(params, indices, scope=None): """Matrix multiplication based implementation of tf.gather on zeroth axis. TODO(rathodv, jonathanhuang): enable sparse matmul option. Args: params: A float32 Tensor. The tensor from which to gather values. Must be at least rank 1. indices: A Tensor. Must be one of the following types: int32, int64. Must be in range [0, params.shape[0]) scope: A name for the operation (optional). Returns: A Tensor. Has the same type as params. Values from params gathered from indices given by indices, with shape indices.shape + params.shape[1:]. """ with tf.name_scope(scope, 'MatMulGather'): params_shape = shape_utils.combined_static_and_dynamic_shape(params) indices_shape = shape_utils.combined_static_and_dynamic_shape(indices) params2d = tf.reshape(params, [params_shape[0], -1]) indicator_matrix = tf.one_hot(indices, params_shape[0]) gathered_result_flattened = tf.matmul(indicator_matrix, params2d) return tf.reshape(gathered_result_flattened, tf.stack(indices_shape + params_shape[1:]))
def _get_feature_map_spatial_dims(self, feature_maps): """Return list of spatial dimensions for each feature map in a list. Args: feature_maps: a list of tensors where the ith tensor has shape [batch, height_i, width_i, depth_i]. Returns: a list of pairs (height, width) for each feature map in feature_maps """ feature_map_shapes = [ shape_utils.combined_static_and_dynamic_shape( feature_map) for feature_map in feature_maps ] return [(shape[1], shape[2]) for shape in feature_map_shapes]
def _match_when_rows_are_non_empty(): """Performs matching when the rows of similarity matrix are non empty. Returns: matches: int32 tensor indicating the row each column matches to. """ # Matches for each column matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32) # Deal with matched and unmatched threshold if self._matched_threshold is not None: # Get logical indices of ignored and unmatched columns as tf.int64 matched_vals = tf.reduce_max(similarity_matrix, 0) below_unmatched_threshold = tf.greater( self._unmatched_threshold, matched_vals) between_thresholds = tf.logical_and( tf.greater_equal(matched_vals, self._unmatched_threshold), tf.greater(self._matched_threshold, matched_vals)) if self._negatives_lower_than_unmatched: matches = self._set_values_using_indicator( matches, below_unmatched_threshold, -1) matches = self._set_values_using_indicator( matches, between_thresholds, -2) else: matches = self._set_values_using_indicator( matches, below_unmatched_threshold, -2) matches = self._set_values_using_indicator( matches, between_thresholds, -1) if self._force_match_for_each_row: similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape( similarity_matrix) force_match_column_ids = tf.argmax(similarity_matrix, 1, output_type=tf.int32) force_match_column_indicators = tf.one_hot( force_match_column_ids, depth=similarity_matrix_shape[1]) force_match_row_ids = tf.argmax(force_match_column_indicators, 0, output_type=tf.int32) force_match_column_mask = tf.cast( tf.reduce_max(force_match_column_indicators, 0), tf.bool) final_matches = tf.where(force_match_column_mask, force_match_row_ids, matches) return final_matches else: return matches
def _predict(self, image_features, **kwargs): image_feature = image_features[0] combined_feature_shape = shape_utils.combined_static_and_dynamic_shape( image_feature) batch_size = combined_feature_shape[0] num_anchors = (combined_feature_shape[1] * combined_feature_shape[2]) code_size = 4 zero = tf.reduce_sum(0 * image_feature) box_encodings = zero + tf.zeros( (batch_size, num_anchors, 1, code_size), dtype=tf.float32) class_predictions_with_background = zero + tf.zeros( (batch_size, num_anchors, self.num_classes + 1), dtype=tf.float32) return { box_predictor.BOX_ENCODINGS: box_encodings, box_predictor.CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_with_background }
def _compute_clip_window(self, preprocessed_images, true_image_shapes): """Computes clip window to use during post_processing. Computes a new clip window to use during post-processing based on `resized_image_shapes` and `true_image_shapes` only if `preprocess` method has been called. Otherwise returns a default clip window of [0, 0, 1, 1]. Args: preprocessed_images: the [batch, height, width, channels] image tensor. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. Or None if the clip window should cover the full image. Returns: a 2-D float32 tensor of the form [batch_size, 4] containing the clip window for each image in the batch in normalized coordinates (relative to the resized dimensions) where each clip window is of the form [ymin, xmin, ymax, xmax] or a default clip window of [0, 0, 1, 1]. """ if true_image_shapes is None: return tf.constant([0, 0, 1, 1], dtype=tf.float32) resized_inputs_shape = shape_utils.combined_static_and_dynamic_shape( preprocessed_images) true_heights, true_widths, _ = tf.unstack( tf.to_float(true_image_shapes), axis=1) padded_height = tf.to_float(resized_inputs_shape[1]) padded_width = tf.to_float(resized_inputs_shape[2]) return tf.stack( [ tf.zeros_like(true_heights), tf.zeros_like(true_widths), true_heights / padded_height, true_widths / padded_width ], axis=1)
def _create_regression_targets(self, anchors, groundtruth_boxes, match): """Returns a regression target for each anchor. Args: anchors: a BoxList representing N anchors groundtruth_boxes: a BoxList representing M groundtruth_boxes match: a matcher.Match object Returns: reg_targets: a float32 tensor with shape [N, box_code_dimension] """ matched_gt_boxes = match.gather_based_on_match( groundtruth_boxes.get(), unmatched_value=tf.zeros(4), ignored_value=tf.zeros(4)) matched_gt_boxlist = box_list.BoxList(matched_gt_boxes) if groundtruth_boxes.has_field(fields.BoxListFields.keypoints): groundtruth_keypoints = groundtruth_boxes.get_field( fields.BoxListFields.keypoints) matched_keypoints = match.gather_based_on_match( groundtruth_keypoints, unmatched_value=tf.zeros( groundtruth_keypoints.get_shape()[1:]), ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:])) matched_gt_boxlist.add_field(fields.BoxListFields.keypoints, matched_keypoints) matched_reg_targets = self._box_coder.encode(matched_gt_boxlist, anchors) match_results_shape = shape_utils.combined_static_and_dynamic_shape( match.match_results) # Zero out the unmatched and ignored regression targets. unmatched_ignored_reg_targets = tf.tile( self._default_regression_target(), [match_results_shape[0], 1]) matched_anchors_mask = match.matched_column_indicator() reg_targets = tf.where(matched_anchors_mask, matched_reg_targets, unmatched_ignored_reg_targets) return reg_targets
def predict(self, preprocessed_inputs, true_image_shapes): """Predicts unpostprocessed tensors from input tensor. This function takes an input batch of images and runs it through the forward pass of the network to yield unpostprocessesed predictions. A side effect of calling the predict method is that self._anchors is populated with a box_list.BoxList of anchors. These anchors must be constructed before the postprocess or loss functions can be called. Args: preprocessed_inputs: a [batch, height, width, channels] image tensor. true_image_shapes: int32 tensor of shape [batch, 3] where each row is of the form [height, width, channels] indicating the shapes of true images in the resized images, as resized images can be padded with zeros. Returns: prediction_dict: a dictionary holding "raw" prediction tensors: 1) preprocessed_inputs: the [batch, height, width, channels] image tensor. 2) box_encodings: 4-D float tensor of shape [batch_size, num_anchors, box_code_dimension] containing predicted boxes. 3) class_predictions_with_background: 3-D float tensor of shape [batch_size, num_anchors, num_classes+1] containing class predictions (logits) for each of the anchors. Note that this tensor *includes* background class predictions (at class index 0). 4) feature_maps: a list of tensors where the ith tensor has shape [batch, height_i, width_i, depth_i]. 5) anchors: 2-D float tensor of shape [num_anchors, 4] containing the generated anchors in normalized coordinates. """ batchnorm_updates_collections = (None if self._inplace_batchnorm_update else tf.GraphKeys.UPDATE_OPS) if self._feature_extractor.is_keras_model: feature_maps = self._feature_extractor(preprocessed_inputs) else: with slim.arg_scope([slim.batch_norm], is_training=(self._is_training and not self._freeze_batchnorm), updates_collections=batchnorm_updates_collections): with tf.variable_scope(None, self._extract_features_scope, [preprocessed_inputs]): feature_maps = self._feature_extractor.extract_features( preprocessed_inputs) feature_map_spatial_dims = self._get_feature_map_spatial_dims( feature_maps) image_shape = shape_utils.combined_static_and_dynamic_shape( preprocessed_inputs) self._anchors = box_list_ops.concatenate( self._anchor_generator.generate( feature_map_spatial_dims, im_height=image_shape[1], im_width=image_shape[2])) if self._box_predictor.is_keras_model: prediction_dict = self._box_predictor(feature_maps) else: with slim.arg_scope([slim.batch_norm], is_training=(self._is_training and not self._freeze_batchnorm), updates_collections=batchnorm_updates_collections): prediction_dict = self._box_predictor.predict( feature_maps, self._anchor_generator.num_anchors_per_location()) box_encodings = tf.concat(prediction_dict['box_encodings'], axis=1) if box_encodings.shape.ndims == 4 and box_encodings.shape[2] == 1: box_encodings = tf.squeeze(box_encodings, axis=2) class_predictions_with_background = tf.concat( prediction_dict['class_predictions_with_background'], axis=1) predictions_dict = { 'preprocessed_inputs': preprocessed_inputs, 'box_encodings': box_encodings, 'class_predictions_with_background': class_predictions_with_background, 'feature_maps': feature_maps, 'anchors': self._anchors.get() } self._batched_prediction_tensor_names = [x for x in predictions_dict if x != 'anchors'] return predictions_dict
def assign(self, anchors, groundtruth_boxes, groundtruth_labels=None, unmatched_class_label=None, groundtruth_weights=None, **params): """Assign classification and regression targets to each anchor. For a given set of anchors and groundtruth detections, match anchors to groundtruth_boxes and assign classification and regression targets to each anchor as well as weights based on the resulting match (specifying, e.g., which anchors should not contribute to training loss). Anchors that are not matched to anything are given a classification target of self._unmatched_cls_target which can be specified via the constructor. Args: anchors: a BoxList representing N anchors groundtruth_boxes: a BoxList representing M groundtruth boxes groundtruth_labels: a tensor of shape [M, d_1, ... d_k] with labels for each of the ground_truth boxes. The subshape [d_1, ... d_k] can be empty (corresponding to scalar inputs). When set to None, groundtruth_labels assumes a binary problem where all ground_truth boxes get a positive label (of 1). unmatched_class_label: a float32 tensor with shape [d_1, d_2, ..., d_k] which is consistent with the classification target for each anchor (and can be empty for scalar targets). This shape must thus be compatible with the groundtruth labels that are passed to the "assign" function (which have shape [num_gt_boxes, d_1, d_2, ..., d_k]). If set to None, unmatched_cls_target is set to be [0] for each anchor. groundtruth_weights: a float tensor of shape [M] indicating the weight to assign to all anchors match to a particular groundtruth box. The weights must be in [0., 1.]. If None, all weights are set to 1. **params: Additional keyword arguments for specific implementations of the Matcher. Returns: cls_targets: a float32 tensor with shape [num_anchors, d_1, d_2 ... d_k], where the subshape [d_1, ..., d_k] is compatible with groundtruth_labels which has shape [num_gt_boxes, d_1, d_2, ... d_k]. cls_weights: a float32 tensor with shape [num_anchors] reg_targets: a float32 tensor with shape [num_anchors, box_code_dimension] reg_weights: a float32 tensor with shape [num_anchors] match: a matcher.Match object encoding the match between anchors and groundtruth boxes, with rows corresponding to groundtruth boxes and columns corresponding to anchors. Raises: ValueError: if anchors or groundtruth_boxes are not of type box_list.BoxList """ if not isinstance(anchors, box_list.BoxList): raise ValueError('anchors must be an BoxList') if not isinstance(groundtruth_boxes, box_list.BoxList): raise ValueError('groundtruth_boxes must be an BoxList') if unmatched_class_label is None: unmatched_class_label = tf.constant([0], tf.float32) if groundtruth_labels is None: groundtruth_labels = tf.ones( tf.expand_dims(groundtruth_boxes.num_boxes(), 0)) groundtruth_labels = tf.expand_dims(groundtruth_labels, -1) unmatched_shape_assert = shape_utils.assert_shape_equal( shape_utils.combined_static_and_dynamic_shape(groundtruth_labels) [1:], shape_utils.combined_static_and_dynamic_shape( unmatched_class_label)) labels_and_box_shapes_assert = shape_utils.assert_shape_equal( shape_utils.combined_static_and_dynamic_shape(groundtruth_labels) [:1], shape_utils.combined_static_and_dynamic_shape( groundtruth_boxes.get())[:1]) if groundtruth_weights is None: num_gt_boxes = groundtruth_boxes.num_boxes_static() if not num_gt_boxes: num_gt_boxes = groundtruth_boxes.num_boxes() groundtruth_weights = tf.ones([num_gt_boxes], dtype=tf.float32) # set scores on the gt boxes scores = 1 - groundtruth_labels[:, 0] groundtruth_boxes.add_field(fields.BoxListFields.scores, scores) with tf.control_dependencies( [unmatched_shape_assert, labels_and_box_shapes_assert]): match_quality_matrix = self._similarity_calc.compare( groundtruth_boxes, anchors) match = self._matcher.match(match_quality_matrix, **params) reg_targets = self._create_regression_targets( anchors, groundtruth_boxes, match) cls_targets = self._create_classification_targets( groundtruth_labels, unmatched_class_label, match) reg_weights = self._create_regression_weights( match, groundtruth_weights) cls_weights = self._create_classification_weights( match, groundtruth_weights) num_anchors = anchors.num_boxes_static() if num_anchors is not None: reg_targets = self._reset_target_shape(reg_targets, num_anchors) cls_targets = self._reset_target_shape(cls_targets, num_anchors) reg_weights = self._reset_target_shape(reg_weights, num_anchors) cls_weights = self._reset_target_shape(cls_weights, num_anchors) return cls_targets, cls_weights, reg_targets, reg_weights, match
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels_i] containing features for a batch of images. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Returns: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, q, code_size] representing the location of the objects, where q is 1 or the number of classes. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. """ box_encodings_list = [] class_predictions_list = [] # TODO(rathodv): Come up with a better way to generate scope names # in box predictor once we have time to retrain all models in the zoo. # The following lines create scope names to be backwards compatible with the # existing checkpoints. box_predictor_scopes = [_NoopVariableScope()] if len(image_features) > 1: box_predictor_scopes = [ tf.variable_scope('BoxPredictor_{}'.format(i)) for i in range(len(image_features)) ] for (image_feature, num_predictions_per_location, box_predictor_scope) in zip(image_features, num_predictions_per_location_list, box_predictor_scopes): with box_predictor_scope: # Add a slot for the background class. num_class_slots = self.num_classes + 1 net = image_feature with slim.arg_scope(self._conv_hyperparams_fn()), \ slim.arg_scope([slim.dropout], is_training=self._is_training): # Add additional conv layers before the class predictor. features_depth = static_shape.get_depth( image_feature.get_shape()) depth = max(min(features_depth, self._max_depth), self._min_depth) tf.logging.info( 'depth of additional conv before box predictor: {}'. format(depth)) if depth > 0 and self._num_layers_before_predictor > 0: for i in range(self._num_layers_before_predictor): net = slim.conv2d(net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth)) with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None, normalizer_params=None): if self._use_depthwise: box_encodings = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='BoxEncodingPredictor_depthwise') box_encodings = slim.conv2d( box_encodings, num_predictions_per_location * self._box_code_size, [1, 1], scope='BoxEncodingPredictor') else: box_encodings = slim.conv2d( net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], scope='BoxEncodingPredictor') if self._use_dropout: net = slim.dropout( net, keep_prob=self._dropout_keep_prob) if self._use_depthwise: class_predictions_with_background = slim.separable_conv2d( net, None, [self._kernel_size, self._kernel_size], padding='SAME', depth_multiplier=1, stride=1, rate=1, scope='ClassPredictor_depthwise') class_predictions_with_background = slim.conv2d( class_predictions_with_background, num_predictions_per_location * num_class_slots, [1, 1], scope='ClassPredictor') else: class_predictions_with_background = slim.conv2d( net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], scope='ClassPredictor', biases_initializer=tf.constant_initializer( self._class_prediction_bias_init)) if self._apply_sigmoid_to_scores: class_predictions_with_background = tf.sigmoid( class_predictions_with_background) combined_feature_map_shape = ( shape_utils.combined_static_and_dynamic_shape( image_feature)) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, 1, self._box_code_size ])) box_encodings_list.append(box_encodings) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) class_predictions_list.append( class_predictions_with_background) return { BOX_ENCODINGS: box_encodings_list, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list }
def _predict(self, image_features, num_predictions_per_location_list): """Computes encoded object locations and corresponding confidences. Args: image_features: A list of float tensors of shape [batch_size, height_i, width_i, channels] containing features for a batch of images. Note that when not all tensors in the list have the same number of channels, an additional projection layer will be added on top the tensor to generate feature map with number of channels consitent with the majority. num_predictions_per_location_list: A list of integers representing the number of box predictions to be made per spatial location for each feature map. Note that all values must be the same since the weights are shared. Returns: box_encodings: A list of float tensors of shape [batch_size, num_anchors_i, code_size] representing the location of the objects. Each entry in the list corresponds to a feature map in the input `image_features` list. class_predictions_with_background: A list of float tensors of shape [batch_size, num_anchors_i, num_classes + 1] representing the class predictions for the proposals. Each entry in the list corresponds to a feature map in the input `image_features` list. Raises: ValueError: If the image feature maps do not have the same number of channels or if the num predictions per locations is differs between the feature maps. """ if len(set(num_predictions_per_location_list)) > 1: raise ValueError( 'num predictions per location must be same for all' 'feature maps, found: {}'.format( num_predictions_per_location_list)) feature_channels = [ image_feature.shape[3].value for image_feature in image_features ] has_different_feature_channels = len(set(feature_channels)) > 1 if has_different_feature_channels: inserted_layer_counter = 0 target_channel = max(set(feature_channels), key=feature_channels.count) tf.logging.info('Not all feature maps have the same number of ' 'channels, found: {}, addition project layers ' 'to bring all feature maps to uniform channels ' 'of {}'.format(feature_channels, target_channel)) box_encodings_list = [] class_predictions_list = [] num_class_slots = self.num_classes + 1 for feature_index, (image_feature, num_predictions_per_location) in enumerate( zip(image_features, num_predictions_per_location_list)): # Add a slot for the background class. with tf.variable_scope('WeightSharedConvolutionalBoxPredictor', reuse=tf.AUTO_REUSE): with slim.arg_scope(self._conv_hyperparams_fn()): # Insert an additional projection layer if necessary. if (has_different_feature_channels and image_feature.shape[3].value != target_channel): image_feature = slim.conv2d( image_feature, target_channel, [1, 1], stride=1, padding='SAME', activation_fn=None, normalizer_fn=(tf.identity if self._apply_batch_norm else None), scope='ProjectionLayer/conv2d_{}'.format( inserted_layer_counter)) if self._apply_batch_norm: image_feature = slim.batch_norm( image_feature, scope='ProjectionLayer/conv2d_{}/BatchNorm'. format(inserted_layer_counter)) inserted_layer_counter += 1 box_encodings_net = image_feature class_predictions_net = image_feature for i in range(self._num_layers_before_predictor): box_prediction_tower_prefix = ( 'PredictionTower' if self._share_prediction_tower else 'BoxPredictionTower') box_encodings_net = slim.conv2d( box_encodings_net, self._depth, [self._kernel_size, self._kernel_size], stride=1, padding='SAME', activation_fn=None, normalizer_fn=(tf.identity if self._apply_batch_norm else None), scope='{}/conv2d_{}'.format( box_prediction_tower_prefix, i)) if self._apply_batch_norm: box_encodings_net = slim.batch_norm( box_encodings_net, scope='{}/conv2d_{}/BatchNorm/feature_{}'. format(box_prediction_tower_prefix, i, feature_index)) box_encodings_net = tf.nn.relu6(box_encodings_net) box_encodings = slim.conv2d( box_encodings_net, num_predictions_per_location * self._box_code_size, [self._kernel_size, self._kernel_size], activation_fn=None, stride=1, padding='SAME', normalizer_fn=None, scope='BoxPredictor') if self._share_prediction_tower: class_predictions_net = box_encodings_net else: for i in range(self._num_layers_before_predictor): class_predictions_net = slim.conv2d( class_predictions_net, self._depth, [self._kernel_size, self._kernel_size], stride=1, padding='SAME', activation_fn=None, normalizer_fn=(tf.identity if self._apply_batch_norm else None), scope='ClassPredictionTower/conv2d_{}'.format( i)) if self._apply_batch_norm: class_predictions_net = slim.batch_norm( class_predictions_net, scope= 'ClassPredictionTower/conv2d_{}/BatchNorm/feature_{}' .format(i, feature_index)) class_predictions_net = tf.nn.relu6( class_predictions_net) if self._use_dropout: class_predictions_net = slim.dropout( class_predictions_net, keep_prob=self._dropout_keep_prob) class_predictions_with_background = slim.conv2d( class_predictions_net, num_predictions_per_location * num_class_slots, [self._kernel_size, self._kernel_size], activation_fn=None, stride=1, padding='SAME', normalizer_fn=None, biases_initializer=tf.constant_initializer( self._class_prediction_bias_init), scope='ClassPredictor') combined_feature_map_shape = ( shape_utils.combined_static_and_dynamic_shape( image_feature)) box_encodings = tf.reshape( box_encodings, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, self._box_code_size ])) box_encodings_list.append(box_encodings) class_predictions_with_background = tf.reshape( class_predictions_with_background, tf.stack([ combined_feature_map_shape[0], combined_feature_map_shape[1] * combined_feature_map_shape[2] * num_predictions_per_location, num_class_slots ])) class_predictions_list.append( class_predictions_with_background) return { BOX_ENCODINGS: box_encodings_list, CLASS_PREDICTIONS_WITH_BACKGROUND: class_predictions_list }
def test_combines_static_dynamic_shape(self): tensor = tf.placeholder(tf.float32, shape=(None, 2, 3)) combined_shape = shape_utils.combined_static_and_dynamic_shape(tensor) self.assertTrue(tf.contrib.framework.is_tensor(combined_shape[0])) self.assertListEqual(combined_shape[1:], [2, 3])
def unstack_batch(tensor_dict, unpad_groundtruth_tensors=True): """Unstacks all tensors in `tensor_dict` along 0th dimension. Unstacks tensor from the tensor dict along 0th dimension and returns a tensor_dict containing values that are lists of unstacked, unpadded tensors. Tensors in the `tensor_dict` are expected to be of one of the three shapes: 1. [batch_size] 2. [batch_size, height, width, channels] 3. [batch_size, num_boxes, d1, d2, ... dn] When unpad_groundtruth_tensors is set to true, unstacked tensors of form 3 above are sliced along the `num_boxes` dimension using the value in tensor field.InputDataFields.num_groundtruth_boxes. Note that this function has a static list of input data fields and has to be kept in sync with the InputDataFields defined in core/standard_fields.py Args: tensor_dict: A dictionary of batched groundtruth tensors. unpad_groundtruth_tensors: Whether to remove padding along `num_boxes` dimension of the groundtruth tensors. Returns: A dictionary where the keys are from fields.InputDataFields and values are a list of unstacked (optionally unpadded) tensors. Raises: ValueError: If unpad_tensors is True and `tensor_dict` does not contain `num_groundtruth_boxes` tensor. """ unbatched_tensor_dict = { key: tf.unstack(tensor) for key, tensor in tensor_dict.items() } if unpad_groundtruth_tensors: if (fields.InputDataFields.num_groundtruth_boxes not in unbatched_tensor_dict): raise ValueError( '`num_groundtruth_boxes` not found in tensor_dict. ' 'Keys available: {}'.format(unbatched_tensor_dict.keys())) unbatched_unpadded_tensor_dict = {} unpad_keys = set([ # List of input data fields that are padded along the num_boxes # dimension. This list has to be kept in sync with InputDataFields in # standard_fields.py. fields.InputDataFields.groundtruth_instance_masks, fields.InputDataFields.groundtruth_classes, fields.InputDataFields.groundtruth_boxes, fields.InputDataFields.groundtruth_keypoints, fields.InputDataFields.groundtruth_group_of, fields.InputDataFields.groundtruth_difficult, fields.InputDataFields.groundtruth_is_crowd, fields.InputDataFields.groundtruth_area, fields.InputDataFields.groundtruth_weights ]).intersection(set(unbatched_tensor_dict.keys())) for key in unpad_keys: unpadded_tensor_list = [] for num_gt, padded_tensor in zip( unbatched_tensor_dict[ fields.InputDataFields.num_groundtruth_boxes], unbatched_tensor_dict[key]): tensor_shape = shape_utils.combined_static_and_dynamic_shape( padded_tensor) slice_begin = tf.zeros([len(tensor_shape)], dtype=tf.int32) slice_size = tf.stack( [num_gt] + [-1 if dim is None else dim for dim in tensor_shape[1:]]) unpadded_tensor = tf.slice(padded_tensor, slice_begin, slice_size) unpadded_tensor_list.append(unpadded_tensor) unbatched_unpadded_tensor_dict[key] = unpadded_tensor_list unbatched_tensor_dict.update(unbatched_unpadded_tensor_dict) return unbatched_tensor_dict