Example #1
def _validate_boxes_scores_iou_thresh(boxes, scores, iou_thresh,
                                      change_coordinate_frame, clip_window):
    """Validates boxes, scores and iou_thresh.

  This function validates the boxes, scores and iou_thresh arguments, and
  checks that a clip_window is specified whenever change_coordinate_frame is
  True.

  Args:
    boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be either
      number of classes or 1 depending on whether a separate box is predicted
      per class.
    scores: A [k, num_classes] float32 tensor containing the scores for each of
      the k detections. The scores have to be non-negative when
      pad_to_max_output_size is True.
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed).
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window is
      provided)
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip and normalize boxes to before performing
      non-max suppression.

  Raises:
    ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not
    have a valid scores field.
  """
    if not 0 <= iou_thresh <= 1.0:
        raise ValueError('iou_thresh must be between 0 and 1')
    if scores.shape.ndims != 2:
        raise ValueError('scores field must be of rank 2')
    if scores.shape[1].value is None:
        raise ValueError('scores must have statically defined second '
                         'dimension')
    if boxes.shape.ndims != 3:
        raise ValueError('boxes must be of rank 3.')
    if not (shape_utils.get_dim_as_int(
            boxes.shape[1]) == shape_utils.get_dim_as_int(scores.shape[1])
            or shape_utils.get_dim_as_int(boxes.shape[1]) == 1):
        raise ValueError('second dimension of boxes must be either 1 or equal '
                         'to the second dimension of scores')
    if boxes.shape[2].value != 4:
        raise ValueError('last dimension of boxes must be of size 4.')
    if change_coordinate_frame and clip_window is None:
        raise ValueError(
            'if change_coordinate_frame is True, then a clip_window '
            'must be specified.')
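A minimal usage sketch for the validator above, assuming TensorFlow 1.x and that the function (together with shape_utils) is importable from the surrounding module; the shapes below are illustrative:

import tensorflow as tf

boxes = tf.zeros([100, 1, 4], dtype=tf.float32)   # k = 100 detections, q = 1
scores = tf.zeros([100, 3], dtype=tf.float32)     # 3 classes, statically shaped
# Passes silently for valid inputs; raises ValueError if, for example,
# change_coordinate_frame=True is passed without a clip_window.
_validate_boxes_scores_iou_thresh(boxes, scores, iou_thresh=0.5,
                                  change_coordinate_frame=False,
                                  clip_window=None)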
    def build(self, input_shapes):
        """Creates the variables of the layer."""
        feature_channels = [
            shape_utils.get_dim_as_int(input_shape[3])
            for input_shape in input_shapes
        ]
        has_different_feature_channels = len(set(feature_channels)) > 1
        if has_different_feature_channels:
            inserted_layer_counter = 0
            target_channel = max(set(feature_channels),
                                 key=feature_channels.count)
            tf.logging.info(
                'Not all feature maps have the same number of '
                'channels, found: {}, appending additional projection '
                'layers to bring all feature maps to uniformly have {} '
                'channels.'.format(feature_channels, target_channel))
        else:
            # Placeholder variables if has_different_feature_channels is False.
            target_channel = -1
            inserted_layer_counter = -1

        def _build_layers(tower_name_scope, feature_index):
            conv_layers, base_tower_layers = self._compute_base_tower(
                tower_name_scope=tower_name_scope, feature_index=feature_index)
            if tower_name_scope not in self._head_scope_conv_layers:
                self._head_scope_conv_layers[tower_name_scope] = conv_layers
            return base_tower_layers

        for feature_index, input_shape in enumerate(input_shapes):
            # Additional projection layers should not be shared as input channels
            # (and thus weight shapes) are different
            inserted_layer_counter, projection_layers = (
                self._insert_additional_projection_layer(
                    inserted_layer_counter, target_channel))
            self._additional_projection_layers.append(projection_layers)

            if self._share_prediction_tower:
                box_tower_scope = 'PredictionTower'
            else:
                box_tower_scope = 'BoxPredictionTower'
            # For box tower base
            box_tower_layers = _build_layers(box_tower_scope, feature_index)
            self._base_tower_layers_for_heads[BOX_ENCODINGS].append(
                box_tower_layers)

            for head_name in self._sorted_head_names:
                if head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                    tower_name_scope = 'ClassPredictionTower'
                else:
                    tower_name_scope = '{}PredictionTower'.format(head_name)
                box_tower_layers = _build_layers(tower_name_scope,
                                                 feature_index)
                self._base_tower_layers_for_heads[head_name].append(
                    box_tower_layers)

        self.built = True
    def num_boxes_static(self):
        """Returns number of boxes held in collection.

    This number is inferred at graph construction time rather than run-time.

    Returns:
      Number of boxes held in collection (integer) or None if this is not
        inferrable at graph construction time.
    """
        return shape_utils.get_dim_as_int(self.data['boxes'].get_shape()[0])
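A short sketch of how num_boxes_static behaves, assuming the BoxList class from object_detection.core.box_list and TensorFlow 1.x:

import tensorflow as tf
from object_detection.core import box_list

corners = tf.constant([[0.0, 0.0, 1.0, 1.0],
                       [0.2, 0.2, 0.8, 0.8]])
print(box_list.BoxList(corners).num_boxes_static())  # 2, known at graph construction

dynamic = tf.placeholder(tf.float32, shape=[None, 4])
print(box_list.BoxList(dynamic).num_boxes_static())  # None, not statically inferrable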
Example #4
def batch_decode(encoded_boxes, box_coder, anchors):
    """Decode a batch of encoded boxes.

  This op takes a batch of encoded bounding boxes and transforms
  them to a batch of bounding boxes specified by their corners in
  the order of [y_min, x_min, y_max, x_max].

  Args:
    encoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
      code_size] representing the location of the objects.
    box_coder: a BoxCoder object.
    anchors: a BoxList of anchors used to encode `encoded_boxes`.

  Returns:
    decoded_boxes: a float32 tensor of shape [batch_size, num_anchors,
      coder_size] representing the corners of the objects in the order
      of [y_min, x_min, y_max, x_max].

  Raises:
    ValueError: if batch sizes of the inputs are inconsistent, or if
    the number of anchors inferred from encoded_boxes and anchors are
    inconsistent.
  """
    encoded_boxes.get_shape().assert_has_rank(3)
    if (shape_utils.get_dim_as_int(encoded_boxes.get_shape()[1]) !=
            anchors.num_boxes_static()):
        raise ValueError(
            'The number of anchors inferred from encoded_boxes'
            ' and anchors are inconsistent: shape[1] of encoded_boxes'
            ' %s should be equal to the number of anchors: %s.' %
            (shape_utils.get_dim_as_int(
                encoded_boxes.get_shape()[1]), anchors.num_boxes_static()))

    decoded_boxes = tf.stack([
        box_coder.decode(boxes, anchors).get()
        for boxes in tf.unstack(encoded_boxes)
    ])
    return decoded_boxes
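A usage sketch for batch_decode, assuming the FasterRcnnBoxCoder and BoxList classes from the same codebase; the batch size and anchors are illustrative:

import tensorflow as tf
from object_detection.box_coders import faster_rcnn_box_coder
from object_detection.core import box_list

anchors = box_list.BoxList(tf.constant([[0.0, 0.0, 1.0, 1.0],
                                        [0.0, 0.0, 0.5, 0.5]]))
coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
encoded_boxes = tf.zeros([8, 2, coder.code_size])  # batch of 8 images, 2 anchors
decoded_boxes = batch_decode(encoded_boxes, coder, anchors)
# decoded_boxes has shape [8, 2, 4] in [y_min, x_min, y_max, x_max] order.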
def pad_input_data_to_static_shapes(tensor_dict, max_num_boxes, num_classes,
                                    spatial_image_shape=None):
  """Pads input tensors to static shapes.

  In case num_additional_channels > 0, we assume that the additional channels
  have already been concatenated to the base image.

  Args:
    tensor_dict: Tensor dictionary of input data
    max_num_boxes: Max number of groundtruth boxes needed to compute shapes for
      padding.
    num_classes: Number of classes in the dataset needed to compute shapes for
      padding.
    spatial_image_shape: A list of two integers of the form [height, width]
      containing expected spatial shape of the image.

  Returns:
    A dictionary keyed by fields.InputDataFields containing the input tensors
    padded or clipped to the static shapes described above.

  Raises:
    ValueError: If groundtruth classes is neither rank 1 nor rank 2, or if we
      detect that additional channels have not been concatenated yet.
  """

  if not spatial_image_shape or spatial_image_shape == [-1, -1]:
    height, width = None, None
  else:
    height, width = spatial_image_shape  # pylint: disable=unpacking-non-sequence

  num_additional_channels = 0
  if fields.InputDataFields.image_additional_channels in tensor_dict:
    num_additional_channels = shape_utils.get_dim_as_int(tensor_dict[
        fields.InputDataFields.image_additional_channels].shape[2])

  # We assume that if num_additional_channels > 0, then it has already been
  # concatenated to the base image (but not the ground truth).
  num_channels = 3
  if fields.InputDataFields.image in tensor_dict:
    num_channels = shape_utils.get_dim_as_int(
        tensor_dict[fields.InputDataFields.image].shape[2])

  if num_additional_channels:
    if num_additional_channels >= num_channels:
      raise ValueError(
          'Image must be already concatenated with additional channels.')

    if (fields.InputDataFields.original_image in tensor_dict and
        shape_utils.get_dim_as_int(
            tensor_dict[fields.InputDataFields.original_image].shape[2]) ==
        num_channels):
      raise ValueError(
          'Image must be already concatenated with additional channels.')

  padding_shapes = {
      fields.InputDataFields.image: [
          height, width, num_channels
      ],
      fields.InputDataFields.original_image_spatial_shape: [2],
      fields.InputDataFields.image_additional_channels: [
          height, width, num_additional_channels
      ],
      fields.InputDataFields.source_id: [],
      fields.InputDataFields.filename: [],
      fields.InputDataFields.key: [],
      fields.InputDataFields.groundtruth_difficult: [max_num_boxes],
      fields.InputDataFields.groundtruth_boxes: [max_num_boxes, 4],
      fields.InputDataFields.groundtruth_classes: [max_num_boxes, num_classes],
      fields.InputDataFields.groundtruth_instance_masks: [
          max_num_boxes, height, width
      ],
      fields.InputDataFields.groundtruth_is_crowd: [max_num_boxes],
      fields.InputDataFields.groundtruth_group_of: [max_num_boxes],
      fields.InputDataFields.groundtruth_area: [max_num_boxes],
      fields.InputDataFields.groundtruth_weights: [max_num_boxes],
      fields.InputDataFields.groundtruth_confidences: [
          max_num_boxes, num_classes
      ],
      fields.InputDataFields.num_groundtruth_boxes: [],
      fields.InputDataFields.groundtruth_label_types: [max_num_boxes],
      fields.InputDataFields.groundtruth_label_weights: [max_num_boxes],
      fields.InputDataFields.true_image_shape: [3],
      fields.InputDataFields.groundtruth_image_classes: [num_classes],
      fields.InputDataFields.groundtruth_image_confidences: [num_classes],
  }

  if fields.InputDataFields.original_image in tensor_dict:
    padding_shapes[fields.InputDataFields.original_image] = [
        height, width,
        shape_utils.get_dim_as_int(tensor_dict[fields.InputDataFields.
                                               original_image].shape[2])
    ]
  if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
    tensor_shape = (
        tensor_dict[fields.InputDataFields.groundtruth_keypoints].shape)
    padding_shape = [max_num_boxes,
                     shape_utils.get_dim_as_int(tensor_shape[1]),
                     shape_utils.get_dim_as_int(tensor_shape[2])]
    padding_shapes[fields.InputDataFields.groundtruth_keypoints] = padding_shape
  if fields.InputDataFields.groundtruth_keypoint_visibilities in tensor_dict:
    tensor_shape = tensor_dict[fields.InputDataFields.
                               groundtruth_keypoint_visibilities].shape
    padding_shape = [max_num_boxes, shape_utils.get_dim_as_int(tensor_shape[1])]
    padding_shapes[fields.InputDataFields.
                   groundtruth_keypoint_visibilities] = padding_shape

  padded_tensor_dict = {}
  for tensor_name in tensor_dict:
    padded_tensor_dict[tensor_name] = shape_utils.pad_or_clip_nd(
        tensor_dict[tensor_name], padding_shapes[tensor_name])

  # Make sure that the number of groundtruth boxes now reflects the
  # padded/clipped tensors.
  if fields.InputDataFields.num_groundtruth_boxes in padded_tensor_dict:
    padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = (
        tf.minimum(
            padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
            max_num_boxes))
  return padded_tensor_dict
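A small sketch of padding a ground-truth dictionary to static shapes, assuming the standard_fields module from the same codebase; the field values and sizes are illustrative:

import tensorflow as tf
from object_detection.core import standard_fields as fields

tensor_dict = {
    fields.InputDataFields.image: tf.zeros([320, 320, 3], tf.float32),
    fields.InputDataFields.groundtruth_boxes: tf.zeros([7, 4], tf.float32),
    fields.InputDataFields.groundtruth_classes: tf.zeros([7, 90], tf.float32),
    fields.InputDataFields.num_groundtruth_boxes: tf.constant(7),
}
padded = pad_input_data_to_static_shapes(tensor_dict, max_num_boxes=100,
                                         num_classes=90,
                                         spatial_image_shape=[320, 320])
# padded[fields.InputDataFields.groundtruth_boxes] now has static shape [100, 4].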
    def build(self, input_shapes):
        num_conv_channels = self._mask_prediction_conv_depth
        if num_conv_channels == 0:
            num_feature_channels = input_shapes.as_list()[3]
            num_conv_channels = self._get_mask_predictor_conv_depth(
                num_feature_channels, self._num_classes)

        for i in range(self._mask_prediction_num_conv_layers - 1):
            self._mask_predictor_layers.append(
                tf.keras.layers.Conv2D(
                    num_conv_channels, [3, 3],
                    padding='SAME',
                    name='MaskPredictor_conv2d_{}'.format(i),
                    **self._conv_hyperparams.params()))
            self._mask_predictor_layers.append(
                self._conv_hyperparams.build_batch_norm(
                    training=(self._is_training
                              and not self._freeze_batchnorm),
                    name='MaskPredictor_batchnorm_{}'.format(i)))
            self._mask_predictor_layers.append(
                self._conv_hyperparams.build_activation_layer(
                    name='MaskPredictor_activation_{}'.format(i)))

        if self._convolve_then_upsample:
            # Replace Transposed Convolution with a Nearest Neighbor upsampling step
            # followed by 3x3 convolution.
            height_scale = self._mask_height / shape_utils.get_dim_as_int(
                input_shapes[1])
            width_scale = self._mask_width / shape_utils.get_dim_as_int(
                input_shapes[2])
            # pylint: disable=g-long-lambda
            self._mask_predictor_layers.append(
                tf.keras.layers.Lambda(
                    lambda features: ops.nearest_neighbor_upsampling(
                        features,
                        height_scale=height_scale,
                        width_scale=width_scale)))
            # pylint: enable=g-long-lambda
            self._mask_predictor_layers.append(
                tf.keras.layers.Conv2D(num_conv_channels, [3, 3],
                                       padding='SAME',
                                       name='MaskPredictor_upsample_conv2d',
                                       **self._conv_hyperparams.params()))
            self._mask_predictor_layers.append(
                self._conv_hyperparams.build_batch_norm(
                    training=(self._is_training
                              and not self._freeze_batchnorm),
                    name='MaskPredictor_upsample_batchnorm'))
            self._mask_predictor_layers.append(
                self._conv_hyperparams.build_activation_layer(
                    name='MaskPredictor_upsample_activation'))

        num_masks = 1 if self._masks_are_class_agnostic else self._num_classes
        self._mask_predictor_layers.append(
            tf.keras.layers.Conv2D(
                num_masks, [3, 3],
                padding='SAME',
                name='MaskPredictor_last_conv2d',
                **self._conv_hyperparams.params(use_bias=True)))

        self.built = True
    def _predict(self, image_features, num_predictions_per_location_list):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels] containing features for a batch of images. Note that
        when not all tensors in the list have the same number of channels, an
        additional projection layer will be added on top of each such tensor to
        generate a feature map with a number of channels consistent with the
        majority.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map. Note that all values must be the same since the weights are
        shared.

    Returns:
      A dictionary containing:
        box_encodings: A list of float tensors of shape
          [batch_size, num_anchors_i, code_size] representing the location of
          the objects. Each entry in the list corresponds to a feature map in
          the input `image_features` list.
        class_predictions_with_background: A list of float tensors of shape
          [batch_size, num_anchors_i, num_classes + 1] representing the class
          predictions for the proposals. Each entry in the list corresponds to a
          feature map in the input `image_features` list.
        (optional) Predictions from other heads.
          E.g., mask_predictions: A list of float tensors of shape
          [batch_size, num_anchors_i, num_classes, mask_height, mask_width].


    Raises:
      ValueError: If the num predictions per locations differs between the
        feature maps.
    """
        if len(set(num_predictions_per_location_list)) > 1:
            raise ValueError(
                'num predictions per location must be the same for all '
                'feature maps, found: {}'.format(
                    num_predictions_per_location_list))
        feature_channels = [
            shape_utils.get_dim_as_int(image_feature.shape[3])
            for image_feature in image_features
        ]
        has_different_feature_channels = len(set(feature_channels)) > 1
        if has_different_feature_channels:
            inserted_layer_counter = 0
            target_channel = max(set(feature_channels),
                                 key=feature_channels.count)
            tf.logging.info(
                'Not all feature maps have the same number of '
                'channels, found: {}, appending additional projection '
                'layers to bring all feature maps to uniformly have {} '
                'channels.'.format(feature_channels, target_channel))
        else:
            # Placeholder variables if has_different_feature_channels is False.
            target_channel = -1
            inserted_layer_counter = -1
        predictions = {
            BOX_3D_ENCODINGS: [],
            CLASS_PREDICTIONS_WITH_BACKGROUND: [],
        }
        for head_name in self._other_heads.keys():
            predictions[head_name] = []
        for feature_index, (image_feature,
                            num_predictions_per_location) in enumerate(
                                zip(image_features,
                                    num_predictions_per_location_list)):
            with tf.variable_scope('WeightSharedConvolutionalBoxPredictor',
                                   reuse=tf.AUTO_REUSE):
                with slim.arg_scope(self._conv_hyperparams_fn()):
                    # TODO(wangjiang) Pass is_training to the head class directly.
                    with slim.arg_scope([slim.dropout],
                                        is_training=self._is_training):
                        (image_feature, inserted_layer_counter
                         ) = self._insert_additional_projection_layer(
                             image_feature, inserted_layer_counter,
                             target_channel)
                        if self._share_prediction_tower:
                            box_tower_scope = 'PredictionTower'
                        else:
                            box_tower_scope = 'BoxPredictionTower'
                        box_tower_feature = self._compute_base_tower(
                            tower_name_scope=box_tower_scope,
                            image_feature=image_feature,
                            feature_index=feature_index)
                    box_3d_encodings = self._box_prediction_head.predict(  # _box_3d_prediction_head.predict(
                        features=box_tower_feature,
                        num_predictions_per_location=
                        num_predictions_per_location)
                    predictions[BOX_3D_ENCODINGS].append(box_3d_encodings)
                    sorted_keys = sorted(self._other_heads.keys())
                    sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND)
                    for head_name in sorted_keys:
                        if head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                            head_obj = self._class_prediction_head
                        else:
                            head_obj = self._other_heads[head_name]
                        prediction = self._predict_head(
                            head_name=head_name,
                            head_obj=head_obj,
                            image_feature=image_feature,
                            box_tower_feature=box_tower_feature,
                            feature_index=feature_index,
                            num_predictions_per_location=
                            num_predictions_per_location)
                        predictions[head_name].append(prediction)
        return predictions
Example #8
    def _match(self, similarity_matrix, valid_rows):
        """Tries to match each column of the similarity matrix to a row.

    Args:
      similarity_matrix: tensor of shape [N, M] representing any similarity
        metric.
      valid_rows: a boolean tensor of shape [N] indicating valid rows.

    Returns:
      Match object with corresponding matches for each of M columns.
    """
        def _match_when_rows_are_empty():
            """Performs matching when the rows of similarity matrix are empty.

      When the rows are empty, all detections are false positives. So we return
      a tensor of -1's to indicate that the columns do not match to any rows.

      Returns:
        matches:  int32 tensor indicating the row each column matches to.
      """
            similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
                similarity_matrix)
            return -1 * tf.ones([similarity_matrix_shape[1]], dtype=tf.int32)

        def _match_when_rows_are_non_empty():
            """Performs matching when the rows of similarity matrix are non empty.

      Returns:
        matches:  int32 tensor indicating the row each column matches to.
      """
            # Matches for each column
            matches = tf.argmax(similarity_matrix, 0, output_type=tf.int32)

            # Deal with matched and unmatched threshold
            if self._matched_threshold is not None:
                # Get logical indices of ignored and unmatched columns as tf.int64
                matched_vals = tf.reduce_max(similarity_matrix, 0)
                below_unmatched_threshold = tf.greater(
                    self._unmatched_threshold, matched_vals)
                between_thresholds = tf.logical_and(
                    tf.greater_equal(matched_vals, self._unmatched_threshold),
                    tf.greater(self._matched_threshold, matched_vals))

                if self._negatives_lower_than_unmatched:
                    matches = self._set_values_using_indicator(
                        matches, below_unmatched_threshold, -1)
                    matches = self._set_values_using_indicator(
                        matches, between_thresholds, -2)
                else:
                    matches = self._set_values_using_indicator(
                        matches, below_unmatched_threshold, -2)
                    matches = self._set_values_using_indicator(
                        matches, between_thresholds, -1)

            if self._force_match_for_each_row:
                similarity_matrix_shape = shape_utils.combined_static_and_dynamic_shape(
                    similarity_matrix)
                force_match_column_ids = tf.argmax(similarity_matrix,
                                                   1,
                                                   output_type=tf.int32)
                force_match_column_indicators = (
                    tf.one_hot(force_match_column_ids,
                               depth=similarity_matrix_shape[1]) *
                    tf.cast(tf.expand_dims(valid_rows, axis=-1),
                            dtype=tf.float32))
                force_match_row_ids = tf.argmax(force_match_column_indicators,
                                                0,
                                                output_type=tf.int32)
                force_match_column_mask = tf.cast(
                    tf.reduce_max(force_match_column_indicators, 0), tf.bool)
                final_matches = tf.where(force_match_column_mask,
                                         force_match_row_ids, matches)
                return final_matches
            else:
                return matches

        if similarity_matrix.shape.is_fully_defined():
            if shape_utils.get_dim_as_int(similarity_matrix.shape[0]) == 0:
                return _match_when_rows_are_empty()
            else:
                return _match_when_rows_are_non_empty()
        else:
            return tf.cond(tf.greater(tf.shape(similarity_matrix)[0],
                                      0), _match_when_rows_are_non_empty,
                           _match_when_rows_are_empty)
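A sketch of exercising the matcher above; the ArgMaxMatcher constructor is assumed from the TensorFlow Object Detection API, and the thresholds and similarity values are illustrative. In normal use _match is reached through the public match() wrapper rather than called directly:

import tensorflow as tf
from object_detection.matchers import argmax_matcher

matcher = argmax_matcher.ArgMaxMatcher(matched_threshold=0.5,
                                       unmatched_threshold=0.3)
# 2 groundtruth rows, 3 anchor columns.
similarity = tf.constant([[0.9, 0.10, 0.40],
                          [0.2, 0.70, 0.45]])
matches = matcher._match(similarity, valid_rows=tf.ones([2], dtype=tf.bool))
# matches evaluates to [0, 1, -2]: columns 0 and 1 match rows 0 and 1, while
# column 2 (max similarity 0.45, between the thresholds) is marked ignored.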
Example #9
        def _single_image_nms_fn(args):
            """Runs NMS on a single image and returns padded output.

      Args:
        args: A list of tensors consisting of the following:
          per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
            detections. If `q` is 1 then same boxes are used for all classes
            otherwise, if `q` is equal to number of classes, class-specific
            boxes are used.
          per_image_boxes_3d - A [num_anchors, q, 6] float32 tensor containing
            the 3D box parameters corresponding to `per_image_boxes`.
          per_image_scores - A [num_anchors, num_classes] float32 tensor
            containing the scores for each of the `num_anchors` detections.
          per_image_clip_window - A 1D float32 tensor of the form
            [ymin, xmin, ymax, xmax] representing the window to clip the boxes
            to.
          per_image_additional_fields - (optional) A variable number of float32
            tensors each with size [num_anchors, ...].
          per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of
            shape [batch_size] representing the number of valid boxes to be
            considered for each image in the batch.  This parameter allows for
            ignoring zero paddings.

      Returns:
        'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
          non-max suppressed boxes.
        'nmsed_scores': A [max_detections] float32 tensor containing the scores
          for the boxes.
        'nmsed_classes': A [max_detections] float32 tensor containing the class
          for boxes.
        'nmsed_boxes_3d': A [max_detections, 6] float32 tensor containing the
          3D box parameters for each selected box.
        'nmsed_additional_fields':  (optional) A variable number of float32
          tensors each with size [max_detections, ...] corresponding to the
          input `per_image_additional_fields`.
        'num_detections': A [batch_size] int32 tensor indicating the number of
          valid detections per batch item. Only the top num_detections[i]
          entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The
          rest of the entries are zero paddings.
      """
            per_image_boxes = args[0]
            per_image_boxes_3d = args[1]
            per_image_scores = args[2]
            per_image_clip_window = args[3]
            # Make sure that the order of elements passed in args is aligned with
            # the iteration order of ordered_additional_fields
            per_image_additional_fields = {
                key: value
                for key, value in zip(ordered_additional_fields, args[4:-1])
            }
            per_image_num_valid_boxes = args[-1]
            if use_static_shapes:
                total_proposals = tf.shape(per_image_scores)
                per_image_scores = tf.where(
                    tf.less(tf.range(total_proposals[0]),
                            per_image_num_valid_boxes), per_image_scores,
                    tf.fill(total_proposals,
                            np.finfo('float32').min))
            else:
                per_image_boxes = tf.reshape(
                    tf.slice(per_image_boxes, 3 * [0],
                             tf.stack([per_image_num_valid_boxes, -1, -1])),
                    [-1, q, 4])
                per_image_boxes_3d = tf.reshape(
                    tf.slice(per_image_boxes_3d, 3 * [0],
                             tf.stack([per_image_num_valid_boxes, -1, -1])),
                    [-1, q, 6])
                per_image_scores = tf.reshape(
                    tf.slice(per_image_scores, [0, 0],
                             tf.stack([per_image_num_valid_boxes, -1])),
                    [-1, num_classes])
                if per_image_additional_fields is not None:
                    for key, tensor in per_image_additional_fields.items():
                        additional_field_shape = tensor.get_shape()
                        additional_field_dim = len(additional_field_shape)
                        per_image_additional_fields[key] = tf.reshape(
                            tf.slice(
                                per_image_additional_fields[key],
                                additional_field_dim * [0],
                                tf.stack([per_image_num_valid_boxes] +
                                         (additional_field_dim - 1) * [-1])),
                            [-1] + [
                                shape_utils.get_dim_as_int(dim)
                                for dim in additional_field_shape[1:]
                            ])
            if use_class_agnostic_nms:
                nmsed_boxlist, num_valid_nms_boxes = class_agnostic_non_max_suppression(
                    per_image_boxes,
                    per_image_boxes_3d,
                    per_image_scores,
                    score_thresh,
                    iou_thresh,
                    max_classes_per_detection,
                    max_total_size,
                    clip_window=per_image_clip_window,
                    change_coordinate_frame=change_coordinate_frame,
                    pad_to_max_output_size=use_static_shapes,
                    additional_fields=per_image_additional_fields)
            else:
                nmsed_boxlist, num_valid_nms_boxes = multiclass_non_max_suppression(
                    per_image_boxes,
                    per_image_boxes_3d,
                    per_image_scores,
                    score_thresh,
                    iou_thresh,
                    max_size_per_class,
                    max_total_size,
                    clip_window=per_image_clip_window,
                    change_coordinate_frame=change_coordinate_frame,
                    pad_to_max_output_size=use_static_shapes,
                    additional_fields=per_image_additional_fields)

            if not use_static_shapes:
                nmsed_boxlist = box_list_ops.pad_or_clip_box_list(
                    nmsed_boxlist, max_total_size)
            num_detections = num_valid_nms_boxes
            nmsed_boxes = nmsed_boxlist.get()
            nmsed_boxes_3d = nmsed_boxlist.get_field(
                fields.BoxListFields.boxes_3d)
            nmsed_scores = nmsed_boxlist.get_field(fields.BoxListFields.scores)
            nmsed_classes = nmsed_boxlist.get_field(
                fields.BoxListFields.classes)
            nmsed_additional_fields = []
            # Sorting is needed here to ensure that the values stored in
            # nmsed_additional_fields are always kept in the same order
            # across different execution runs.
            for key in sorted(per_image_additional_fields.keys()):
                nmsed_additional_fields.append(nmsed_boxlist.get_field(key))
            return (
                [nmsed_boxes, nmsed_boxes_3d, nmsed_scores, nmsed_classes] +
                nmsed_additional_fields + [num_detections])
Example #10
def batch_multiclass_non_max_suppression(boxes,
                                         boxes_3d,
                                         scores,
                                         score_thresh,
                                         iou_thresh,
                                         max_size_per_class,
                                         max_total_size=0,
                                         clip_window=None,
                                         change_coordinate_frame=False,
                                         num_valid_boxes=None,
                                         additional_fields=None,
                                         scope=None,
                                         use_static_shapes=False,
                                         parallel_iterations=32,
                                         use_class_agnostic_nms=False,
                                         max_classes_per_detection=1):
    """Multi-class version of non maximum suppression that operates on a batch.

  This op is similar to `multiclass_non_max_suppression` but operates on a batch
  of boxes and scores. See documentation for `multiclass_non_max_suppression`
  for details.

  Args:
    boxes: A [batch_size, num_anchors, q, 4] float32 tensor containing
      detections. If `q` is 1 then same boxes are used for all classes
      otherwise, if `q` is equal to number of classes, class-specific boxes are
      used.
    boxes_3d: A [batch_size, num_anchors, q, 6] float32 tensor containing the
      3D box parameters corresponding to `boxes`. `q` can be either number of
      classes or 1 depending on whether a separate 3D box is predicted per
      class.
    scores: A [batch_size, num_anchors, num_classes] float32 tensor containing
      the scores for each of the `num_anchors` detections. The scores have to be
      non-negative when use_static_shapes is set True.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of shape [batch_size, 4]  where each entry is
      of the form [y_min, x_min, y_max, x_max] representing the window to clip
      boxes to before performing non-max suppression. This argument can also be
      a tensor of shape [4] in which case, the same clip window is applied to
      all images in the batch. If clip_window is None, all boxes are used to
      perform non-max suppression.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window is
      provided)
    num_valid_boxes: (optional) a Tensor of type `int32`. A 1-D tensor of shape
      [batch_size] representing the number of valid boxes to be considered for
      each image in the batch.  This parameter allows for ignoring zero
      paddings.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose dimensions are [batch_size, num_anchors, ...].
    scope: tf scope name.
    use_static_shapes: If true, the output nmsed boxes are padded to be of
      length `max_size_per_class` and it doesn't clip boxes to max_total_size.
      Defaults to false.
    parallel_iterations: (optional) number of batch items to process in
      parallel.
    use_class_agnostic_nms: If true, this uses class-agnostic non max
      suppression
    max_classes_per_detection: Maximum number of retained classes per detection
      box in class-agnostic NMS.

  Returns:
    'nmsed_boxes': A [batch_size, max_detections, 4] float32 tensor
      containing the non-max suppressed boxes.
    'nmsed_scores': A [batch_size, max_detections] float32 tensor containing
      the scores for the boxes.
    'nmsed_classes': A [batch_size, max_detections] float32 tensor
      containing the class for boxes.
    'nmsed_boxes_3d': A [batch_size, max_detections, 6] float32 tensor
      containing the 3D box parameters for the selected boxes.
    'nmsed_additional_fields': (optional) a dictionary of
      [batch_size, max_detections, ...] float32 tensors corresponding to the
      tensors specified in the input `additional_fields`. This is not returned
      if input `additional_fields` is None.
    'num_detections': A [batch_size] int32 tensor indicating the number of
      valid detections per batch item. Only the top num_detections[i] entries in
      nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The rest of the
      entries are zero paddings.

  Raises:
    ValueError: if `q` in boxes.shape is not 1 or not equal to number of
      classes as inferred from scores.shape.
  """
    q = shape_utils.get_dim_as_int(boxes.shape[2])
    num_classes = shape_utils.get_dim_as_int(scores.shape[2])
    if q != 1 and q != num_classes:
        raise ValueError('third dimension of boxes must be either 1 or equal '
                         'to the third dimension of scores')
    if change_coordinate_frame and clip_window is None:
        raise ValueError(
            'if change_coordinate_frame is True, then a clip_window '
            'must be specified.')

    # Create ordered dictionary using the sorted keys from
    # additional fields to ensure getting the same key value assignment
    # in _single_image_nms_fn(). The dictionary is thus a sorted version of
    # additional_fields.
    if additional_fields is None:
        ordered_additional_fields = {}
    else:
        ordered_additional_fields = collections.OrderedDict(
            sorted(additional_fields.items(), key=lambda item: item[0]))
    del additional_fields
    with tf.name_scope(scope, 'BatchMultiClassNonMaxSuppression'):
        boxes_shape = boxes.shape
        batch_size = shape_utils.get_dim_as_int(boxes_shape[0])
        num_anchors = shape_utils.get_dim_as_int(boxes_shape[1])

        if batch_size is None:
            batch_size = tf.shape(boxes)[0]
        if num_anchors is None:
            num_anchors = tf.shape(boxes)[1]

        # If num_valid_boxes isn't provided, create one and mark all boxes as
        # valid.
        if num_valid_boxes is None:
            num_valid_boxes = tf.ones([batch_size],
                                      dtype=tf.int32) * num_anchors

        if clip_window is None:
            clip_window = tf.stack([
                tf.reduce_min(boxes[:, :, :, 0]),
                tf.reduce_min(boxes[:, :, :, 1]),
                tf.reduce_max(boxes[:, :, :, 2]),
                tf.reduce_max(boxes[:, :, :, 3])
            ])
        if clip_window.shape.ndims == 1:
            clip_window = tf.tile(tf.expand_dims(clip_window, 0),
                                  [batch_size, 1])

        def _single_image_nms_fn(args):
            """Runs NMS on a single image and returns padded output.

      Args:
        args: A list of tensors consisting of the following:
          per_image_boxes - A [num_anchors, q, 4] float32 tensor containing
            detections. If `q` is 1 then same boxes are used for all classes
            otherwise, if `q` is equal to number of classes, class-specific
            boxes are used.
          per_image_boxes_3d - A [num_anchors, q, 6] float32 tensor containing
            the 3D box parameters corresponding to `per_image_boxes`.
          per_image_scores - A [num_anchors, num_classes] float32 tensor
            containing the scores for each of the `num_anchors` detections.
          per_image_clip_window - A 1D float32 tensor of the form
            [ymin, xmin, ymax, xmax] representing the window to clip the boxes
            to.
          per_image_additional_fields - (optional) A variable number of float32
            tensors each with size [num_anchors, ...].
          per_image_num_valid_boxes - A tensor of type `int32`. A 1-D tensor of
            shape [batch_size] representing the number of valid boxes to be
            considered for each image in the batch.  This parameter allows for
            ignoring zero paddings.

      Returns:
        'nmsed_boxes': A [max_detections, 4] float32 tensor containing the
          non-max suppressed boxes.
        'nmsed_scores': A [max_detections] float32 tensor containing the scores
          for the boxes.
        'nmsed_classes': A [max_detections] float32 tensor containing the class
          for boxes.
        'nmsed_boxes_3d': A [max_detections, 6] float32 tensor containing the
          3D box parameters for each selected box.
        'nmsed_additional_fields':  (optional) A variable number of float32
          tensors each with size [max_detections, ...] corresponding to the
          input `per_image_additional_fields`.
        'num_detections': A [batch_size] int32 tensor indicating the number of
          valid detections per batch item. Only the top num_detections[i]
          entries in nms_boxes[i], nms_scores[i] and nms_class[i] are valid. The
          rest of the entries are zero paddings.
      """
            per_image_boxes = args[0]
            per_image_boxes_3d = args[1]
            per_image_scores = args[2]
            per_image_clip_window = args[3]
            # Make sure that the order of elements passed in args is aligned with
            # the iteration order of ordered_additional_fields
            per_image_additional_fields = {
                key: value
                for key, value in zip(ordered_additional_fields, args[4:-1])
            }
            per_image_num_valid_boxes = args[-1]
            if use_static_shapes:
                total_proposals = tf.shape(per_image_scores)
                per_image_scores = tf.where(
                    tf.less(tf.range(total_proposals[0]),
                            per_image_num_valid_boxes), per_image_scores,
                    tf.fill(total_proposals,
                            np.finfo('float32').min))
            else:
                per_image_boxes = tf.reshape(
                    tf.slice(per_image_boxes, 3 * [0],
                             tf.stack([per_image_num_valid_boxes, -1, -1])),
                    [-1, q, 4])
                per_image_boxes_3d = tf.reshape(
                    tf.slice(per_image_boxes_3d, 3 * [0],
                             tf.stack([per_image_num_valid_boxes, -1, -1])),
                    [-1, q, 6])
                per_image_scores = tf.reshape(
                    tf.slice(per_image_scores, [0, 0],
                             tf.stack([per_image_num_valid_boxes, -1])),
                    [-1, num_classes])
                if per_image_additional_fields is not None:
                    for key, tensor in per_image_additional_fields.items():
                        additional_field_shape = tensor.get_shape()
                        additional_field_dim = len(additional_field_shape)
                        per_image_additional_fields[key] = tf.reshape(
                            tf.slice(
                                per_image_additional_fields[key],
                                additional_field_dim * [0],
                                tf.stack([per_image_num_valid_boxes] +
                                         (additional_field_dim - 1) * [-1])),
                            [-1] + [
                                shape_utils.get_dim_as_int(dim)
                                for dim in additional_field_shape[1:]
                            ])
            if use_class_agnostic_nms:
                nmsed_boxlist, num_valid_nms_boxes = class_agnostic_non_max_suppression(
                    per_image_boxes,
                    per_image_boxes_3d,
                    per_image_scores,
                    score_thresh,
                    iou_thresh,
                    max_classes_per_detection,
                    max_total_size,
                    clip_window=per_image_clip_window,
                    change_coordinate_frame=change_coordinate_frame,
                    pad_to_max_output_size=use_static_shapes,
                    additional_fields=per_image_additional_fields)
            else:
                nmsed_boxlist, num_valid_nms_boxes = multiclass_non_max_suppression(
                    per_image_boxes,
                    per_image_boxes_3d,
                    per_image_scores,
                    score_thresh,
                    iou_thresh,
                    max_size_per_class,
                    max_total_size,
                    clip_window=per_image_clip_window,
                    change_coordinate_frame=change_coordinate_frame,
                    pad_to_max_output_size=use_static_shapes,
                    additional_fields=per_image_additional_fields)

            if not use_static_shapes:
                nmsed_boxlist = box_list_ops.pad_or_clip_box_list(
                    nmsed_boxlist, max_total_size)
            num_detections = num_valid_nms_boxes
            nmsed_boxes = nmsed_boxlist.get()
            nmsed_boxes_3d = nmsed_boxlist.get_field(
                fields.BoxListFields.boxes_3d)
            nmsed_scores = nmsed_boxlist.get_field(fields.BoxListFields.scores)
            nmsed_classes = nmsed_boxlist.get_field(
                fields.BoxListFields.classes)
            nmsed_additional_fields = []
            # Sorting is needed here to ensure that the values stored in
            # nmsed_additional_fields are always kept in the same order
            # across different execution runs.
            for key in sorted(per_image_additional_fields.keys()):
                nmsed_additional_fields.append(nmsed_boxlist.get_field(key))
            return (
                [nmsed_boxes, nmsed_boxes_3d, nmsed_scores, nmsed_classes] +
                nmsed_additional_fields + [num_detections])

        num_additional_fields = 0
        if ordered_additional_fields:
            num_additional_fields = len(ordered_additional_fields)
        num_nmsed_outputs = 4 + num_additional_fields

        batch_outputs = shape_utils.static_or_dynamic_map_fn(
            _single_image_nms_fn,
            elems=([boxes, boxes_3d, scores, clip_window] +
                   list(ordered_additional_fields.values()) +
                   [num_valid_boxes]),
            dtype=(num_nmsed_outputs * [tf.float32] + [tf.int32]),
            parallel_iterations=parallel_iterations)

        batch_nmsed_boxes = batch_outputs[0]
        batch_nmsed_boxes_3d = batch_outputs[1]
        batch_nmsed_scores = batch_outputs[2]
        batch_nmsed_classes = batch_outputs[3]
        batch_nmsed_values = batch_outputs[4:-1]

        batch_nmsed_additional_fields = {}
        if num_additional_fields > 0:
            # Sort the keys to ensure arranging elements in same order as
            # in _single_image_nms_fn.
            batch_nmsed_keys = list(ordered_additional_fields.keys())
            for i in range(len(batch_nmsed_keys)):
                batch_nmsed_additional_fields[
                    batch_nmsed_keys[i]] = batch_nmsed_values[i]

        batch_num_detections = batch_outputs[-1]

        if not ordered_additional_fields:
            batch_nmsed_additional_fields = None

        return (batch_nmsed_boxes, batch_nmsed_boxes_3d, batch_nmsed_scores,
                batch_nmsed_classes, batch_nmsed_additional_fields,
                batch_num_detections)
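A usage sketch for the batched NMS above (TensorFlow 1.x assumed; the shapes and thresholds are illustrative and use the boxes_3d variant defined here):

import tensorflow as tf

batch_size, num_anchors, num_classes = 2, 100, 3
boxes = tf.random.uniform([batch_size, num_anchors, 1, 4])     # q = 1
boxes_3d = tf.random.uniform([batch_size, num_anchors, 1, 6])
scores = tf.random.uniform([batch_size, num_anchors, num_classes])
(nmsed_boxes, nmsed_boxes_3d, nmsed_scores, nmsed_classes,
 nmsed_additional_fields, num_detections) = batch_multiclass_non_max_suppression(
     boxes, boxes_3d, scores,
     score_thresh=0.1, iou_thresh=0.5,
     max_size_per_class=10, max_total_size=20)
# nmsed_boxes: [2, 20, 4]; num_detections: [2] valid detections per image.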
Example #11
def multiclass_non_max_suppression(boxes,
                                   boxes_3d,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   change_coordinate_frame=False,
                                   boundaries=None,
                                   pad_to_max_output_size=False,
                                   additional_fields=None,
                                   scope=None):
    """Multi-class version of non maximum suppression.

  This op greedily selects a subset of detection bounding boxes, pruning
  away boxes that have high IOU (intersection over union) overlap (> thresh)
  with already selected boxes.  It operates independently for each class for
  which scores are provided (via the scores field of the input box_list),
  pruning boxes with score less than a provided threshold prior to
  applying NMS.

  Please note that this operation is performed on *all* classes, therefore any
  background classes should be removed prior to calling this function.

  Selected boxes are guaranteed to be sorted in decreasing order by score (but
  the sort is not guaranteed to be stable).

  Args:
    boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be either
      number of classes or 1 depending on whether a separate box is predicted
      per class.
    boxes_3d: A [k, q, 6] float32 tensor containing the 3D box parameters
      corresponding to `boxes`. `q` can be either number of classes or 1
      depending on whether a separate 3D box is predicted per class.
    scores: A [k, num_classes] float32 tensor containing the scores for each of
      the k detections. The scores have to be non-negative when
      pad_to_max_output_size is True.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip and normalize boxes to before performing
      non-max suppression.
    change_coordinate_frame: Whether to normalize coordinates after clipping
      relative to clip_window (this can only be set to True if a clip_window
      is provided)
    boundaries: (optional) a [k, q, boundary_height, boundary_width] float32
      tensor containing box boundaries. `q` can be either number of classes or 1
      depending on whether a separate boundary is predicted per class.
    pad_to_max_output_size: If true, the output nmsed boxes are padded to be of
      length `max_size_per_class`. Defaults to false.
    additional_fields: (optional) If not None, a dictionary that maps keys to
      tensors whose first dimensions are all of size `k`. After non-maximum
      suppression, all tensors corresponding to the selected boxes will be
      added to resulting BoxList.
    scope: name scope.

  Returns:
    A tuple of sorted_boxes and num_valid_nms_boxes. The sorted_boxes is a
      BoxList holding M boxes with a rank-1 scores field representing
      corresponding scores for each box with scores sorted in decreasing order
      and a rank-1 classes field representing a class label for each box. The
      num_valid_nms_boxes is a 0-D integer tensor representing the number of
      valid elements in `BoxList`, with the valid elements appearing first.

  Raises:
    ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have
      a valid scores field.
  """
    _validate_boxes_scores_iou_thresh(boxes, scores, iou_thresh,
                                      change_coordinate_frame, clip_window)

    with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
        num_scores = tf.shape(scores)[0]
        num_classes = shape_utils.get_dim_as_int(scores.get_shape()[1])

        selected_boxes_list = []
        num_valid_nms_boxes_cumulative = tf.constant(0)
        per_class_boxes_list = tf.unstack(boxes, axis=1)
        per_class_boxes_3d_list = tf.unstack(boxes_3d, axis=1)
        if boundaries is not None:
            per_class_boundaries_list = tf.unstack(boundaries, axis=1)
        boxes_ids = (range(num_classes)
                     if len(per_class_boxes_list) > 1 else [0] * num_classes)
        for class_idx, boxes_idx in zip(range(num_classes), boxes_ids):
            per_class_boxes = per_class_boxes_list[boxes_idx]
            boxlist_and_class_scores = box_list.BoxList(per_class_boxes)
            per_class_boxes_3d = per_class_boxes_3d_list[boxes_idx]
            boxlist_and_class_scores.add_field(fields.BoxListFields.boxes_3d,
                                               per_class_boxes_3d)
            class_scores = tf.reshape(
                tf.slice(scores, [0, class_idx], tf.stack([num_scores, 1])),
                [-1])

            boxlist_and_class_scores.add_field(fields.BoxListFields.scores,
                                               class_scores)
            if boundaries is not None:
                per_class_boundaries = per_class_boundaries_list[boxes_idx]
                boxlist_and_class_scores.add_field(
                    fields.BoxListFields.boundaries, per_class_boundaries)
            if additional_fields is not None:
                for key, tensor in additional_fields.items():
                    boxlist_and_class_scores.add_field(key, tensor)

            if pad_to_max_output_size:
                max_selection_size = max_size_per_class
                selected_indices, num_valid_nms_boxes = (
                    tf.image.non_max_suppression_padded(
                        boxlist_and_class_scores.get(),
                        boxlist_and_class_scores.get_field(
                            fields.BoxListFields.scores),
                        max_selection_size,
                        iou_threshold=iou_thresh,
                        score_threshold=score_thresh,
                        pad_to_max_output_size=True))
            else:
                max_selection_size = tf.minimum(
                    max_size_per_class, boxlist_and_class_scores.num_boxes())
                selected_indices = tf.image.non_max_suppression(
                    boxlist_and_class_scores.get(),
                    boxlist_and_class_scores.get_field(
                        fields.BoxListFields.scores),
                    max_selection_size,
                    iou_threshold=iou_thresh,
                    score_threshold=score_thresh)
                num_valid_nms_boxes = tf.shape(selected_indices)[0]
                selected_indices = tf.concat([
                    selected_indices,
                    tf.zeros(max_selection_size - num_valid_nms_boxes,
                             tf.int32)
                ], 0)
            nms_result = box_list_ops.gather(boxlist_and_class_scores,
                                             selected_indices)
            # Make the scores -1 for invalid boxes.
            valid_nms_boxes_indx = tf.less(tf.range(max_selection_size),
                                           num_valid_nms_boxes)
            nms_scores = nms_result.get_field(fields.BoxListFields.scores)
            nms_result.add_field(
                fields.BoxListFields.scores,
                tf.where(valid_nms_boxes_indx, nms_scores,
                         -1 * tf.ones(max_selection_size)))
            num_valid_nms_boxes_cumulative += num_valid_nms_boxes

            nms_result.add_field(fields.BoxListFields.classes, (tf.zeros_like(
                nms_result.get_field(fields.BoxListFields.scores)) +
                                                                class_idx))
            selected_boxes_list.append(nms_result)
        selected_boxes = box_list_ops.concatenate(selected_boxes_list)
        sorted_boxes = box_list_ops.sort_by_field(selected_boxes,
                                                  fields.BoxListFields.scores)
        if clip_window is not None:
            # When pad_to_max_output_size is False, it prunes the boxes with zero
            # area.
            sorted_boxes, num_valid_nms_boxes_cumulative = _clip_window_prune_boxes(
                sorted_boxes, clip_window, pad_to_max_output_size,
                change_coordinate_frame)

        if max_total_size:
            max_total_size = tf.minimum(max_total_size,
                                        sorted_boxes.num_boxes())
            sorted_boxes = box_list_ops.gather(sorted_boxes,
                                               tf.range(max_total_size))
            num_valid_nms_boxes_cumulative = tf.where(
                max_total_size > num_valid_nms_boxes_cumulative,
                num_valid_nms_boxes_cumulative, max_total_size)
        # Select only the valid boxes if pad_to_max_output_size is False.
        if not pad_to_max_output_size:
            sorted_boxes = box_list_ops.gather(
                sorted_boxes, tf.range(num_valid_nms_boxes_cumulative))

        return sorted_boxes, num_valid_nms_boxes_cumulative
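
A minimal standalone sketch of the padded per-class NMS pattern used in the loop above: run tf.image.non_max_suppression_padded for one class, then overwrite the scores of the padded slots with -1. The box coordinates, scores, and thresholds below are made-up illustrative values, not taken from the example.

import tensorflow as tf

boxes = tf.constant([[0.0, 0.0, 1.0, 1.0],
                     [0.0, 0.0, 0.9, 0.9],
                     [0.5, 0.5, 1.0, 1.0]])
scores = tf.constant([0.9, 0.8, 0.3])
max_selection_size = 5

# Returns indices padded to max_selection_size plus the count of valid entries.
selected_indices, num_valid = tf.image.non_max_suppression_padded(
    boxes, scores, max_selection_size,
    iou_threshold=0.6, score_threshold=0.0, pad_to_max_output_size=True)

# Gather per-box scores and mark the padded slots with -1, mirroring the
# tf.where call in the example above.
nms_scores = tf.gather(scores, selected_indices)
valid_mask = tf.less(tf.range(max_selection_size), num_valid)
nms_scores = tf.where(valid_mask, nms_scores, -tf.ones(max_selection_size))
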
Example #12
def pad_input_data_to_static_shapes(tensor_dict,
                                    max_num_boxes,
                                    num_classes,
                                    spatial_image_shape=None):
    """Pads input tensors in tensor_dict to static shapes.

    Args:
      tensor_dict: Tensor dictionary of input data.
      max_num_boxes: Max number of groundtruth boxes used to compute padding
        shapes.
      num_classes: Number of classes in the dataset used to compute padding
        shapes.
      spatial_image_shape: A list of two integers of the form [height, width]
        containing the expected spatial shape of the image.

    Returns:
      A dictionary keyed by fields.InputDataFields containing tensors padded
      or clipped to static shapes.
    """

    if not spatial_image_shape or spatial_image_shape == [-1, -1]:
        height, width = None, None
    else:
        height, width = spatial_image_shape  # pylint: disable=unpacking-non-sequence

    num_additional_channels = 0
    if fields.InputDataFields.image_additional_channels in tensor_dict:
        num_additional_channels = shape_utils.get_dim_as_int(tensor_dict[
            fields.InputDataFields.image_additional_channels].shape[2])

    # We assume that if num_additional_channels > 0, then it has already been
    # concatenated to the base image (but not the ground truth).
    num_channels = 3
    if fields.InputDataFields.image in tensor_dict:
        num_channels = shape_utils.get_dim_as_int(
            tensor_dict[fields.InputDataFields.image].shape[2])

    if num_additional_channels:
        if num_additional_channels >= num_channels:
            raise ValueError(
                'Image must already be concatenated with additional channels.')

        if (fields.InputDataFields.original_image in tensor_dict
                and shape_utils.get_dim_as_int(tensor_dict[
                    fields.InputDataFields.original_image].shape[2])
                == num_channels):
            raise ValueError(
                'Image must already be concatenated with additional channels.')

    padding_shapes = {
        fields.InputDataFields.image: [height, width, num_channels],
        fields.InputDataFields.original_image_spatial_shape: [2],
        fields.InputDataFields.image_additional_channels:
        [height, width, num_additional_channels],
        fields.InputDataFields.source_id: [],
        fields.InputDataFields.filename: [],
        fields.InputDataFields.key: [],
        fields.InputDataFields.groundtruth_difficult: [max_num_boxes],
        fields.InputDataFields.groundtruth_boxes: [max_num_boxes, 4],
        fields.InputDataFields.groundtruth_classes:
        [max_num_boxes, num_classes],
        fields.InputDataFields.groundtruth_instance_masks:
        [max_num_boxes, height, width],
        fields.InputDataFields.groundtruth_is_crowd: [max_num_boxes],
        fields.InputDataFields.groundtruth_group_of: [max_num_boxes],
        fields.InputDataFields.groundtruth_area: [max_num_boxes],
        fields.InputDataFields.groundtruth_weights: [max_num_boxes],
        fields.InputDataFields.groundtruth_confidences:
        [max_num_boxes, num_classes],
        fields.InputDataFields.num_groundtruth_boxes: [],
        fields.InputDataFields.groundtruth_label_types: [max_num_boxes],
        fields.InputDataFields.groundtruth_label_weights: [max_num_boxes],
        fields.InputDataFields.true_image_shape: [3],
        fields.InputDataFields.groundtruth_image_classes: [num_classes],
        fields.InputDataFields.groundtruth_image_confidences: [num_classes],
    }

    if fields.InputDataFields.original_image in tensor_dict:
        padding_shapes[fields.InputDataFields.original_image] = [
            height, width,
            shape_utils.get_dim_as_int(
                tensor_dict[fields.InputDataFields.original_image].shape[2])
        ]
    if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
        tensor_shape = (
            tensor_dict[fields.InputDataFields.groundtruth_keypoints].shape)
        padding_shape = [
            max_num_boxes,
            shape_utils.get_dim_as_int(tensor_shape[1]),
            shape_utils.get_dim_as_int(tensor_shape[2])
        ]
        padding_shapes[
            fields.InputDataFields.groundtruth_keypoints] = padding_shape
    if fields.InputDataFields.groundtruth_keypoint_visibilities in tensor_dict:
        tensor_shape = tensor_dict[
            fields.InputDataFields.groundtruth_keypoint_visibilities].shape
        padding_shape = [
            max_num_boxes,
            shape_utils.get_dim_as_int(tensor_shape[1])
        ]
        padding_shapes[fields.InputDataFields.
                       groundtruth_keypoint_visibilities] = padding_shape

    padded_tensor_dict = {}
    for tensor_name in tensor_dict:
        padded_tensor_dict[tensor_name] = shape_utils.pad_or_clip_nd(
            tensor_dict[tensor_name], padding_shapes[tensor_name])

    # Make sure that the number of groundtruth boxes now reflects the
    # padded/clipped tensors.
    if fields.InputDataFields.num_groundtruth_boxes in padded_tensor_dict:
        padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = (
            tf.minimum(
                padded_tensor_dict[
                    fields.InputDataFields.num_groundtruth_boxes],
                max_num_boxes))
    return padded_tensor_dict
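
A hedged usage sketch for the function above. It assumes the usual TF Object Detection API layout, where fields is object_detection.core.standard_fields; the tensor shapes and the class count (90) are illustrative only.

import tensorflow as tf
from object_detection.core import standard_fields as fields

# Illustrative input dictionary with a dynamic number (2) of groundtruth boxes.
tensor_dict = {
    fields.InputDataFields.image: tf.zeros([320, 320, 3], tf.float32),
    fields.InputDataFields.groundtruth_boxes: tf.zeros([2, 4], tf.float32),
    fields.InputDataFields.groundtruth_classes: tf.zeros([2, 90], tf.float32),
    fields.InputDataFields.num_groundtruth_boxes: tf.constant(2),
}

padded = pad_input_data_to_static_shapes(
    tensor_dict, max_num_boxes=100, num_classes=90,
    spatial_image_shape=[320, 320])
# groundtruth_boxes is zero-padded to [100, 4], groundtruth_classes to
# [100, 90], and num_groundtruth_boxes is clipped to at most max_num_boxes.
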
Example #13
def pad_input_data_to_static_shapes(tensor_dict,
                                    max_num_boxes,
                                    num_classes,
                                    spatial_image_shape=None,
                                    max_num_context_features=None,
                                    context_feature_length=None,
                                    max_dp_points=336):
  """Pads input tensors to static shapes.

  In case num_additional_channels > 0, we assume that the additional channels
  have already been concatenated to the base image.

  Args:
    tensor_dict: Tensor dictionary of input data
    max_num_boxes: Max number of groundtruth boxes needed to compute shapes for
      padding.
    num_classes: Number of classes in the dataset needed to compute shapes for
      padding.
    spatial_image_shape: A list of two integers of the form [height, width]
      containing expected spatial shape of the image.
    max_num_context_features (optional): The maximum number of context
      features needed to compute shapes for padding.
    context_feature_length (optional): The length of the context feature.
    max_dp_points (optional): The maximum number of DensePose sampled points
      per instance. The default (336) is selected since the original DensePose
      paper (https://arxiv.org/pdf/1802.00434.pdf) indicates that the maximum
      number of samples per part is 14; with 24 body parts this gives
      24 * 14 = 336 sampled points per instance.

  Returns:
    A dictionary keyed by fields.InputDataFields containing tensors padded or
    clipped to the static shapes described above.

  Raises:
    ValueError: If groundtruth classes is neither rank 1 nor rank 2, or if we
      detect that additional channels have not been concatenated yet, or if
      max_num_context_features is not specified and context_features is in the
      tensor dict.
  """

  if not spatial_image_shape or spatial_image_shape == [-1, -1]:
    height, width = None, None
  else:
    height, width = spatial_image_shape  # pylint: disable=unpacking-non-sequence

  num_additional_channels = 0
  if fields.InputDataFields.image_additional_channels in tensor_dict:
    num_additional_channels = shape_utils.get_dim_as_int(tensor_dict[
        fields.InputDataFields.image_additional_channels].shape[2])

  # We assume that if num_additional_channels > 0, then it has already been
  # concatenated to the base image (but not the ground truth).
  num_channels = 3
  if fields.InputDataFields.image in tensor_dict:
    num_channels = shape_utils.get_dim_as_int(
        tensor_dict[fields.InputDataFields.image].shape[2])

  if num_additional_channels:
    if num_additional_channels >= num_channels:
      raise ValueError(
          'Image must already be concatenated with additional channels.')

    if (fields.InputDataFields.original_image in tensor_dict and
        shape_utils.get_dim_as_int(
            tensor_dict[fields.InputDataFields.original_image].shape[2]) ==
        num_channels):
      raise ValueError(
          'Image must already be concatenated with additional channels.')

  if fields.InputDataFields.context_features in tensor_dict and (
      max_num_context_features is None):
    raise ValueError('max_num_context_features must be specified in the model '
                     'config if include_context is specified in the input '
                     'config')

  padding_shapes = {
      fields.InputDataFields.image: [height, width, num_channels],
      fields.InputDataFields.original_image_spatial_shape: [2],
      fields.InputDataFields.image_additional_channels: [
          height, width, num_additional_channels
      ],
      fields.InputDataFields.source_id: [],
      fields.InputDataFields.filename: [],
      fields.InputDataFields.key: [],
      fields.InputDataFields.groundtruth_difficult: [max_num_boxes],
      fields.InputDataFields.groundtruth_boxes: [max_num_boxes, 4],
      fields.InputDataFields.groundtruth_classes: [max_num_boxes, num_classes],
      fields.InputDataFields.groundtruth_instance_masks: [
          max_num_boxes, height, width
      ],
      fields.InputDataFields.groundtruth_is_crowd: [max_num_boxes],
      fields.InputDataFields.groundtruth_group_of: [max_num_boxes],
      fields.InputDataFields.groundtruth_area: [max_num_boxes],
      fields.InputDataFields.groundtruth_weights: [max_num_boxes],
      fields.InputDataFields.groundtruth_confidences: [
          max_num_boxes, num_classes
      ],
      fields.InputDataFields.num_groundtruth_boxes: [],
      fields.InputDataFields.groundtruth_label_types: [max_num_boxes],
      fields.InputDataFields.groundtruth_label_weights: [max_num_boxes],
      fields.InputDataFields.true_image_shape: [3],
      fields.InputDataFields.groundtruth_image_classes: [num_classes],
      fields.InputDataFields.groundtruth_image_confidences: [num_classes],
      fields.InputDataFields.groundtruth_labeled_classes: [num_classes],
  }

  if fields.InputDataFields.original_image in tensor_dict:
    padding_shapes[fields.InputDataFields.original_image] = [
        height, width,
        shape_utils.get_dim_as_int(tensor_dict[fields.InputDataFields.
                                               original_image].shape[2])
    ]
  if fields.InputDataFields.groundtruth_keypoints in tensor_dict:
    tensor_shape = (
        tensor_dict[fields.InputDataFields.groundtruth_keypoints].shape)
    padding_shape = [max_num_boxes,
                     shape_utils.get_dim_as_int(tensor_shape[1]),
                     shape_utils.get_dim_as_int(tensor_shape[2])]
    padding_shapes[fields.InputDataFields.groundtruth_keypoints] = padding_shape
  if fields.InputDataFields.groundtruth_keypoint_visibilities in tensor_dict:
    tensor_shape = tensor_dict[fields.InputDataFields.
                               groundtruth_keypoint_visibilities].shape
    padding_shape = [max_num_boxes, shape_utils.get_dim_as_int(tensor_shape[1])]
    padding_shapes[fields.InputDataFields.
                   groundtruth_keypoint_visibilities] = padding_shape

  if fields.InputDataFields.groundtruth_keypoint_weights in tensor_dict:
    tensor_shape = (
        tensor_dict[fields.InputDataFields.groundtruth_keypoint_weights].shape)
    padding_shape = [max_num_boxes, shape_utils.get_dim_as_int(tensor_shape[1])]
    padding_shapes[fields.InputDataFields.
                   groundtruth_keypoint_weights] = padding_shape
  if fields.InputDataFields.groundtruth_dp_num_points in tensor_dict:
    padding_shapes[
        fields.InputDataFields.groundtruth_dp_num_points] = [max_num_boxes]
    padding_shapes[
        fields.InputDataFields.groundtruth_dp_part_ids] = [
            max_num_boxes, max_dp_points]
    padding_shapes[
        fields.InputDataFields.groundtruth_dp_surface_coords] = [
            max_num_boxes, max_dp_points, 4]

  # Prepare for ContextRCNN related fields.
  if fields.InputDataFields.context_features in tensor_dict:
    padding_shape = [max_num_context_features, context_feature_length]
    padding_shapes[fields.InputDataFields.context_features] = padding_shape

    tensor_shape = tf.shape(
        tensor_dict[fields.InputDataFields.context_features])
    tensor_dict[fields.InputDataFields.valid_context_size] = tensor_shape[0]
    padding_shapes[fields.InputDataFields.valid_context_size] = []
  if fields.InputDataFields.context_feature_length in tensor_dict:
    padding_shapes[fields.InputDataFields.context_feature_length] = []

  if fields.InputDataFields.is_annotated in tensor_dict:
    padding_shapes[fields.InputDataFields.is_annotated] = []

  padded_tensor_dict = {}
  for tensor_name in tensor_dict:
    padded_tensor_dict[tensor_name] = shape_utils.pad_or_clip_nd(
        tensor_dict[tensor_name], padding_shapes[tensor_name])

  # Make sure that the number of groundtruth boxes now reflects the
  # padded/clipped tensors.
  if fields.InputDataFields.num_groundtruth_boxes in padded_tensor_dict:
    padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes] = (
        tf.minimum(
            padded_tensor_dict[fields.InputDataFields.num_groundtruth_boxes],
            max_num_boxes))
  return padded_tensor_dict
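
Both versions delegate the actual padding and clipping to shape_utils.pad_or_clip_nd. The short sketch below illustrates its effect as inferred from how it is used in these examples (the semantics are an assumption, not quoted from its own documentation): each dimension is zero-padded or clipped to the requested size, and a None entry leaves that dimension dynamic.

import tensorflow as tf
from object_detection.utils import shape_utils

boxes = tf.random.uniform([7, 4])
padded = shape_utils.pad_or_clip_nd(boxes, [100, 4])   # zero-padded to [100, 4]
clipped = shape_utils.pad_or_clip_nd(boxes, [5, 4])    # clipped to [5, 4]

image = tf.zeros([320, 320, 3])
# A None entry (as produced above when spatial_image_shape is unset) leaves
# that dimension unchanged.
same = shape_utils.pad_or_clip_nd(image, [None, None, 3])
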