def test_die_on_tensor_shape_with_rank_three(self):
  tensor_shape = tf.TensorShape(dims=[32, 299, 384])
  # Each accessor is checked in its own assertRaises block; otherwise the
  # first raising call would stop the remaining ones from being exercised.
  with self.assertRaises(ValueError):
    static_shape.get_batch_size(tensor_shape)
  with self.assertRaises(ValueError):
    static_shape.get_height(tensor_shape)
  with self.assertRaises(ValueError):
    static_shape.get_width(tensor_shape)
  with self.assertRaises(ValueError):
    static_shape.get_depth(tensor_shape)
def pad_to_multiple(tensor, multiple):
  """Returns the tensor zero padded to the specified multiple.

  Appends 0s to the end of the first and second dimension (height and width) of
  the tensor until both dimensions are a multiple of the input argument
  'multiple'. E.g. given an input tensor of shape [1, 3, 5, 1] and an input
  multiple of 4, pad_to_multiple will append 0s so that the resulting tensor will
  be of shape [1, 4, 8, 1].

  Args:
    tensor: rank 4 float32 tensor, where
            tensor -> [batch_size, height, width, channels].
    multiple: the multiple to pad to.

  Returns:
    padded_tensor: the tensor zero padded to the specified multiple.
  """
  tensor_shape = tensor.get_shape()
  batch_size = static_shape.get_batch_size(tensor_shape)
  tensor_height = static_shape.get_height(tensor_shape)
  tensor_width = static_shape.get_width(tensor_shape)
  tensor_depth = static_shape.get_depth(tensor_shape)

  if batch_size is None:
    batch_size = tf.shape(tensor)[0]

  if tensor_height is None:
    tensor_height = tf.shape(tensor)[1]
    padded_tensor_height = tf.to_int32(
        tf.ceil(tf.to_float(tensor_height) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_height = int(
        math.ceil(float(tensor_height) / multiple) * multiple)

  if tensor_width is None:
    tensor_width = tf.shape(tensor)[2]
    padded_tensor_width = tf.to_int32(
        tf.ceil(tf.to_float(tensor_width) / tf.to_float(multiple))) * multiple
  else:
    padded_tensor_width = int(
        math.ceil(float(tensor_width) / multiple) * multiple)

  if tensor_depth is None:
    tensor_depth = tf.shape(tensor)[3]

  # Use tf.concat instead of tf.pad to preserve static shape
  if padded_tensor_height != tensor_height:
    height_pad = tf.zeros([
        batch_size, padded_tensor_height - tensor_height, tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, height_pad], 1)
  if padded_tensor_width != tensor_width:
    width_pad = tf.zeros([
        batch_size, padded_tensor_height, padded_tensor_width - tensor_width,
        tensor_depth
    ])
    tensor = tf.concat([tensor, width_pad], 2)

  return tensor
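
# Minimal usage sketch (assumed, not part of the source): pad a [1, 3, 5, 1]
# tensor to the next multiple of 4 along height and width, reproducing the
# docstring example. Assumes TF 1.x graph mode (tf.placeholder / tf.Session).
import numpy as np


def _pad_to_multiple_example():
  inputs = tf.placeholder(tf.float32, shape=[1, 3, 5, 1])
  padded = pad_to_multiple(inputs, multiple=4)
  with tf.Session() as sess:
    result = sess.run(
        padded, feed_dict={inputs: np.ones([1, 3, 5, 1], dtype=np.float32)})
  # Height 3 is padded to 4 and width 5 to 8, so result.shape == (1, 4, 8, 1).
  return result.shape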
    def build(self, input_shapes):
        """Creates the variables of the layer."""
        if len(input_shapes) != len(self._prediction_heads[BOX_ENCODINGS]):
            raise ValueError(
                'This box predictor was constructed with %d heads, '
                'but there are %d inputs.' % (len(
                    self._prediction_heads[BOX_ENCODINGS]), len(input_shapes)))
        for stack_index, input_shape in enumerate(input_shapes):
            net = []

            # Add additional conv layers before the class predictor.
            features_depth = static_shape.get_depth(input_shape)
            depth = max(min(features_depth, self._max_depth), self._min_depth)
            tf.logging.info(
                'depth of additional conv before box predictor: {}'.format(
                    depth))

            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net.append(
                        keras.Conv2D(
                            depth, [1, 1],
                            name='SharedConvolutions_%d/Conv2d_%d_1x1_%d' %
                            (stack_index, i, depth),
                            padding='SAME',
                            **self._conv_hyperparams.params()))
                    net.append(
                        self._conv_hyperparams.build_batch_norm(
                            training=(self._is_training
                                      and not self._freeze_batchnorm),
                            name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_norm'
                            % (stack_index, i, depth)))
                    net.append(
                        self._conv_hyperparams.build_activation_layer(
                            name='SharedConvolutions_%d/Conv2d_%d_1x1_%d_activation'
                            % (stack_index, i, depth)))
            # Until certain bugs are fixed in checkpointable lists, each net
            # must be appended to self._shared_nets only after it has been
            # fully populated with layers.
            self._shared_nets.append(net)
        self.built = True
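
    # Illustrative sketch (assumed, not part of the original class): one way
    # the stacks stored in self._shared_nets during build() could be applied
    # to a feature map at prediction time.
    def _apply_shared_net(self, image_feature, stack_index):
        net = image_feature
        for layer in self._shared_nets[stack_index]:
            net = layer(net)
        return net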
    def _predict(self, image_features, num_predictions_per_location):
        """Computes encoded object locations and corresponding confidences.

        Args:
          image_features: A float tensor of shape [batch_size, height, width,
            channels] containing features for a batch of images.
          num_predictions_per_location: an integer representing the number of box
            predictions to be made per spatial location in the feature map.

        Returns:
          A dictionary containing the following tensors.
            box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
              code_size] representing the location of the objects, where
              num_anchors = feat_height * feat_width * num_predictions_per_location
            oriented_box_encodings: A float tensor of shape
              [batch_size, num_anchors, 1, 4, 2] representing the four corner
              points of each oriented box, where
              num_anchors = feat_height * feat_width * num_predictions_per_location
            class_predictions_with_background: A float tensor of shape
              [batch_size, num_anchors, num_classes + 1] representing the class
              predictions for the proposals.
        """
        features_depth = static_shape.get_depth(image_features.get_shape())
        depth = max(min(features_depth, self._max_depth), self._min_depth)

        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_features
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
            # Add additional conv layers before the predictor.
            if depth > 0 and self._num_layers_before_predictor > 0:
                for i in range(self._num_layers_before_predictor):
                    net = slim.conv2d(net,
                                      depth, [1, 1],
                                      scope='Conv2d_%d_1x1_%d' % (i, depth))
            with slim.arg_scope([slim.conv2d],
                                activation_fn=None,
                                normalizer_fn=None,
                                normalizer_params=None):
                # Regular box encoding predictions.
                box_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location * self._box_code_size,
                    [self._kernel_size, self._kernel_size],
                    scope='BoxEncodingPredictor')
                # Oriented box encoding predictions.
                oriented_box_encodings = slim.conv2d(
                    net,
                    num_predictions_per_location *
                    self._oriented_box_code_size,
                    [self._kernel_size, self._kernel_size],
                    scope='OrientedBoxEncodingPredictor')
                if self._use_dropout:
                    net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
                class_predictions_with_background = slim.conv2d(
                    net,
                    num_predictions_per_location * num_class_slots,
                    [self._kernel_size, self._kernel_size],
                    scope='ClassPredictor')
                if self._apply_sigmoid_to_scores:
                    class_predictions_with_background = tf.sigmoid(
                        class_predictions_with_background)

        combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
            image_features)
        box_encodings = tf.reshape(
            box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, self._box_code_size
            ]))
        oriented_box_encodings = tf.reshape(
            oriented_box_encodings,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                1, 4, 2
            ]))
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([
                combined_feature_map_shape[0], combined_feature_map_shape[1] *
                combined_feature_map_shape[2] * num_predictions_per_location,
                num_class_slots
            ]))
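        # Illustrative shape check (numbers assumed, not from the source): for
        # a 19x19 feature map with num_predictions_per_location = 6,
        # num_anchors = 19 * 19 * 6 = 2166, so the reshaped tensors have shapes
        #   box_encodings:                     [batch_size, 2166, 1, box_code_size]
        #   oriented_box_encodings:            [batch_size, 2166, 1, 4, 2]
        #   class_predictions_with_background: [batch_size, 2166, num_classes + 1]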
        return {
            BOX_ENCODINGS: box_encodings,
            BOX_ENCODINGS_ORIENTED: oriented_box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
  def _predict(self, image_features, num_predictions_per_location_list):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.
      num_predictions_per_location_list: A list of integers representing the
        number of box predictions to be made per spatial location for each
        feature map.

    Returns:
      A dictionary containing the following tensors.
        box_encodings: A float tensor of shape [batch_size, num_anchors, 1,
          code_size] representing the location of the objects, where
          num_anchors = feat_height * feat_width * num_predictions_per_location
        class_predictions_with_background: A float tensor of shape
          [batch_size, num_anchors, num_classes + 1] representing the class
          predictions for the proposals.

    """
    box_encodings_list = []
    class_predictions_list = []
    # TODO: Come up with a better way to generate scope names
    # in box predictor once we have time to retrain all models in the zoo.
    # The following lines create scope names to be backwards compatible with the
    # existing checkpoints.
    box_predictor_scopes = [_NoopVariableScope()]
    if len(image_features) > 1:
      box_predictor_scopes = [
          tf.variable_scope('BoxPredictor_{}'.format(i))
          for i in range(len(image_features))
      ]

    for (image_feature,
         num_predictions_per_location, box_predictor_scope) in zip(
             image_features, num_predictions_per_location_list,
             box_predictor_scopes):
      with box_predictor_scope:
        # Add a slot for the background class.
        num_class_slots = self.num_classes + 1
        net = image_feature
        with slim.arg_scope(self._conv_hyperparams), \
             slim.arg_scope([slim.dropout], is_training=self._is_training):
          # Add additional conv layers before the class predictor.
          features_depth = static_shape.get_depth(image_feature.get_shape())
          depth = max(min(features_depth, self._max_depth), self._min_depth)
          tf.logging.info('depth of additional conv before box predictor: {}'.
                          format(depth))
          if depth > 0 and self._num_layers_before_predictor > 0:
            for i in range(self._num_layers_before_predictor):
              net = slim.conv2d(
                  net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
          with slim.arg_scope([slim.conv2d], activation_fn=None,
                              normalizer_fn=None, normalizer_params=None):
            if self._use_depthwise:
              box_encodings = slim.separable_conv2d(
                  net, None, [self._kernel_size, self._kernel_size],
                  padding='SAME', depth_multiplier=1, stride=1,
                  rate=1, scope='BoxEncodingPredictor_depthwise')
              box_encodings = slim.conv2d(
                  box_encodings,
                  num_predictions_per_location * self._box_code_size, [1, 1],
                  scope='BoxEncodingPredictor')
            else:
              box_encodings = slim.conv2d(
                  net, num_predictions_per_location * self._box_code_size,
                  [self._kernel_size, self._kernel_size],
                  scope='BoxEncodingPredictor')
            if self._use_dropout:
              net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
            if self._use_depthwise:
              class_predictions_with_background = slim.separable_conv2d(
                  net, None, [self._kernel_size, self._kernel_size],
                  padding='SAME', depth_multiplier=1, stride=1,
                  rate=1, scope='ClassPredictor_depthwise')
              class_predictions_with_background = slim.conv2d(
                  class_predictions_with_background,
                  num_predictions_per_location * num_class_slots,
                  [1, 1], scope='ClassPredictor')
            else:
              class_predictions_with_background = slim.conv2d(
                  net, num_predictions_per_location * num_class_slots,
                  [self._kernel_size, self._kernel_size],
                  scope='ClassPredictor',
                  biases_initializer=tf.constant_initializer(
                      self._class_prediction_bias_init))
            if self._apply_sigmoid_to_scores:
              class_predictions_with_background = tf.sigmoid(
                  class_predictions_with_background)

        combined_feature_map_shape = (
            shape_utils.combined_static_and_dynamic_shape(image_feature))
        box_encodings = tf.reshape(
            box_encodings, tf.stack([combined_feature_map_shape[0],
                                     combined_feature_map_shape[1] *
                                     combined_feature_map_shape[2] *
                                     num_predictions_per_location,
                                     1, self._box_code_size]))
        box_encodings_list.append(box_encodings)
        class_predictions_with_background = tf.reshape(
            class_predictions_with_background,
            tf.stack([combined_feature_map_shape[0],
                      combined_feature_map_shape[1] *
                      combined_feature_map_shape[2] *
                      num_predictions_per_location,
                      num_class_slots]))
        class_predictions_list.append(class_predictions_with_background)
    return {BOX_ENCODINGS: tf.concat(box_encodings_list, axis=1),
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            tf.concat(class_predictions_list, axis=1)}
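  # Illustrative shape check (numbers assumed, not from the source): with two
  # feature maps of spatial size 19x19 and 10x10 and
  # num_predictions_per_location_list = [3, 6], the per-map anchor counts are
  # 19 * 19 * 3 = 1083 and 10 * 10 * 6 = 600, so after the axis=1 concat the
  # returned tensors have shapes
  #   BOX_ENCODINGS:                     [batch_size, 1683, 1, box_code_size]
  #   CLASS_PREDICTIONS_WITH_BACKGROUND: [batch_size, 1683, num_classes + 1]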
def test_return_correct_depth(self):
  tensor_shape = tf.TensorShape(dims=[32, 299, 384, 3])
  self.assertEqual(3, static_shape.get_depth(tensor_shape))
    def _predict(self, image_features, num_predictions_per_location_list):
        """Computes encoded object locations and corresponding confidences.

        Args:
          image_features: A list of float tensors of shape [batch_size, height_i,
            width_i, channels_i] containing features for a batch of images.
          num_predictions_per_location_list: A list of integers representing the
            number of box predictions to be made per spatial location for each
            feature map.

        Returns:
          A dictionary containing:
            box_encodings: A list of float tensors of shape
              [batch_size, num_anchors_i, q, code_size] representing the location of
              the objects, where q is 1 or the number of classes. Each entry in the
              list corresponds to a feature map in the input `image_features` list.
            class_predictions_with_background: A list of float tensors of shape
              [batch_size, num_anchors_i, num_classes + 1] representing the class
              predictions for the proposals. Each entry in the list corresponds to a
              feature map in the input `image_features` list.
            (optional) Predictions from other heads.
        """
        predictions = {
            BOX_ENCODINGS: [],
            CLASS_PREDICTIONS_WITH_BACKGROUND: [],
        }
        for head_name in self._other_heads.keys():
            predictions[head_name] = []
        # TODO(rathodv): Come up with a better way to generate scope names
        # in box predictor once we have time to retrain all models in the zoo.
        # The following lines create scope names to be backwards compatible with the
        # existing checkpoints.
        box_predictor_scopes = [_NoopVariableScope()]
        if len(image_features) > 1:
            box_predictor_scopes = [
                tf.variable_scope('BoxPredictor_{}'.format(i))
                for i in range(len(image_features))
            ]
        for (image_feature, num_predictions_per_location,
             box_predictor_scope) in zip(image_features,
                                         num_predictions_per_location_list,
                                         box_predictor_scopes):
            net = image_feature
            with box_predictor_scope:
                with slim.arg_scope(self._conv_hyperparams_fn()):
                    with slim.arg_scope([slim.dropout],
                                        is_training=self._is_training):
                        # Add additional conv layers before the class predictor.
                        features_depth = static_shape.get_depth(
                            image_feature.get_shape())
                        depth = max(min(features_depth, self._max_depth),
                                    self._min_depth)
                        tf.logging.info(
                            'depth of additional conv before box predictor: {}'
                            .format(depth))
                        if depth > 0 and self._num_layers_before_predictor > 0:
                            for i in range(self._num_layers_before_predictor):
                                net = slim.conv2d(net,
                                                  depth, [1, 1],
                                                  reuse=tf.AUTO_REUSE,
                                                  scope='Conv2d_%d_1x1_%d' %
                                                  (i, depth))
                        sorted_keys = sorted(self._other_heads.keys())
                        sorted_keys.append(BOX_ENCODINGS)
                        sorted_keys.append(CLASS_PREDICTIONS_WITH_BACKGROUND)
                        for head_name in sorted_keys:
                            if head_name == BOX_ENCODINGS:
                                head_obj = self._box_prediction_head
                            elif head_name == CLASS_PREDICTIONS_WITH_BACKGROUND:
                                head_obj = self._class_prediction_head
                            else:
                                head_obj = self._other_heads[head_name]
                            prediction = head_obj.predict(
                                features=net,
                                num_predictions_per_location=(
                                    num_predictions_per_location))
                            predictions[head_name].append(prediction)
        return predictions
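
# Illustrative caller-side sketch (assumed, not part of the source): _predict
# above returns one list per head, with one tensor per feature map; a caller
# can collapse each list into a single tensor by concatenating along the
# anchor dimension, as the non-list convolutional predictors do internally.
def concat_predictions(predictions):
    return {
        head_name: tf.concat(tensor_list, axis=1)
        for head_name, tensor_list in predictions.items()
    }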