Example 1
  def test_raise_value_error_on_num_bins_less_than_one(self):
    num_spatial_bins = [1, -1]
    image_shape = [1, 1, 1, 2]
    crop_size = [2, 2]

    image = tf.constant(1, dtype=tf.float32, shape=image_shape)
    boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
    box_ind = tf.constant([0], dtype=tf.int32)

    with self.assertRaisesRegexp(ValueError, 'num_spatial_bins should be >= 1'):
      ops.position_sensitive_crop_regions(
          image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
Example 2
  def test_raise_value_error_on_non_square_block_size(self):
    num_spatial_bins = [3, 2]
    image_shape = [1, 3, 2, 6]
    crop_size = [6, 2]

    image = tf.constant(1, dtype=tf.float32, shape=image_shape)
    boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
    box_ind = tf.constant([0], dtype=tf.int32)

    with self.assertRaisesRegexp(
        ValueError, 'Only support square bin crop size for now.'):
      ops.position_sensitive_crop_regions(
          image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=False)
Example 3
  def test_raise_value_error_on_non_divisible_crop_size(self):
    num_spatial_bins = [2, 3]
    image_shape = [1, 1, 1, 6]
    crop_size = [3, 2]

    image = tf.constant(1, dtype=tf.float32, shape=image_shape)
    boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
    box_ind = tf.constant([0], dtype=tf.int32)

    with self.assertRaisesRegexp(
        ValueError, 'crop_size should be divisible by num_spatial_bins'):
      ops.position_sensitive_crop_regions(
          image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
Example 4
  def test_raise_value_error_on_non_divisible_num_channels(self):
    num_spatial_bins = [2, 2]
    image_shape = [1, 1, 1, 5]
    crop_size = [2, 2]

    image = tf.constant(1, dtype=tf.float32, shape=image_shape)
    boxes = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)
    box_ind = tf.constant([0], dtype=tf.int32)

    with self.assertRaisesRegexp(
        ValueError, 'Dimension size must be evenly divisible by 4 but is 5'):
      ops.position_sensitive_crop_regions(
          image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
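The error messages in the four tests above come from argument validation inside ops.position_sensitive_crop_regions. Below is a minimal sketch of that validation, reconstructed from the tests alone (it is not the library source; in the real op the channel-divisibility error surfaces from tf.split during graph construction):

def validate_ps_crop_args(crop_size, num_spatial_bins, num_channels):
  # Each spatial dimension must use at least one bin, and the crop size
  # must split evenly across the bins.
  total_bins = 1
  bin_crop_size = []
  for num_bins, crop_dim in zip(num_spatial_bins, crop_size):
    if num_bins < 1:
      raise ValueError('num_spatial_bins should be >= 1')
    if crop_dim % num_bins != 0:
      raise ValueError('crop_size should be divisible by num_spatial_bins')
    total_bins *= num_bins
    bin_crop_size.append(crop_dim // num_bins)
  # Each bin is cropped and resized independently; only square bins are
  # supported.
  if bin_crop_size[0] != bin_crop_size[1]:
    raise ValueError('Only support square bin crop size for now.')
  # One channel group per bin, so the channel count must divide evenly.
  if num_channels % total_bins != 0:
    raise ValueError('Dimension size must be evenly divisible by %d but is %d'
                     % (total_bins, num_channels))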
Example 5
  def test_position_sensitive_with_equal_channels(self):
    num_spatial_bins = [2, 2]
    image_shape = [1, 3, 3, 4]
    crop_size = [2, 2]

    image = tf.constant(list(range(1, 3 * 3 + 1)), dtype=tf.float32,
                        shape=[1, 3, 3, 1])
    tiled_image = tf.tile(image, [1, 1, 1, image_shape[3]])
    boxes = tf.random_uniform((3, 4))
    box_ind = tf.constant([0, 0, 0], dtype=tf.int32)

    # All channels are equal, so position-sensitive crop and resize should
    # behave like the usual crop and resize on a single channel.
    crop = tf.image.crop_and_resize(image, boxes, box_ind, crop_size)
    crop_and_pool = tf.reduce_mean(crop, [1, 2], keep_dims=True)

    ps_crop_and_pool = ops.position_sensitive_crop_regions(
        tiled_image,
        boxes,
        box_ind,
        crop_size,
        num_spatial_bins,
        global_pool=True)

    with self.test_session() as sess:
      expected_output, output = sess.run((crop_and_pool, ps_crop_and_pool))
      self.assertAllClose(output, expected_output)
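A note on the bookkeeping this test relies on: with num_spatial_bins = [2, 2] there are four bins, and the four tiled channels are assumed to split one per bin, so every bin sees the same single-channel image as the plain crop above. A sketch with the test's own numbers:

num_spatial_bins = [2, 2]
num_channels = 4  # image_shape[3]
total_bins = num_spatial_bins[0] * num_spatial_bins[1]  # 4 bins
channels_per_bin = num_channels // total_bins           # 1 channel per bin
assert num_channels % total_bins == 0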
Example 6
  def test_position_sensitive_with_global_pool_false_and_single_bin(self):
    num_spatial_bins = [1, 1]
    image_shape = [2, 3, 3, 4]
    crop_size = [1, 1]

    image = tf.random_uniform(image_shape)
    boxes = tf.random_uniform((6, 4))
    box_ind = tf.constant([0, 0, 0, 1, 1, 1], dtype=tf.int32)

    # Since a single bin is used and crop_size = [1, 1] (i.e., no crop resize),
    # the outputs are identical regardless of the global_pool value.
    ps_crop_and_pool = ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)
    ps_crop = ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=False)

    with self.test_session() as sess:
      pooled_output, unpooled_output = sess.run((ps_crop_and_pool, ps_crop))
      self.assertAllClose(pooled_output, unpooled_output)
Example 7
  def test_position_sensitive_with_global_pool_false_and_do_global_pool(self):
    num_spatial_bins = [3, 2]
    image_shape = [1, 3, 2, 6]
    num_boxes = 2

    # First channel is 1's, second channel is 2's, etc.
    image = tf.constant(list(range(1, 3 * 2 + 1)) * 6, dtype=tf.float32,
                        shape=image_shape)
    boxes = tf.random_uniform((num_boxes, 4))
    box_ind = tf.constant([0, 0], dtype=tf.int32)

    expected_output = []

    # Expected output when crop_size = [3, 2].
    expected_output.append(np.mean(
        np.expand_dims(
            np.tile(np.array([[1, 2],
                              [3, 4],
                              [5, 6]]), (num_boxes, 1, 1)),
            axis=-1),
        axis=(1, 2), keepdims=True))

    # Expected output when crop_size = [6, 4].
    expected_output.append(np.mean(
        np.expand_dims(
            np.tile(np.array([[1, 1, 2, 2],
                              [1, 1, 2, 2],
                              [3, 3, 4, 4],
                              [3, 3, 4, 4],
                              [5, 5, 6, 6],
                              [5, 5, 6, 6]]), (num_boxes, 1, 1)),
            axis=-1),
        axis=(1, 2), keepdims=True))

    for crop_size_mult in range(1, 3):
      crop_size = [3 * crop_size_mult, 2 * crop_size_mult]

      # Perform global pooling after running the function with
      # global_pool=False.
      ps_crop = ops.position_sensitive_crop_regions(
          image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=False)
      ps_crop_and_pool = tf.reduce_mean(
          ps_crop, reduction_indices=(1, 2), keep_dims=True)

      with self.test_session() as sess:
        output = sess.run(ps_crop_and_pool)

      self.assertAllEqual(output, expected_output[crop_size_mult - 1])
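Both crop sizes pool to the same per-box value, which is why the two expected arrays agree after averaging; a quick numpy sanity check of the two patterns above:

import numpy as np

small = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
big = np.repeat(np.repeat(small, 2, axis=0), 2, axis=1)  # the [6, 4] pattern
print(small.mean(), big.mean())  # 3.5 3.5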
Example 8
  def test_position_sensitive_with_single_bin(self):
    num_spatial_bins = [1, 1]
    image_shape = [2, 3, 3, 4]
    crop_size = [2, 2]

    image = tf.random_uniform(image_shape)
    boxes = tf.random_uniform((6, 4))
    box_ind = tf.constant([0, 0, 0, 1, 1, 1], dtype=tf.int32)

    # When a single bin is used, position-sensitive crop and pool should be
    # the same as non-position-sensitive crop and pool.
    crop = tf.image.crop_and_resize(image, boxes, box_ind, crop_size)
    crop_and_pool = tf.reduce_mean(crop, [1, 2], keep_dims=True)

    ps_crop_and_pool = ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)

    with self.test_session() as sess:
      expected_output, output = sess.run((crop_and_pool, ps_crop_and_pool))
      self.assertAllClose(output, expected_output)
Example 9
  def test_position_sensitive(self):
    num_spatial_bins = [3, 2]
    image_shape = [1, 3, 2, 6]

    # First channel is 1's, second channel is 2's, etc.
    image = tf.constant(list(range(1, 3 * 2 + 1)) * 6, dtype=tf.float32,
                        shape=image_shape)
    boxes = tf.random_uniform((2, 4))
    box_ind = tf.constant([0, 0], dtype=tf.int32)

    # The result for both boxes should be [[1, 2], [3, 4], [5, 6]]
    # before averaging.
    expected_output = np.array([3.5, 3.5]).reshape([2, 1, 1, 1])

    for crop_size_mult in range(1, 3):
      crop_size = [3 * crop_size_mult, 2 * crop_size_mult]
      ps_crop_and_pool = ops.position_sensitive_crop_regions(
          image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=True)

      with self.test_session() as sess:
        output = sess.run(ps_crop_and_pool)
        self.assertAllClose(output, expected_output)
Example 10
  def test_position_sensitive_with_global_pool_false_and_known_boxes(self):
    num_spatial_bins = [2, 2]
    image_shape = [2, 2, 2, 4]
    crop_size = [2, 2]

    image = tf.constant(list(range(1, 2 * 2 * 4 + 1)) * 2, dtype=tf.float32,
                        shape=image_shape)

    # The first box covers the whole image; the second covers only the
    # first row.
    boxes = tf.constant(np.array([[0., 0., 1., 1.],
                                  [0., 0., 0.5, 1.]]), dtype=tf.float32)
    box_ind = tf.constant([0, 1], dtype=tf.int32)

    expected_output = []

    # Expected output for the box covering the whole image.
    expected_output.append(
        np.reshape(np.array([[4, 7],
                             [10, 13]]),
                   (1, 2, 2, 1))
    )

    # Expected output for the box covering only the first row.
    expected_output.append(
        np.reshape(np.array([[3, 6],
                             [7, 10]]),
                   (1, 2, 2, 1))
    )
    expected_output = np.concatenate(expected_output, axis=0)

    ps_crop = ops.position_sensitive_crop_regions(
        image, boxes, box_ind, crop_size, num_spatial_bins, global_pool=False)

    with self.test_session() as sess:
      output = sess.run(ps_crop)
      self.assertAllEqual(output, expected_output)
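The first entry of expected_output can be checked by hand, assuming the bilinear center sampling that tf.image.crop_and_resize performs for a 1x1 crop (each per-bin crop here is 1x1, since crop_size = [2, 2] and num_spatial_bins = [2, 2]):

import numpy as np

# Channel 0 of the first image: value(h, w) = h * 8 + w * 4 + 1.
ch0 = np.array([[1., 5.], [9., 13.]])
# Bin (0, 0) of the whole-image box is the sub-box [0, 0, 0.5, 0.5]; its 1x1
# crop samples the sub-box center (y, x) = (0.25, 0.25) bilinearly.
y, x = 0.25, 0.25
print((1 - y) * (1 - x) * ch0[0, 0] + (1 - y) * x * ch0[0, 1] +
      y * (1 - x) * ch0[1, 0] + y * x * ch0[1, 1])  # 4.0, matching [0, 0, 0]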
Example 11
    def _predict(self, image_features, num_predictions_per_location,
                 proposal_boxes):
        """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A float tensor of shape [batch_size, height, width,
        channels] containing features for a batch of images.
      num_predictions_per_location: an integer representing the number of box
        predictions to be made per spatial location in the feature map.
        Currently, this must be set to 1, or an error will be raised.
      proposal_boxes: A float tensor of shape [batch_size, num_proposals,
        box_code_size].

    Returns:
      box_encodings: A float tensor of shape
        [batch_size, 1, num_classes, code_size] representing the
        location of the objects.
      class_predictions_with_background: A float tensor of shape
        [batch_size, 1, num_classes + 1] representing the class
        predictions for the proposals.
    Raises:
      ValueError: if num_predictions_per_location is not 1.
    """
        if num_predictions_per_location != 1:
            raise ValueError('Currently RfcnBoxPredictor only supports '
                             'predicting a single box per class per location.')

        batch_size = tf.shape(proposal_boxes)[0]
        num_boxes = tf.shape(proposal_boxes)[1]

        def get_box_indices(proposals):
            proposals_shape = proposals.get_shape().as_list()
            if any(dim is None for dim in proposals_shape):
                proposals_shape = tf.shape(proposals)
            ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
            multiplier = tf.expand_dims(
                tf.range(start=0, limit=proposals_shape[0]), 1)
            return tf.reshape(ones_mat * multiplier, [-1])

        net = image_features
        with slim.arg_scope(self._conv_hyperparams):
            net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth')
            # Location predictions.
            location_feature_map_depth = (self._num_spatial_bins[0] *
                                          self._num_spatial_bins[1] *
                                          self.num_classes *
                                          self._box_code_size)
            location_feature_map = slim.conv2d(net,
                                               location_feature_map_depth,
                                               [1, 1],
                                               activation_fn=None,
                                               scope='refined_locations')
            box_encodings = ops.position_sensitive_crop_regions(
                location_feature_map,
                boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]),
                box_ind=get_box_indices(proposal_boxes),
                crop_size=self._crop_size,
                num_spatial_bins=self._num_spatial_bins,
                global_pool=True)
            box_encodings = tf.squeeze(box_encodings, squeeze_dims=[1, 2])
            box_encodings = tf.reshape(box_encodings, [
                batch_size * num_boxes, 1, self.num_classes,
                self._box_code_size
            ])

            # Class predictions.
            total_classes = self.num_classes + 1  # Account for background class.
            class_feature_map_depth = (self._num_spatial_bins[0] *
                                       self._num_spatial_bins[1] *
                                       total_classes)
            class_feature_map = slim.conv2d(net,
                                            class_feature_map_depth, [1, 1],
                                            activation_fn=None,
                                            scope='class_predictions')
            class_predictions_with_background = ops.position_sensitive_crop_regions(
                class_feature_map,
                boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]),
                box_ind=get_box_indices(proposal_boxes),
                crop_size=self._crop_size,
                num_spatial_bins=self._num_spatial_bins,
                global_pool=True)
            class_predictions_with_background = tf.squeeze(
                class_predictions_with_background, squeeze_dims=[1, 2])
            class_predictions_with_background = tf.reshape(
                class_predictions_with_background,
                [batch_size * num_boxes, 1, total_classes])

        return {
            BOX_ENCODINGS: box_encodings,
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            class_predictions_with_background
        }
Example 12
  def _predict(self, image_features, num_predictions_per_location,
               proposal_boxes):
    """Computes encoded object locations and corresponding confidences.

    Args:
      image_features: A list of float tensors of shape [batch_size, height_i,
        width_i, channels_i] containing features for a batch of images.
      num_predictions_per_location: A list of integers representing the number
        of box predictions to be made per spatial location for each feature map.
        Currently, this must be set to [1], or an error will be raised.
      proposal_boxes: A float tensor of shape [batch_size, num_proposals,
        box_code_size].

    Returns:
      box_encodings: A list of float tensors of shape
        [batch_size, num_anchors_i, q, code_size] representing the location of
        the objects, where q is 1 or the number of classes. Each entry in the
        list corresponds to a feature map in the input `image_features` list.
      class_predictions_with_background: A list of float tensors of shape
        [batch_size, num_anchors_i, num_classes + 1] representing the class
        predictions for the proposals. Each entry in the list corresponds to a
        feature map in the input `image_features` list.

    Raises:
      ValueError: if num_predictions_per_location is not 1 or if
        len(image_features) is not 1.
    """
    if (len(num_predictions_per_location) != 1 or
        num_predictions_per_location[0] != 1):
      raise ValueError('Currently RfcnBoxPredictor only supports '
                       'predicting a single box per class per location.')
    if len(image_features) != 1:
      raise ValueError('length of `image_features` must be 1. Found {}'.
                       format(len(image_features)))
    image_feature = image_features[0]
    num_predictions_per_location = num_predictions_per_location[0]
    batch_size = tf.shape(proposal_boxes)[0]
    num_boxes = tf.shape(proposal_boxes)[1]
    def get_box_indices(proposals):
      proposals_shape = proposals.get_shape().as_list()
      if any(dim is None for dim in proposals_shape):
        proposals_shape = tf.shape(proposals)
      ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
      multiplier = tf.expand_dims(
          tf.range(start=0, limit=proposals_shape[0]), 1)
      return tf.reshape(ones_mat * multiplier, [-1])

    net = image_feature
    with slim.arg_scope(self._conv_hyperparams):
      net = slim.conv2d(net, self._depth, [1, 1], scope='reduce_depth')
      # Location predictions.
      location_feature_map_depth = (self._num_spatial_bins[0] *
                                    self._num_spatial_bins[1] *
                                    self.num_classes *
                                    self._box_code_size)
      location_feature_map = slim.conv2d(net, location_feature_map_depth,
                                         [1, 1], activation_fn=None,
                                         scope='refined_locations')
      box_encodings = ops.position_sensitive_crop_regions(
          location_feature_map,
          boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]),
          box_ind=get_box_indices(proposal_boxes),
          crop_size=self._crop_size,
          num_spatial_bins=self._num_spatial_bins,
          global_pool=True)
      box_encodings = tf.squeeze(box_encodings, squeeze_dims=[1, 2])
      box_encodings = tf.reshape(box_encodings,
                                 [batch_size * num_boxes, 1, self.num_classes,
                                  self._box_code_size])

      # Class predictions.
      total_classes = self.num_classes + 1  # Account for background class.
      class_feature_map_depth = (self._num_spatial_bins[0] *
                                 self._num_spatial_bins[1] *
                                 total_classes)
      class_feature_map = slim.conv2d(net, class_feature_map_depth, [1, 1],
                                      activation_fn=None,
                                      scope='class_predictions')
      class_predictions_with_background = ops.position_sensitive_crop_regions(
          class_feature_map,
          boxes=tf.reshape(proposal_boxes, [-1, self._box_code_size]),
          box_ind=get_box_indices(proposal_boxes),
          crop_size=self._crop_size,
          num_spatial_bins=self._num_spatial_bins,
          global_pool=True)
      class_predictions_with_background = tf.squeeze(
          class_predictions_with_background, squeeze_dims=[1, 2])
      class_predictions_with_background = tf.reshape(
          class_predictions_with_background,
          [batch_size * num_boxes, 1, total_classes])

    return {BOX_ENCODINGS: [box_encodings],
            CLASS_PREDICTIONS_WITH_BACKGROUND:
            [class_predictions_with_background]}
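The feature-map depth arithmetic above follows R-FCN: every (spatial bin, class, box coordinate) combination gets its own score-map channel, and position-sensitive cropping then reads each bin from its own channel group. A quick sketch with illustrative numbers (3x3 bins, 90 classes, and 4 box coordinates are hypothetical values, not taken from the code above):

num_spatial_bins = [3, 3]
num_classes = 90
box_code_size = 4
location_depth = (num_spatial_bins[0] * num_spatial_bins[1] *
                  num_classes * box_code_size)  # 3240 channels
class_depth = (num_spatial_bins[0] * num_spatial_bins[1] *
               (num_classes + 1))               # 819 channels, incl. background
print(location_depth, class_depth)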