def _build(self, image, gt_boxes=None, is_training=False):

        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)

        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(
            tf.expand_dims(image, 0), is_training=is_training
        )

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors, self._config.model.rpn,
            debug=self._debug, seed=self._seed
        )
        if self._with_rcnn:
            self._rcnn = RCNN(
                self._num_classes, self._config.model.rcnn,
                debug=self._debug, seed=self._seed
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(
            conv_feature_map, 'conv_feature_map', 'reduced'
        )

        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(
            conv_feature_map, image_shape, all_anchors,
            gt_boxes=gt_boxes, is_training=is_training
        )

        prediction_dict = {
            'rpn_prediction': rpn_prediction,
        }

        if self._debug:
            prediction_dict['image'] = image
            prediction_dict['image_shape'] = image_shape
            prediction_dict['all_anchors'] = all_anchors
            prediction_dict['anchor_reference'] = tf.convert_to_tensor(
                self._anchor_reference
            )
            if gt_boxes is not None:
                prediction_dict['gt_boxes'] = gt_boxes
            prediction_dict['conv_feature_map'] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction['proposals'])
            classification_pred = self._rcnn(
                conv_feature_map, proposals,
                image_shape, self.base_network,
                gt_boxes=gt_boxes, is_training=is_training
            )

            prediction_dict['classification_prediction'] = classification_pred

        return prediction_dict
Example #2
    def setUp(self):
        tf.reset_default_graph()

        self._num_classes = 5
        self._num_proposals = 256
        self._total_num_gt = 128
        self._image_shape = (600, 800)
        # The score we'll give to the true labels when testing for perfect
        # score generation.
        self._high_score = 100

        self._equality_delta = 1e-03

        self._config = EasyDict({
            'enabled': True,
            'layer_sizes': [4096, 4096],
            'dropout_keep_prop': 1.0,
            'activation_function': 'relu6',
            'use_mean': False,
            'initializer': {
                'type': 'variance_scaling_initializer',
                'factor': 1.0,
                'uniform': True,
                'mode': 'FAN_AVG',
            },
            'l2_regularization_scale': 0.0005,
            'roi': {
                'pooling_mode': 'crop',
                'pooled_width': 7,
                'pooled_height': 7,
                'padding': 'VALID',
            },
            'proposals': {
                'class_max_detections': 100,
                'class_nms_threshold': 0.6,
                'total_max_detections': 300,
                'min_prob_threshold': 0.0,
            },
            'target': {
                'foreground_fraction': 0.25,
                'minibatch_size': 64,
                'foreground_threshold': 0.5,
                'background_threshold_high': 0.5,
                'background_threshold_low': 0.1,
            },
        })

        self._shared_model = RCNN(self._num_classes, self._config)

        # Declare placeholders
        # We use the '_ph' suffix for placeholders.
        self._pretrained_feature_map_shape = (self._num_proposals,
                                              self._config.roi.pooled_width,
                                              self._config.roi.pooled_height,
                                              4)
        self._pretrained_feature_map_ph = tf.placeholder(
            tf.float32, shape=self._pretrained_feature_map_shape)

        self._proposals_shape = (self._num_proposals, 5)
        self._proposals_ph = tf.placeholder(tf.float32,
                                            shape=self._proposals_shape)

        self._image_shape_shape = (2, )
        self._image_shape_ph = tf.placeholder(tf.float32,
                                              shape=self._image_shape_shape)

        self._gt_boxes_shape = (self._total_num_gt, 5)
        self._gt_boxes_ph = tf.placeholder(tf.float32,
                                           shape=self._gt_boxes_shape)
Example #3
class RCNNTest(tf.test.TestCase):
    def setUp(self):
        tf.reset_default_graph()

        self._num_classes = 5
        self._num_proposals = 256
        self._total_num_gt = 128
        self._image_shape = (600, 800)
        # The score we'll give to the true labels when testing for perfect
        # score generation.
        self._high_score = 100

        self._equality_delta = 1e-03

        self._config = EasyDict({
            'enabled': True,
            'layer_sizes': [4096, 4096],
            'dropout_keep_prop': 1.0,
            'activation_function': 'relu6',
            'use_mean': False,
            'initializer': {
                'type': 'variance_scaling_initializer',
                'factor': 1.0,
                'uniform': True,
                'mode': 'FAN_AVG',
            },
            'l2_regularization_scale': 0.0005,
            'roi': {
                'pooling_mode': 'crop',
                'pooled_width': 7,
                'pooled_height': 7,
                'padding': 'VALID',
            },
            'proposals': {
                'class_max_detections': 100,
                'class_nms_threshold': 0.6,
                'total_max_detections': 300,
                'min_prob_threshold': 0.0,
            },
            'target': {
                'foreground_fraction': 0.25,
                'minibatch_size': 64,
                'foreground_threshold': 0.5,
                'background_threshold_high': 0.5,
                'background_threshold_low': 0.1,
            },
        })

        self._shared_model = RCNN(self._num_classes, self._config)

        # Declare placeholders
        # We use the '_ph' suffix for placeholders.
        self._pretrained_feature_map_shape = (self._num_proposals,
                                              self._config.roi.pooled_width,
                                              self._config.roi.pooled_height,
                                              4)
        self._pretrained_feature_map_ph = tf.placeholder(
            tf.float32, shape=self._pretrained_feature_map_shape)

        self._proposals_shape = (self._num_proposals, 5)
        self._proposals_ph = tf.placeholder(tf.float32,
                                            shape=self._proposals_shape)

        self._image_shape_shape = (2, )
        self._image_shape_ph = tf.placeholder(tf.float32,
                                              shape=self._image_shape_shape)

        self._gt_boxes_shape = (self._total_num_gt, 5)
        self._gt_boxes_ph = tf.placeholder(tf.float32,
                                           shape=self._gt_boxes_shape)

    def _run_net_with_feed_dict(self, net, feed_dict):
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(net, feed_dict=feed_dict)

    def _check_returning_shapes(self, prediction_dict, training=False):
        """Asserts a prediction_dict has the right shapes.

        This includes testing that:
            - objects, objects_labels and objects_labels_prob have the same
                shape in the first dimension. (i.e. the same number of
                objects).
            - objects has shape (_, 4). objects_labels and objects_labels_prob
                have shape (_,).
            - cls_score and cls_prob have shape (num_proposals,
                num_classes + 1).
            - bbox_offsets has shape (num_proposals, num_classes * 4).

        And, if training:
            - cls_target has shape (num_proposals,).
            - bbox_offsets_target has shape (num_proposals, 4).
        """

        objects_shape = prediction_dict['objects'].shape
        objects_labels_shape = prediction_dict['labels'].shape
        objects_labels_prob_shape = prediction_dict['probs'].shape

        cls_score_shape = prediction_dict['rcnn']['cls_score'].shape
        cls_prob_shape = prediction_dict['rcnn']['cls_prob'].shape

        bbox_offsets_shape = prediction_dict['rcnn']['bbox_offsets'].shape

        # We choose cls_score as the 'standard' num_proposals which we will
        # compare to the other shapes that should include num_proposals. We
        # could have chosen a different one.
        num_proposals = cls_score_shape[0]

        self.assertEqual(objects_shape[0], objects_labels_shape[0])
        self.assertEqual(objects_shape[0], objects_labels_prob_shape[0])

        self.assertEqual(objects_shape[1], 4)
        self.assertEqual(len(objects_labels_shape), 1)
        self.assertEqual(len(objects_labels_prob_shape), 1)

        self.assertEqual(cls_score_shape, cls_prob_shape)
        self.assertEqual(cls_prob_shape,
                         (num_proposals, self._num_classes + 1))

        self.assertEqual(bbox_offsets_shape,
                         (num_proposals, self._num_classes * 4))

        if training:
            cls_target_shape = prediction_dict['target']['cls'].shape
            self.assertEqual(cls_target_shape, (num_proposals, ))

            bbox_offsets_trgt_shape = (
                prediction_dict['target']['bbox_offsets'].shape)
            self.assertEqual(bbox_offsets_trgt_shape, (num_proposals, 4))

    def testReturningShapes(self):
        """Tests we're returning consistent shapes.

        We test both the case where we're training and the case where we are
        not.
        """

        # Prediction session (not training)
        rcnn_net_not_training = self._shared_model(
            self._pretrained_feature_map_ph, self._proposals_ph,
            self._image_shape_ph)

        prediction_dict_not_training = self._run_net_with_feed_dict(
            rcnn_net_not_training,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
            })
        # Training session
        rcnn_net_training = self._shared_model(self._pretrained_feature_map_ph,
                                               self._proposals_ph,
                                               self._image_shape_ph,
                                               self._gt_boxes_ph)
        prediction_dict_training = self._run_net_with_feed_dict(
            rcnn_net_training,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
                self._gt_boxes_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape,
                ),
            })
        # Assertions
        self._check_returning_shapes(prediction_dict_not_training)
        self._check_returning_shapes(prediction_dict_training, training=True)

    def testMinibatchBehaviour(self):
        """Tests we're using minibatch_size correctly when testing.
        """

        rcnn_net = self._shared_model(self._pretrained_feature_map_ph,
                                      self._proposals_ph, self._image_shape_ph,
                                      self._gt_boxes_ph)

        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
                self._gt_boxes_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape,
                ),
            })
        # Assertions
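        # Targets >= 0 are the proposals that were actually sampled for the
        # minibatch (the rest are ignored), so there should never be more of
        # them than the configured minibatch_size.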
        self.assertLessEqual(
            prediction_dict['target']['cls'][
                prediction_dict['target']['cls'] >= 0].shape[0],
            self._config.target.minibatch_size,
        )

    def testNumberOfObjects(self):
        """Tests we're not returning more objects than we get proposals.
        """

        rcnn_net = self._shared_model(self._pretrained_feature_map_ph,
                                      self._proposals_ph, self._image_shape_ph)

        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
            })
        # Assertions
        self.assertLessEqual(prediction_dict['objects'].shape[0],
                             self._num_proposals)

    def testLoss(self):
        """Tests we're computing loss correctly.

        In particular, we test that a perfect score is computed when it
        should be.
        """

        # Generate placeholders and loss_graph
        cls_score_shape = (self._num_proposals, self._num_classes + 1)
        cls_score_ph = tf.placeholder(tf.float32, cls_score_shape)

        cls_prob_shape = (self._num_proposals, self._num_classes + 1)
        cls_prob_ph = tf.placeholder(tf.float32, cls_prob_shape)

        cls_target_shape = (self._num_proposals, )
        cls_target_ph = tf.placeholder(tf.float32, cls_target_shape)

        bbox_offsets_shape = (self._num_proposals, self._num_classes * 4)
        bbox_offsets_ph = tf.placeholder(tf.float32, bbox_offsets_shape)

        bbox_offsets_target_shape = (self._num_proposals, 4)
        bbox_offsets_target_ph = tf.placeholder(tf.float32,
                                                bbox_offsets_target_shape)

        loss_graph = self._shared_model.loss({
            'rcnn': {
                'cls_score': cls_score_ph,
                'cls_prob': cls_prob_ph,
                'bbox_offsets': bbox_offsets_ph,
            },
            'target': {
                'cls': cls_target_ph,
                'bbox_offsets': bbox_offsets_target_ph,
            }
        })

        # Generate values that ensure a perfect score
        # We first initialize all our values to zero.
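        # For each proposal we then put a high logit (and probability 1) on a
        # randomly chosen foreground class and copy the same offsets into both
        # the prediction and the target, so both losses should be ~0.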
        cls_score = np.zeros(cls_score_shape, dtype=np.float32)
        cls_prob = np.zeros(cls_prob_shape, dtype=np.float32)
        cls_target = np.zeros(cls_target_shape, dtype=np.float32)
        bbox_offsets = np.zeros(bbox_offsets_shape, dtype=np.float32)
        bbox_offsets_target = np.zeros(bbox_offsets_target_shape,
                                       dtype=np.float32)
        for i in range(self._num_proposals):
            this_class = np.random.randint(low=1, high=self._num_classes + 1)

            cls_score[i][this_class] = self._high_score
            cls_prob[i][this_class] = 1.
            cls_target[i] = this_class

            # Find out where along axis 1 of bbox_offsets we should
            # put the offsets, because the shape is
            # (num_proposals, num_classes * 4), and we're using
            # 1-indexed classes.
            class_place = (this_class - 1) * 4
            for j in range(4):
                this_coord = np.random.randint(low=0,
                                               high=np.amax(self._image_shape))

                bbox_offsets[i][class_place + j] = this_coord
                bbox_offsets_target[i][j] = this_coord
        # Now get the loss dict using the values we just generated.
        loss_dict = self._run_net_with_feed_dict(loss_graph,
                                                 feed_dict={
                                                     cls_score_ph:
                                                     cls_score,
                                                     cls_prob_ph:
                                                     cls_prob,
                                                     cls_target_ph:
                                                     cls_target,
                                                     bbox_offsets_ph:
                                                     bbox_offsets,
                                                     bbox_offsets_target_ph:
                                                     bbox_offsets_target,
                                                 })
        # Assertions
        self.assertAlmostEqual(loss_dict['rcnn_cls_loss'],
                               0,
                               delta=self._equality_delta)
        self.assertAlmostEqual(loss_dict['rcnn_reg_loss'],
                               0,
                               delta=self._equality_delta)
Example #4
    def _build(self, image, gt_boxes=None, is_training=False):
        """
        Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that image.
                Its shape should be `(num_gt_boxes, 5)`
                Where for each gt box we have (x1, y1, x2, y2, label),
                in that order.
            is_training: A boolean indicating whether the network is being
                used for training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the bboxes
                we have (x1, y1, x2, y2).
        """
        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)
        # A Tensor with the feature map for the image,
        # its shape should be `(feature_height, feature_width, 512)`.
        # The shape depends on the pretrained network in use.

        # Set rank and last dimension before using base network
        # TODO: Why does it lose information when using a queue?
        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(tf.expand_dims(image, 0),
                                             is_training=is_training)

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors,
            self._config.model.rpn,
            debug=self._debug,
            seed=self._seed,
        )
        if self._with_rcnn:
            # The RCNN submodule which takes the RPN's proposals and
            # classifies them as background or a specific class.
            self._rcnn = RCNN(
                self._num_classes,
                self._config.model.rcnn,
                debug=self._debug,
                seed=self._seed,
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(conv_feature_map, "conv_feature_map", "reduced")

        # Generate anchors for the image based on the anchor reference.
        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(
            conv_feature_map,
            image_shape,
            all_anchors,
            gt_boxes=gt_boxes,
            is_training=is_training,
        )

        prediction_dict = {
            "rpn_prediction": rpn_prediction,
        }

        if self._debug:
            prediction_dict["image"] = image
            prediction_dict["image_shape"] = image_shape
            prediction_dict["all_anchors"] = all_anchors
            prediction_dict["anchor_reference"] = tf.convert_to_tensor(
                self._anchor_reference)
            if gt_boxes is not None:
                prediction_dict["gt_boxes"] = gt_boxes
            prediction_dict["conv_feature_map"] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction["proposals"])
            classification_pred = self._rcnn(
                conv_feature_map,
                proposals,
                image_shape,
                self.base_network,
                gt_boxes=gt_boxes,
                is_training=is_training,
            )

            prediction_dict["classification_prediction"] = classification_pred

        return prediction_dict
Example #5
class FasterRCNN(snt.AbstractModule):
    """Faster RCNN Network module

    Builds the Faster RCNN network architecture using different submodules.
    Calculates the total loss of the model based on the different losses by
    each of the submodules.

    It is also responsible for building the anchor reference, which is used in
    the graph for generating the dynamic anchors.
    """
    def __init__(self, config, name="fasterrcnn"):
        super(FasterRCNN, self).__init__(name=name)

        # Main configuration object; it holds not only the necessary
        # information for this module but also the configuration for each of
        # the different submodules.
        self._config = config

        # Total number of classes to classify. If not using RCNN then it is not
        # used. TODO: Make it *more* optional.
        self._num_classes = config.model.network.num_classes

        # Generate the network with RCNN, thus allowing for classification of
        # objects and not just finding them.
        self._with_rcnn = config.model.network.with_rcnn

        # Turn on debug mode, which returns more Tensors that can be used for
        # better visualization and (of course) debugging.
        self._debug = config.train.debug
        self._seed = config.train.seed

        # Anchor config; check out the docs of base_config.yml for a better
        # understanding of how anchors work.
        self._anchor_base_size = config.model.anchors.base_size
        self._anchor_scales = np.array(config.model.anchors.scales)
        self._anchor_ratios = np.array(config.model.anchors.ratios)
        self._anchor_stride = config.model.anchors.stride

        # Anchor reference for building dynamic anchors for each image in the
        # computation graph.
        self._anchor_reference = generate_anchors_reference(
            self._anchor_base_size, self._anchor_ratios, self._anchor_scales)
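        # The reference holds one (x1, y1, x2, y2) box per scale/ratio
        # combination; _generate_anchors later shifts it over the whole
        # feature map to build the per-image anchors.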

        # Total number of anchors per point.
        self._num_anchors = self._anchor_reference.shape[0]

        # Weights used to sum each of the losses of the submodules
        self._rpn_cls_loss_weight = config.model.loss.rpn_cls_loss_weight
        self._rpn_reg_loss_weight = config.model.loss.rpn_reg_loss_weights

        self._rcnn_cls_loss_weight = config.model.loss.rcnn_cls_loss_weight
        self._rcnn_reg_loss_weight = config.model.loss.rcnn_reg_loss_weights
        self._losses_collections = ["fastercnn_losses"]

        # We want the pretrained model to be outside the FasterRCNN name scope.
        self.base_network = TruncatedBaseNetwork(config.model.base_network)

    def _build(self, image, gt_boxes=None, is_training=False):
        """
        Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that image.
                Its shape should be `(num_gt_boxes, 5)`
                Where for each gt box we have (x1, y1, x2, y2, label),
                in that order.
            is_training: A boolean indicating whether the network is being
                used for training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the bboxes
                we have (x1, y1, x2, y2).
        """
        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)
        # A Tensor with the feature map for the image,
        # its shape should be `(feature_height, feature_width, 512)`.
        # The shape depends on the pretrained network in use.

        # Set rank and last dimension before using base network
        # TODO: Why does it lose information when using a queue?
        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(tf.expand_dims(image, 0),
                                             is_training=is_training)

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors,
            self._config.model.rpn,
            debug=self._debug,
            seed=self._seed,
        )
        if self._with_rcnn:
            # The RCNN submodule which takes the RPN's proposals and
            # classifies them as background or a specific class.
            self._rcnn = RCNN(
                self._num_classes,
                self._config.model.rcnn,
                debug=self._debug,
                seed=self._seed,
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(conv_feature_map, "conv_feature_map", "reduced")

        # Generate anchors for the image based on the anchor reference.
        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(
            conv_feature_map,
            image_shape,
            all_anchors,
            gt_boxes=gt_boxes,
            is_training=is_training,
        )

        prediction_dict = {
            "rpn_prediction": rpn_prediction,
        }

        if self._debug:
            prediction_dict["image"] = image
            prediction_dict["image_shape"] = image_shape
            prediction_dict["all_anchors"] = all_anchors
            prediction_dict["anchor_reference"] = tf.convert_to_tensor(
                self._anchor_reference)
            if gt_boxes is not None:
                prediction_dict["gt_boxes"] = gt_boxes
            prediction_dict["conv_feature_map"] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction["proposals"])
            classification_pred = self._rcnn(
                conv_feature_map,
                proposals,
                image_shape,
                self.base_network,
                gt_boxes=gt_boxes,
                is_training=is_training,
            )

            prediction_dict["classification_prediction"] = classification_pred

        return prediction_dict

    def loss(self, prediction_dict, return_all=False):
        """Compute the joint training loss for Faster RCNN.

        Args:
            prediction_dict: The output dictionary of the _build method from
                which we use two different main keys:

                rpn_prediction: A dictionary with the output Tensors from the
                    RPN.
                classification_prediction: A dictionary with the output Tensors
                    from the RCNN.

        Returns:
            If `return_all` is False, a tensor for the total loss. If True, a
            dict with all the internal losses (RPN's, RCNN's, regularization
            and total loss).
        """

        with tf.name_scope("losses"):
            rpn_loss_dict = self._rpn.loss(prediction_dict["rpn_prediction"])

            # Each loss has a weight assigned; we multiply the loss by its
            # weight before saving it.
            rpn_loss_dict["rpn_cls_loss"] = (rpn_loss_dict["rpn_cls_loss"] *
                                             self._rpn_cls_loss_weight)
            rpn_loss_dict["rpn_reg_loss"] = (rpn_loss_dict["rpn_reg_loss"] *
                                             self._rpn_reg_loss_weight)

            prediction_dict["rpn_loss_dict"] = rpn_loss_dict

            if self._with_rcnn:
                rcnn_loss_dict = self._rcnn.loss(
                    prediction_dict["classification_prediction"])

                rcnn_loss_dict["rcnn_cls_loss"] = (
                    rcnn_loss_dict["rcnn_cls_loss"] *
                    self._rcnn_cls_loss_weight)
                rcnn_loss_dict["rcnn_reg_loss"] = (
                    rcnn_loss_dict["rcnn_reg_loss"] *
                    self._rcnn_reg_loss_weight)

                prediction_dict["rcnn_loss_dict"] = rcnn_loss_dict
            else:
                rcnn_loss_dict = {}

            all_losses_items = list(rpn_loss_dict.items()) + list(
                rcnn_loss_dict.items())

            for loss_name, loss_tensor in all_losses_items:
                tf.summary.scalar(loss_name,
                                  loss_tensor,
                                  collections=self._losses_collections)
                # We add losses to the losses collection instead of manually
                # summing them, just in case somebody wants to use them in
                # another place.
                tf.losses.add_loss(loss_tensor)

            # The regularization loss is automatically saved by TensorFlow; we
            # log it separately so we can visualize it independently.
            regularization_loss = tf.losses.get_regularization_loss()
            # Total loss without regularization
            no_reg_loss = tf.losses.get_total_loss(
                add_regularization_losses=False)
            total_loss = tf.losses.get_total_loss()

            tf.summary.scalar("total_loss",
                              total_loss,
                              collections=self._losses_collections)
            tf.summary.scalar("no_reg_loss",
                              no_reg_loss,
                              collections=self._losses_collections)
            tf.summary.scalar(
                "regularization_loss",
                regularization_loss,
                collections=self._losses_collections,
            )

            if return_all:
                loss_dict = {
                    "total_loss": total_loss,
                    "no_reg_loss": no_reg_loss,
                    "regularization_loss": regularization_loss,
                }

                for loss_name, loss_tensor in all_losses_items:
                    loss_dict[loss_name] = loss_tensor

                return loss_dict

            # We return the total loss, which includes:
            # - rpn loss
            # - rcnn loss (if activated)
            # - regularization loss
            return total_loss

    def _generate_anchors(self, feature_map_shape):
        """Generate anchor for an image.

        Using the feature map, the output of the pretrained network for an
        image, and the anchor_reference generated using the anchor config
        values. We generate a list of anchors.

        Anchors are just fixed bounding boxes of different ratios and sizes
        that are uniformly generated throught the image.

        Args:
            feature_map_shape: Shape of the convolutional feature map used as
                input for the RPN. Should be (batch, height, width, depth).

        Returns:
            all_anchors: A flattened Tensor with all the anchors of shape
                `(num_anchors_per_point * feature_width * feature_height, 4)`
                using the (x1, y1, x2, y2) convention.
        """
        with tf.variable_scope("generate_anchors"):
            grid_width = feature_map_shape[2]  # width
            grid_height = feature_map_shape[1]  # height
            shift_x = tf.range(grid_width) * self._anchor_stride
            shift_y = tf.range(grid_height) * self._anchor_stride
            shift_x, shift_y = tf.meshgrid(shift_x, shift_y)

            shift_x = tf.reshape(shift_x, [-1])
            shift_y = tf.reshape(shift_y, [-1])

            shifts = tf.stack([shift_x, shift_y, shift_x, shift_y], axis=0)

            shifts = tf.transpose(shifts)
            # Shifts is now a (H x W, 4) Tensor

            # Expand dims to use broadcasting sum.
            all_anchors = np.expand_dims(self._anchor_reference,
                                         axis=0) + tf.expand_dims(shifts,
                                                                  axis=1)
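            # Broadcasting: the reference becomes (1, num_anchors, 4) and the
            # shifts become (H * W, 1, 4), so the sum has shape
            # (H * W, num_anchors, 4), i.e. every reference anchor translated
            # to every position of the feature map grid.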

            # Flatten
            all_anchors = tf.reshape(all_anchors, (-1, 4))
            return all_anchors

    @property
    def summary(self):
        """
        Generate merged summary of all the sub-summaries used inside the
        Faster R-CNN network.
        """
        summaries = [
            tf.summary.merge_all(key="rpn"),
        ]

        summaries.append(tf.summary.merge_all(key=self._losses_collections[0]))

        if self._with_rcnn:
            summaries.append(tf.summary.merge_all(key="rcnn"))

        return tf.summary.merge(summaries)

    @property
    def vars_summary(self):
        return {
            key: tf.summary.merge_all(key=collection)
            for key, collections in VAR_LOG_LEVELS.items()
            for collection in collections
        }

    def get_trainable_vars(self):
        """Get trainable vars included in the module."""
        trainable_vars = snt.get_variables_in_module(self)
        if self._config.model.base_network.trainable:
            pretrained_trainable_vars = self.base_network.get_trainable_vars()
            if len(pretrained_trainable_vars):
                tf.logging.info("Training {} vars from pretrained module; "
                                'from "{}" to "{}".'.format(
                                    len(pretrained_trainable_vars),
                                    pretrained_trainable_vars[0].name,
                                    pretrained_trainable_vars[-1].name,
                                ))
            else:
                tf.logging.info("No vars from pretrained module to train.")
            trainable_vars += pretrained_trainable_vars
        else:
            tf.logging.info("Not training variables from pretrained module")

        return trainable_vars

    def get_base_network_checkpoint_vars(self):
        return self.base_network.get_base_network_checkpoint_vars()

    def get_checkpoint_file(self):
        return self.base_network.get_checkpoint_file()
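
A minimal, hypothetical sketch of how the FasterRCNN module above can be wired
into a training graph. `FasterRCNN`, `loss` and `get_trainable_vars` are the
methods defined in this example; the `config` object (assumed to be loaded from
the project's YAML configuration and to expose the fields read in `__init__`),
the placeholder shapes and the Momentum optimizer are illustrative assumptions,
not part of the original code.

import tensorflow as tf

# `config` is assumed to come from the project's configuration loader.
model = FasterRCNN(config)

# A single image and its ground truth boxes, fed through placeholders here
# purely for illustration (the real pipeline feeds them from a dataset).
image_ph = tf.placeholder(tf.float32, shape=(None, None, 3))
gt_boxes_ph = tf.placeholder(tf.float32, shape=(None, 5))  # x1, y1, x2, y2, label

# Calling the Sonnet module runs _build and returns the prediction dict.
prediction_dict = model(image_ph, gt_boxes_ph, is_training=True)
total_loss = model.loss(prediction_dict)

optimizer = tf.train.MomentumOptimizer(learning_rate=1e-3, momentum=0.9)
train_op = optimizer.minimize(
    total_loss, var_list=model.get_trainable_vars())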
Example #6
    def _build(self, image, gt_boxes=None, is_training=True):
        """
        Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(1, height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that image.
                Its shape should be `(num_gt_boxes, 5)`
                Where for each gt box we have (x1, y1, x2, y2, label),
                in that order.
            is_training: A boolean indicating whether the network is being
                used for training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the bboxes
                we have (x1, y1, x2, y2).
        """
        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)
        # A Tensor with the feature map for the image,
        # its shape should be `(feature_height, feature_width, 512)`.
        # The shape depends on the pretrained network in use.
        conv_feature_map = self.base_network(image, is_training=is_training)

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(self._num_anchors,
                        self._config.rpn,
                        debug=self._debug,
                        seed=self._seed)
        if self._with_rcnn:
            # The RCNN submodule which takes the RPN's proposals and
            # classifies them as background or a specific class.
            self._rcnn = RCNN(self._num_classes,
                              self._config.rcnn,
                              debug=self._debug,
                              seed=self._seed)

        image_shape = tf.shape(image)[1:3]

        variable_summaries(conv_feature_map, 'conv_feature_map', ['rpn'])

        # Generate anchors for the image based on the anchor reference.
        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(conv_feature_map,
                                   image_shape,
                                   all_anchors,
                                   gt_boxes=gt_boxes)

        prediction_dict = {
            'rpn_prediction': rpn_prediction,
        }

        if self._debug:
            prediction_dict['image'] = image
            prediction_dict['image_shape'] = image_shape
            prediction_dict['all_anchors'] = all_anchors
            prediction_dict['anchor_reference'] = tf.convert_to_tensor(
                self._anchor_reference)
            prediction_dict['gt_boxes'] = gt_boxes
            prediction_dict['conv_feature_map'] = conv_feature_map

        if self._with_rcnn:
            classification_pred = self._rcnn(conv_feature_map,
                                             rpn_prediction['proposals'],
                                             image_shape,
                                             gt_boxes=gt_boxes,
                                             is_training=is_training)

            prediction_dict['classification_prediction'] = classification_pred

        return prediction_dict
class FasterRCNN(snt.AbstractModule):
    def __init__(self, config, name='fasterrcnn'):
        super(FasterRCNN, self).__init__(name=name)

        self._config = config

        self._num_classes = config.model.network.num_classes

        # Generate network with RCNN 
        self._with_rcnn = config.model.network.with_rcnn

        self._debug = config.train.debug
        self._seed = config.train.seed

        self._anchor_base_size = config.model.anchors.base_size
        self._anchor_scales = np.array(config.model.anchors.scales)
        self._anchor_ratios = np.array(config.model.anchors.ratios)
        self._anchor_stride = config.model.anchors.stride

        self._anchor_reference = generate_anchors_reference(
            self._anchor_base_size, self._anchor_ratios, self._anchor_scales
        )

        self._num_anchors = self._anchor_reference.shape[0]

        # Weights used to sum each of the losses of the submodules
        self._rpn_cls_loss_weight = config.model.loss.rpn_cls_loss_weight
        self._rpn_reg_loss_weight = config.model.loss.rpn_reg_loss_weights

        self._rcnn_cls_loss_weight = config.model.loss.rcnn_cls_loss_weight
        self._rcnn_reg_loss_weight = config.model.loss.rcnn_reg_loss_weights
        self._losses_collections = ['fastercnn_losses']

        self.base_network = TruncatedBaseNetwork(config.model.base_network)

    def _build(self, image, gt_boxes=None, is_training=False):

        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)

        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(
            tf.expand_dims(image, 0), is_training=is_training
        )

        # The RPN submodule which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors, self._config.model.rpn,
            debug=self._debug, seed=self._seed
        )
        if self._with_rcnn:
            self._rcnn = RCNN(
                self._num_classes, self._config.model.rcnn,
                debug=self._debug, seed=self._seed
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(
            conv_feature_map, 'conv_feature_map', 'reduced'
        )

        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
        rpn_prediction = self._rpn(
            conv_feature_map, image_shape, all_anchors,
            gt_boxes=gt_boxes, is_training=is_training
        )

        prediction_dict = {
            'rpn_prediction': rpn_prediction,
        }

        if self._debug:
            prediction_dict['image'] = image
            prediction_dict['image_shape'] = image_shape
            prediction_dict['all_anchors'] = all_anchors
            prediction_dict['anchor_reference'] = tf.convert_to_tensor(
                self._anchor_reference
            )
            if gt_boxes is not None:
                prediction_dict['gt_boxes'] = gt_boxes
            prediction_dict['conv_feature_map'] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction['proposals'])
            classification_pred = self._rcnn(
                conv_feature_map, proposals,
                image_shape, self.base_network,
                gt_boxes=gt_boxes, is_training=is_training
            )

            prediction_dict['classification_prediction'] = classification_pred

        return prediction_dict

    def loss(self, prediction_dict, return_all=False):
        with tf.name_scope('losses'):
            rpn_loss_dict = self._rpn.loss(
                prediction_dict['rpn_prediction']
            )

            rpn_loss_dict['rpn_cls_loss'] = (
                rpn_loss_dict['rpn_cls_loss'] * self._rpn_cls_loss_weight)
            rpn_loss_dict['rpn_reg_loss'] = (
                rpn_loss_dict['rpn_reg_loss'] * self._rpn_reg_loss_weight)

            prediction_dict['rpn_loss_dict'] = rpn_loss_dict

            if self._with_rcnn:
                rcnn_loss_dict = self._rcnn.loss(
                    prediction_dict['classification_prediction']
                )

                rcnn_loss_dict['rcnn_cls_loss'] = (
                    rcnn_loss_dict['rcnn_cls_loss'] *
                    self._rcnn_cls_loss_weight
                )
                rcnn_loss_dict['rcnn_reg_loss'] = (
                    rcnn_loss_dict['rcnn_reg_loss'] *
                    self._rcnn_reg_loss_weight
                )

                prediction_dict['rcnn_loss_dict'] = rcnn_loss_dict
            else:
                rcnn_loss_dict = {}

            all_losses_items = (
                list(rpn_loss_dict.items()) + list(rcnn_loss_dict.items()))

            for loss_name, loss_tensor in all_losses_items:
                tf.summary.scalar(
                    loss_name, loss_tensor,
                    collections=self._losses_collections
                )
                tf.losses.add_loss(loss_tensor)

            regularization_loss = tf.losses.get_regularization_loss()
            no_reg_loss = tf.losses.get_total_loss(
                add_regularization_losses=False
            )
            total_loss = tf.losses.get_total_loss()

            tf.summary.scalar(
                'total_loss', total_loss,
                collections=self._losses_collections
            )
            tf.summary.scalar(
                'no_reg_loss', no_reg_loss,
                collections=self._losses_collections
            )
            tf.summary.scalar(
                'regularization_loss', regularization_loss,
                collections=self._losses_collections
            )

            if return_all:
                loss_dict = {
                    'total_loss': total_loss,
                    'no_reg_loss': no_reg_loss,
                    'regularization_loss': regularization_loss,
                }

                for loss_name, loss_tensor in all_losses_items:
                    loss_dict[loss_name] = loss_tensor

                return loss_dict

            return total_loss

    def _generate_anchors(self, feature_map_shape):
        with tf.variable_scope('generate_anchors'):
            grid_width = feature_map_shape[2]  # width
            grid_height = feature_map_shape[1]  # height
            shift_x = tf.range(grid_width) * self._anchor_stride
            shift_y = tf.range(grid_height) * self._anchor_stride
            shift_x, shift_y = tf.meshgrid(shift_x, shift_y)

            shift_x = tf.reshape(shift_x, [-1])
            shift_y = tf.reshape(shift_y, [-1])

            shifts = tf.stack(
                [shift_x, shift_y, shift_x, shift_y],
                axis=0
            )

            shifts = tf.transpose(shifts)
            all_anchors = (
                np.expand_dims(self._anchor_reference, axis=0) +
                tf.expand_dims(shifts, axis=1)
            )

            all_anchors = tf.reshape(
                all_anchors, (-1, 4)
            )
            return all_anchors

    @property
    def summary(self):
        summaries = [
            tf.summary.merge_all(key='rpn'),
        ]

        summaries.append(
            tf.summary.merge_all(key=self._losses_collections[0])
        )

        if self._with_rcnn:
            summaries.append(tf.summary.merge_all(key='rcnn'))

        return tf.summary.merge(summaries)

    @property
    def vars_summary(self):
        return {
            key: tf.summary.merge_all(key=collection)
            for key, collections in VAR_LOG_LEVELS.items()
            for collection in collections
        }

    def get_trainable_vars(self):
        trainable_vars = snt.get_variables_in_module(self)
        if self._config.model.base_network.trainable:
            pretrained_trainable_vars = self.base_network.get_trainable_vars()
            if len(pretrained_trainable_vars):
                tf.logging.info(
                    'Training {} vars from pretrained module; '
                    'from "{}" to "{}".'.format(
                        len(pretrained_trainable_vars),
                        pretrained_trainable_vars[0].name,
                        pretrained_trainable_vars[-1].name,
                    )
                )
            else:
                tf.logging.info('No vars from pretrained module to train.')
            trainable_vars += pretrained_trainable_vars
        else:
            tf.logging.info('Not training variables from pretrained module')

        return trainable_vars

    def get_base_network_checkpoint_vars(self):
        return self.base_network.get_base_network_checkpoint_vars()

    def get_checkpoint_file(self):
        return self.base_network.get_checkpoint_file()
Example #8
    def setUp(self):
        tf.reset_default_graph()

        self._num_classes = 5
        self._num_proposals = 256
        self._total_num_gt = 128
        self._image_shape = (600, 800)
        # The score we'll give to the true labels when testing for perfect
        # score generation.
        self._high_score = 100

        self._equality_delta = 1e-03

        self._config = EasyDict({
            "enabled": True,
            "layer_sizes": [4096, 4096],
            "dropout_keep_prob": 1.0,
            "activation_function": "relu6",
            "use_mean": False,
            "target_normalization_variances": [1.0, 1.0],
            "rcnn_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "bbox_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "cls_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "l2_regularization_scale": 0.0005,
            "l1_sigma": 3.0,
            "loss": {
                "type": "cross_entropy",
                "weight": [1, 0.01, 0.05, 0.02, 0.1, 0.005],
            },
            "roi": {
                "pooling_mode": "crop",
                "pooled_width": 7,
                "pooled_height": 7,
                "padding": "VALID",
            },
            "proposals": {
                "class_max_detections": 100,
                "class_nms_threshold": 0.6,
                "total_max_detections": 300,
                "min_prob_threshold": 0.0,
            },
            "target": {
                "foreground_fraction": 0.25,
                "minibatch_size": 64,
                "foreground_threshold": 0.5,
                "background_threshold_high": 0.5,
                "background_threshold_low": 0.1,
                "target_normalization_variances": [1.0, 1.0],
            },
        })

        self._base_network = MockBaseNetwork()
        self._shared_model = RCNN(self._num_classes, self._config)

        # Declare placeholders
        # We use the '_ph' suffix for placeholders.
        self._pretrained_feature_map_shape = (
            self._num_proposals,
            self._config.roi.pooled_width,
            self._config.roi.pooled_height,
            4,
        )
        self._pretrained_feature_map_ph = tf.placeholder(
            tf.float32, shape=self._pretrained_feature_map_shape)

        self._proposals_shape = (self._num_proposals, 4)
        self._proposals_ph = tf.placeholder(tf.float32,
                                            shape=self._proposals_shape)

        self._image_shape_shape = (2, )
        self._image_shape_ph = tf.placeholder(tf.float32,
                                              shape=self._image_shape_shape)

        self._gt_boxes_shape = (self._total_num_gt, 5)
        self._gt_boxes_ph = tf.placeholder(tf.float32,
                                           shape=self._gt_boxes_shape)
Example #9
class RCNNTest(tf.test.TestCase):
    def setUp(self):
        tf.reset_default_graph()

        self._num_classes = 5
        self._num_proposals = 256
        self._total_num_gt = 128
        self._image_shape = (600, 800)
        # The score we'll give to the true labels when testing for perfect
        # score generation.
        self._high_score = 100

        self._equality_delta = 1e-03

        self._config = EasyDict({
            "enabled": True,
            "layer_sizes": [4096, 4096],
            "dropout_keep_prob": 1.0,
            "activation_function": "relu6",
            "use_mean": False,
            "target_normalization_variances": [1.0, 1.0],
            "rcnn_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "bbox_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "cls_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "l2_regularization_scale": 0.0005,
            "l1_sigma": 3.0,
            "loss": {
                "type": "cross_entropy",
                "weight": [1, 0.01, 0.05, 0.02, 0.1, 0.005],
            },
            "roi": {
                "pooling_mode": "crop",
                "pooled_width": 7,
                "pooled_height": 7,
                "padding": "VALID",
            },
            "proposals": {
                "class_max_detections": 100,
                "class_nms_threshold": 0.6,
                "total_max_detections": 300,
                "min_prob_threshold": 0.0,
            },
            "target": {
                "foreground_fraction": 0.25,
                "minibatch_size": 64,
                "foreground_threshold": 0.5,
                "background_threshold_high": 0.5,
                "background_threshold_low": 0.1,
                "target_normalization_variances": [1.0, 1.0],
            },
        })

        self._base_network = MockBaseNetwork()
        self._shared_model = RCNN(self._num_classes, self._config)

        # Declare placeholders
        # We use the '_ph' suffix for placeholders.
        self._pretrained_feature_map_shape = (
            self._num_proposals,
            self._config.roi.pooled_width,
            self._config.roi.pooled_height,
            4,
        )
        self._pretrained_feature_map_ph = tf.placeholder(
            tf.float32, shape=self._pretrained_feature_map_shape)

        self._proposals_shape = (self._num_proposals, 4)
        self._proposals_ph = tf.placeholder(tf.float32,
                                            shape=self._proposals_shape)

        self._image_shape_shape = (2, )
        self._image_shape_ph = tf.placeholder(tf.float32,
                                              shape=self._image_shape_shape)

        self._gt_boxes_shape = (self._total_num_gt, 5)
        self._gt_boxes_ph = tf.placeholder(tf.float32,
                                           shape=self._gt_boxes_shape)

    def _run_net_with_feed_dict(self, net, feed_dict):
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(net, feed_dict=feed_dict)

    def _check_returning_shapes(self, prediction_dict, training=False):
        """Asserts a prediction_dict has the right shapes.

        This includes testing that:
            - objects, objects_labels and objects_labels_prob have the same
                shape in the first dimension. (i.e. the same number of
                objects).
            - objects has shape (_, 4). objects_labels and objects_labels_prob
                have shape (_,).
            - cls_score and cls_prob have shape (num_proposals,
                num_classes + 1).
            - bbox_offsets has shape (num_proposals, num_classes * 4).

        And, if training:
            - cls_target has shape (num_proposals,).
            - bbox_offsets_target has shape (num_proposals, 4).
        """

        objects_shape = prediction_dict["objects"].shape
        objects_labels_shape = prediction_dict["labels"].shape
        objects_labels_prob_shape = prediction_dict["probs"].shape

        cls_score_shape = prediction_dict["rcnn"]["cls_score"].shape
        cls_prob_shape = prediction_dict["rcnn"]["cls_prob"].shape

        bbox_offsets_shape = prediction_dict["rcnn"]["bbox_offsets"].shape

        # We choose cls_score as the 'standard' num_proposals which we will
        # compare to the other shapes that should include num_proposals. We
        # could have chosen a different one.
        num_proposals = cls_score_shape[0]

        self.assertEqual(objects_shape[0], objects_labels_shape[0])
        self.assertEqual(objects_shape[0], objects_labels_prob_shape[0])

        self.assertEqual(objects_shape[1], 4)
        self.assertEqual(len(objects_labels_shape), 1)
        self.assertEqual(len(objects_labels_prob_shape), 1)

        self.assertEqual(cls_score_shape, cls_prob_shape)
        self.assertEqual(cls_prob_shape,
                         (num_proposals, self._num_classes + 1))

        self.assertEqual(bbox_offsets_shape,
                         (num_proposals, self._num_classes * 4))

        if training:
            cls_target_shape = prediction_dict["target"]["cls"].shape
            self.assertEqual(cls_target_shape, (num_proposals, ))

            bbox_offsets_trgt_shape = prediction_dict["target"][
                "bbox_offsets"].shape
            self.assertEqual(bbox_offsets_trgt_shape, (num_proposals, 4))

    def testReturningShapes(self):
        """Tests we're returning consistent shapes.

        We test both the case where we're training and the case where we are
        not.
        """

        # Prediction session (not training)
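        # (no gt_boxes are passed, so no target entries should be produced)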
        rcnn_net_not_training = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
        )

        prediction_dict_not_training = self._run_net_with_feed_dict(
            rcnn_net_not_training,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
            },
        )
        # Training session
        rcnn_net_training = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
            self._gt_boxes_ph,
        )
        prediction_dict_training = self._run_net_with_feed_dict(
            rcnn_net_training,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
                self._gt_boxes_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape,
                ),
            },
        )
        # Assertions
        self._check_returning_shapes(prediction_dict_not_training)
        self._check_returning_shapes(prediction_dict_training, training=True)

    def testMinibatchBehaviour(self):
        """Tests we're using minibatch_size correctly when testing."""

        rcnn_net = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
            self._gt_boxes_ph,
        )

        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
                self._gt_boxes_ph:
                np.random.randint(
                    low=0,
                    high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape,
                ),
            },
        )
        # Assertions
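        # Proposals sampled into the minibatch get a class target >= 0; the
        # rest are presumably flagged with a negative label (e.g. -1), so
        # counting the non-negative entries gives the effective minibatch
        # size.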
        self.assertLessEqual(
            prediction_dict["target"]["cls"][
                prediction_dict["target"]["cls"] >= 0].shape[0],
            self._config.target.minibatch_size,
        )

    def testNumberOfObjects(self):
        """Tests we're not returning too many objects.

        The number of objects returned should be at most the number of
        proposals received times the number of classes.
        """

        rcnn_net = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
        )

        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net,
            feed_dict={
                self._pretrained_feature_map_ph:
                np.random.rand(*self._pretrained_feature_map_shape),
                self._proposals_ph:
                np.random.randint(
                    0,
                    high=np.amin(self._image_shape),
                    size=self._proposals_shape,
                ),
                self._image_shape_ph:
                self._image_shape,
            },
        )
        # Assertions
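        # Per-class filtering can keep each proposal at most once per class,
        # so the total number of objects is bounded by
        # num_proposals * num_classes.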
        self.assertLessEqual(prediction_dict["objects"].shape[0],
                             self._num_proposals * self._num_classes)

    def testLoss(self):
        """Tests we're computing loss correctly.

        In particular, we test that the loss comes out as (practically) zero
        when the predictions match the targets perfectly.
        """

        # Generate placeholders and loss_graph
        cls_score_shape = (self._num_proposals, self._num_classes + 1)
        cls_score_ph = tf.placeholder(tf.float32, cls_score_shape)

        cls_prob_shape = (self._num_proposals, self._num_classes + 1)
        cls_prob_ph = tf.placeholder(tf.float32, cls_prob_shape)

        cls_target_shape = (self._num_proposals, )
        cls_target_ph = tf.placeholder(tf.float32, cls_target_shape)

        bbox_offsets_shape = (self._num_proposals, self._num_classes * 4)
        bbox_offsets_ph = tf.placeholder(tf.float32, bbox_offsets_shape)

        bbox_offsets_target_shape = (self._num_proposals, 4)
        bbox_offsets_target_ph = tf.placeholder(tf.float32,
                                                bbox_offsets_target_shape)

        loss_graph = self._shared_model.loss({
            "rcnn": {
                "cls_score": cls_score_ph,
                "cls_prob": cls_prob_ph,
                "bbox_offsets": bbox_offsets_ph,
            },
            "target": {
                "cls": cls_target_ph,
                "bbox_offsets": bbox_offsets_target_ph,
            },
        })
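        # loss() is expected to return a dict with 'rcnn_cls_loss' and
        # 'rcnn_reg_loss' entries, which we compare against zero below.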

        # Generate values that ensure a perfect score
        # We first initialize all our values to zero.
        cls_score = np.zeros(cls_score_shape, dtype=np.float32)
        cls_prob = np.zeros(cls_prob_shape, dtype=np.float32)
        cls_target = np.zeros(cls_target_shape, dtype=np.float32)
        bbox_offsets = np.zeros(bbox_offsets_shape, dtype=np.float32)
        bbox_offsets_target = np.zeros(bbox_offsets_target_shape,
                                       dtype=np.float32)
        for i in range(self._num_proposals):
            this_class = np.random.randint(low=1, high=self._num_classes + 1)

            cls_score[i][this_class] = self._high_score
            cls_prob[i][this_class] = 1.0
            cls_target[i] = this_class

            # Find out where in the axis 1 in bbox_offsets we should
            # put the offsets, because the shape is
            # (num_proposals, num_classes * 4), and we're using
            # 1-indexed classes.
            class_place = (this_class - 1) * 4
            for j in range(4):
                this_coord = np.random.randint(low=0,
                                               high=np.amax(self._image_shape))

                bbox_offsets[i][class_place + j] = this_coord
                bbox_offsets_target[i][j] = this_coord
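        # With cls_prob exactly one-hot at the target class and the bbox
        # offsets matching their targets for that class, both the
        # classification and regression losses should be (practically) zero.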
        # Now get the loss dict using the values we just generated.
        loss_dict = self._run_net_with_feed_dict(
            loss_graph,
            feed_dict={
                cls_score_ph: cls_score,
                cls_prob_ph: cls_prob,
                cls_target_ph: cls_target,
                bbox_offsets_ph: bbox_offsets,
                bbox_offsets_target_ph: bbox_offsets_target,
            },
        )
        # Assertions
        self.assertAlmostEqual(loss_dict["rcnn_cls_loss"],
                               0,
                               delta=self._equality_delta)
        self.assertAlmostEqual(loss_dict["rcnn_reg_loss"],
                               0,
                               delta=self._equality_delta)