class RCNNTest(tf.test.TestCase):

    def setUp(self):
        tf.reset_default_graph()

        self._num_classes = 5
        self._num_proposals = 256
        self._total_num_gt = 128
        self._image_shape = (600, 800)
        # The score we'll give to the true labels when testing for perfect
        # score generation.
        self._high_score = 100

        self._equality_delta = 1e-03

        self._config = EasyDict({
            'enabled': True,
            'layer_sizes': [4096, 4096],
            'dropout_keep_prob': 1.0,
            'activation_function': 'relu6',
            'use_mean': False,
            'initializer': {
                'type': 'variance_scaling_initializer',
                'factor': 1.0,
                'uniform': True,
                'mode': 'FAN_AVG',
            },
            'l2_regularization_scale': 0.0005,
            'roi': {
                'pooling_mode': 'crop',
                'pooled_width': 7,
                'pooled_height': 7,
                'padding': 'VALID',
            },
            'proposals': {
                'class_max_detections': 100,
                'class_nms_threshold': 0.6,
                'total_max_detections': 300,
                'min_prob_threshold': 0.0,
            },
            'target': {
                'foreground_fraction': 0.25,
                'minibatch_size': 64,
                'foreground_threshold': 0.5,
                'background_threshold_high': 0.5,
                'background_threshold_low': 0.1,
            },
        })

        self._shared_model = RCNN(self._num_classes, self._config)

        # Declare placeholders.
        # We use the '_ph' suffix for placeholders.
        self._pretrained_feature_map_shape = (
            self._num_proposals,
            self._config.roi.pooled_width,
            self._config.roi.pooled_height,
            4,
        )
        self._pretrained_feature_map_ph = tf.placeholder(
            tf.float32, shape=self._pretrained_feature_map_shape)

        self._proposals_shape = (self._num_proposals, 5)
        self._proposals_ph = tf.placeholder(
            tf.float32, shape=self._proposals_shape)

        self._image_shape_shape = (2,)
        self._image_shape_ph = tf.placeholder(
            tf.float32, shape=self._image_shape_shape)

        self._gt_boxes_shape = (self._total_num_gt, 5)
        self._gt_boxes_ph = tf.placeholder(
            tf.float32, shape=self._gt_boxes_shape)

    def _run_net_with_feed_dict(self, net, feed_dict):
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(net, feed_dict=feed_dict)

    def _check_returning_shapes(self, prediction_dict, training=False):
        """Asserts that a prediction_dict has the right shapes.

        This includes testing that:
            - objects, objects_labels and objects_labels_prob have the same
              shape in the first dimension (i.e. the same number of
              objects).
            - objects has shape (_, 4); objects_labels and
              objects_labels_prob have shape (_,).
            - cls_score and cls_prob have shape
              (num_proposals, num_classes + 1).
            - bbox_offsets has shape (num_proposals, num_classes * 4).

        And, if training:
            - cls_target has shape (num_proposals,).
            - bbox_offsets_target has shape (num_proposals, 4).
        """
        objects_shape = prediction_dict['objects'].shape
        objects_labels_shape = prediction_dict['labels'].shape
        objects_labels_prob_shape = prediction_dict['probs'].shape

        cls_score_shape = prediction_dict['rcnn']['cls_score'].shape
        cls_prob_shape = prediction_dict['rcnn']['cls_prob'].shape

        bbox_offsets_shape = prediction_dict['rcnn']['bbox_offsets'].shape

        # We choose cls_score as the 'standard' num_proposals which we will
        # compare to the other shapes that should include num_proposals. We
        # could have chosen a different one.
        num_proposals = cls_score_shape[0]

        self.assertEqual(objects_shape[0], objects_labels_shape[0])
        self.assertEqual(objects_shape[0], objects_labels_prob_shape[0])
        self.assertEqual(objects_shape[1], 4)
        self.assertEqual(len(objects_labels_shape), 1)
        self.assertEqual(len(objects_labels_prob_shape), 1)
        self.assertEqual(cls_score_shape, cls_prob_shape)
        self.assertEqual(
            cls_prob_shape, (num_proposals, self._num_classes + 1))
        self.assertEqual(
            bbox_offsets_shape, (num_proposals, self._num_classes * 4))

        if training:
            cls_target_shape = prediction_dict['target']['cls'].shape
            self.assertEqual(cls_target_shape, (num_proposals,))
            bbox_offsets_trgt_shape = (
                prediction_dict['target']['bbox_offsets'].shape)
            self.assertEqual(bbox_offsets_trgt_shape, (num_proposals, 4))

    def testReturningShapes(self):
        """Tests that we return consistent shapes.

        We test both the case where we're training and the case where we
        are not.
        """
        # Prediction session (not training).
        rcnn_net_not_training = self._shared_model(
            self._pretrained_feature_map_ph, self._proposals_ph,
            self._image_shape_ph)
        prediction_dict_not_training = self._run_net_with_feed_dict(
            rcnn_net_not_training, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
            })

        # Training session.
        rcnn_net_training = self._shared_model(
            self._pretrained_feature_map_ph, self._proposals_ph,
            self._image_shape_ph, self._gt_boxes_ph)
        prediction_dict_training = self._run_net_with_feed_dict(
            rcnn_net_training, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
                self._gt_boxes_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape),
            })

        # Assertions
        self._check_returning_shapes(prediction_dict_not_training)
        self._check_returning_shapes(
            prediction_dict_training, training=True)

    def testMinibatchBehaviour(self):
        """Tests that we're using minibatch_size correctly."""
        rcnn_net = self._shared_model(
            self._pretrained_feature_map_ph, self._proposals_ph,
            self._image_shape_ph, self._gt_boxes_ph)
        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
                self._gt_boxes_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape),
            })

        # Assertions
        self.assertLessEqual(
            prediction_dict['target']['cls'][
                prediction_dict['target']['cls'] >= 0].shape[0],
            self._config.target.minibatch_size)

    def testNumberOfObjects(self):
        """Tests that we don't return more objects than proposals received.
        """
        rcnn_net = self._shared_model(
            self._pretrained_feature_map_ph, self._proposals_ph,
            self._image_shape_ph)
        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
            })

        # Assertions
        self.assertLessEqual(
            prediction_dict['objects'].shape[0], self._num_proposals)

    def testLoss(self):
        """Tests that we're computing the loss correctly.

        In particular, we test whether we compute a perfect score when we
        should.
        """
        # Generate placeholders and loss_graph.
        cls_score_shape = (self._num_proposals, self._num_classes + 1)
        cls_score_ph = tf.placeholder(tf.float32, cls_score_shape)

        cls_prob_shape = (self._num_proposals, self._num_classes + 1)
        cls_prob_ph = tf.placeholder(tf.float32, cls_prob_shape)

        cls_target_shape = (self._num_proposals,)
        cls_target_ph = tf.placeholder(tf.float32, cls_target_shape)

        bbox_offsets_shape = (self._num_proposals, self._num_classes * 4)
        bbox_offsets_ph = tf.placeholder(tf.float32, bbox_offsets_shape)

        bbox_offsets_target_shape = (self._num_proposals, 4)
        bbox_offsets_target_ph = tf.placeholder(
            tf.float32, bbox_offsets_target_shape)

        loss_graph = self._shared_model.loss({
            'rcnn': {
                'cls_score': cls_score_ph,
                'cls_prob': cls_prob_ph,
                'bbox_offsets': bbox_offsets_ph,
            },
            'target': {
                'cls': cls_target_ph,
                'bbox_offsets': bbox_offsets_target_ph,
            }
        })

        # Generate values that ensure a perfect score.
        # We first initialize all our values to zero.
        cls_score = np.zeros(cls_score_shape, dtype=np.float32)
        cls_prob = np.zeros(cls_prob_shape, dtype=np.float32)
        cls_target = np.zeros(cls_target_shape, dtype=np.float32)
        bbox_offsets = np.zeros(bbox_offsets_shape, dtype=np.float32)
        bbox_offsets_target = np.zeros(
            bbox_offsets_target_shape, dtype=np.float32)

        for i in range(self._num_proposals):
            this_class = np.random.randint(
                low=1, high=self._num_classes + 1)
            cls_score[i][this_class] = self._high_score
            cls_prob[i][this_class] = 1.
            cls_target[i] = this_class

            # Find out where in axis 1 of bbox_offsets we should put the
            # offsets: the shape is (num_proposals, num_classes * 4), and
            # we're using 1-indexed classes.
            class_place = (this_class - 1) * 4
            for j in range(4):
                this_coord = np.random.randint(
                    low=0, high=np.amax(self._image_shape))
                bbox_offsets[i][class_place + j] = this_coord
                bbox_offsets_target[i][j] = this_coord

        # Now get the loss dict using the values we just generated.
        loss_dict = self._run_net_with_feed_dict(loss_graph, feed_dict={
            cls_score_ph: cls_score,
            cls_prob_ph: cls_prob,
            cls_target_ph: cls_target,
            bbox_offsets_ph: bbox_offsets,
            bbox_offsets_target_ph: bbox_offsets_target,
        })

        # Assertions
        self.assertAlmostEqual(
            loss_dict['rcnn_cls_loss'], 0, delta=self._equality_delta)
        self.assertAlmostEqual(
            loss_dict['rcnn_reg_loss'], 0, delta=self._equality_delta)
class FasterRCNN(snt.AbstractModule):
    """Faster RCNN Network module.

    Builds the Faster RCNN network architecture using different submodules.
    Calculates the total loss of the model based on the different losses of
    each of the submodules.

    It is also responsible for building the anchor reference, which is used
    in the graph for generating the dynamic anchors.
    """

    def __init__(self, config, name="fasterrcnn"):
        super(FasterRCNN, self).__init__(name=name)

        # Main configuration object. It holds not only the necessary
        # information for this module but also the configuration for each
        # of the different submodules.
        self._config = config

        # Total number of classes to classify. If not using RCNN then it is
        # not used. TODO: Make it *more* optional.
        self._num_classes = config.model.network.num_classes

        # Generate the network with RCNN, thus allowing for classification
        # of objects and not just finding them.
        self._with_rcnn = config.model.network.with_rcnn

        # Turn on debug mode, which returns more Tensors that can be used
        # for better visualization and (of course) debugging.
        self._debug = config.train.debug
        self._seed = config.train.seed

        # Anchor config; check out the docs of base_config.yml for a better
        # understanding of how anchors work.
        self._anchor_base_size = config.model.anchors.base_size
        self._anchor_scales = np.array(config.model.anchors.scales)
        self._anchor_ratios = np.array(config.model.anchors.ratios)
        self._anchor_stride = config.model.anchors.stride

        # Anchor reference for building dynamic anchors for each image in
        # the computation graph.
        self._anchor_reference = generate_anchors_reference(
            self._anchor_base_size, self._anchor_ratios,
            self._anchor_scales)

        # Total number of anchors per point.
        self._num_anchors = self._anchor_reference.shape[0]

        # Weights used to sum each of the losses of the submodules.
        self._rpn_cls_loss_weight = config.model.loss.rpn_cls_loss_weight
        self._rpn_reg_loss_weight = config.model.loss.rpn_reg_loss_weights

        self._rcnn_cls_loss_weight = config.model.loss.rcnn_cls_loss_weight
        self._rcnn_reg_loss_weight = config.model.loss.rcnn_reg_loss_weights

        self._losses_collections = ["fastercnn_losses"]

        # We want the pretrained model to be outside the FasterRCNN name
        # scope.
        self.base_network = TruncatedBaseNetwork(config.model.base_network)

    def _build(self, image, gt_boxes=None, is_training=False):
        """Returns bounding boxes and classification probabilities.

        Args:
            image: A tensor with the image.
                Its shape should be `(height, width, 3)`.
            gt_boxes: A tensor with all the ground truth boxes of that
                image. Its shape should be `(num_gt_boxes, 5)`, where for
                each gt box we have (x1, y1, x2, y2, label), in that order.
            is_training: A boolean indicating whether the network is being
                used for training.

        Returns:
            classification_prob: A tensor with the softmax probability for
                each of the bounding boxes found in the image.
                Its shape should be: (num_bboxes, num_categories + 1)
            classification_bbox: A tensor with the bounding boxes found.
                Its shape should be: (num_bboxes, 4). For each of the
                bboxes we have (x1, y1, x2, y2).
        """
        if gt_boxes is not None:
            gt_boxes = tf.cast(gt_boxes, tf.float32)

        # A Tensor with the feature map for the image. Its shape should be
        # `(feature_height, feature_width, 512)`; the exact shape depends
        # on the pretrained network in use.

        # Set rank and last dimension before using the base network.
        # TODO: Why does it lose information when using a queue?
        image.set_shape((None, None, 3))

        conv_feature_map = self.base_network(
            tf.expand_dims(image, 0), is_training=is_training)

        # The RPN submodule, which generates proposals of objects.
        self._rpn = RPN(
            self._num_anchors,
            self._config.model.rpn,
            debug=self._debug,
            seed=self._seed,
        )
        if self._with_rcnn:
            # The RCNN submodule, which takes RPN's proposals and
            # classifies them as background or as a specific class.
            self._rcnn = RCNN(
                self._num_classes,
                self._config.model.rcnn,
                debug=self._debug,
                seed=self._seed,
            )

        image_shape = tf.shape(image)[0:2]

        variable_summaries(conv_feature_map, "conv_feature_map", "reduced")

        # Generate anchors for the image based on the anchor reference.
        all_anchors = self._generate_anchors(tf.shape(conv_feature_map))

        rpn_prediction = self._rpn(
            conv_feature_map,
            image_shape,
            all_anchors,
            gt_boxes=gt_boxes,
            is_training=is_training,
        )

        prediction_dict = {
            "rpn_prediction": rpn_prediction,
        }

        if self._debug:
            prediction_dict["image"] = image
            prediction_dict["image_shape"] = image_shape
            prediction_dict["all_anchors"] = all_anchors
            prediction_dict["anchor_reference"] = tf.convert_to_tensor(
                self._anchor_reference)
            if gt_boxes is not None:
                prediction_dict["gt_boxes"] = gt_boxes
            prediction_dict["conv_feature_map"] = conv_feature_map

        if self._with_rcnn:
            proposals = tf.stop_gradient(rpn_prediction["proposals"])
            classification_pred = self._rcnn(
                conv_feature_map,
                proposals,
                image_shape,
                self.base_network,
                gt_boxes=gt_boxes,
                is_training=is_training,
            )

            prediction_dict["classification_prediction"] = (
                classification_pred)

        return prediction_dict

    def loss(self, prediction_dict, return_all=False):
        """Compute the joint training loss for Faster RCNN.

        Args:
            prediction_dict: The output dictionary of the _build method,
                from which we use two different main keys:
                    rpn_prediction: A dictionary with the output Tensors
                        from the RPN.
                    classification_prediction: A dictionary with the output
                        Tensors from the RCNN.

        Returns:
            If `return_all` is False, a tensor for the total loss. If True,
            a dict with all the internal losses (RPN's, RCNN's,
            regularization and total loss).
        """
        with tf.name_scope("losses"):
            rpn_loss_dict = self._rpn.loss(
                prediction_dict["rpn_prediction"])

            # Each loss has a weight assigned; we multiply the loss by its
            # weight before saving it.
            rpn_loss_dict["rpn_cls_loss"] = (
                rpn_loss_dict["rpn_cls_loss"] * self._rpn_cls_loss_weight)
            rpn_loss_dict["rpn_reg_loss"] = (
                rpn_loss_dict["rpn_reg_loss"] * self._rpn_reg_loss_weight)

            prediction_dict["rpn_loss_dict"] = rpn_loss_dict

            if self._with_rcnn:
                rcnn_loss_dict = self._rcnn.loss(
                    prediction_dict["classification_prediction"])

                rcnn_loss_dict["rcnn_cls_loss"] = (
                    rcnn_loss_dict["rcnn_cls_loss"] *
                    self._rcnn_cls_loss_weight)
                rcnn_loss_dict["rcnn_reg_loss"] = (
                    rcnn_loss_dict["rcnn_reg_loss"] *
                    self._rcnn_reg_loss_weight)

                prediction_dict["rcnn_loss_dict"] = rcnn_loss_dict
            else:
                rcnn_loss_dict = {}

            all_losses_items = (
                list(rpn_loss_dict.items()) + list(rcnn_loss_dict.items()))

            for loss_name, loss_tensor in all_losses_items:
                tf.summary.scalar(
                    loss_name, loss_tensor,
                    collections=self._losses_collections)
                # We add losses to the losses collection instead of
                # manually summing them, just in case somebody wants to
                # use them in another place.
                tf.losses.add_loss(loss_tensor)

            # The regularization loss is collected automatically by
            # TensorFlow; we log it separately so we can visualize it
            # independently.
            regularization_loss = tf.losses.get_regularization_loss()
            # Total loss without the regularization loss.
            no_reg_loss = tf.losses.get_total_loss(
                add_regularization_losses=False)
            total_loss = tf.losses.get_total_loss()

            tf.summary.scalar(
                "total_loss", total_loss,
                collections=self._losses_collections)
            tf.summary.scalar(
                "no_reg_loss", no_reg_loss,
                collections=self._losses_collections)
            tf.summary.scalar(
                "regularization_loss", regularization_loss,
                collections=self._losses_collections)

            if return_all:
                loss_dict = {
                    "total_loss": total_loss,
                    "no_reg_loss": no_reg_loss,
                    "regularization_loss": regularization_loss,
                }

                for loss_name, loss_tensor in all_losses_items:
                    loss_dict[loss_name] = loss_tensor

                return loss_dict

            # We return the total loss, which includes:
            #   - the RPN loss
            #   - the RCNN loss (if activated)
            #   - the regularization loss
            return total_loss

    def _generate_anchors(self, feature_map_shape):
        """Generate anchors for an image.

        Using the feature map (the output of the pretrained network for an
        image) and the anchor reference generated from the anchor config
        values, we generate a list of anchors.

        Anchors are just fixed bounding boxes of different ratios and sizes
        that are uniformly generated throughout the image.

        Args:
            feature_map_shape: Shape of the convolutional feature map used
                as input for the RPN. Should be
                (batch, height, width, depth).

        Returns:
            all_anchors: A flattened Tensor with all the anchors of shape
                `(num_anchors_per_point * feature_width * feature_height,
                4)` using the (x1, y1, x2, y2) convention.
        """
        with tf.variable_scope("generate_anchors"):
            grid_width = feature_map_shape[2]  # width
            grid_height = feature_map_shape[1]  # height
            shift_x = tf.range(grid_width) * self._anchor_stride
            shift_y = tf.range(grid_height) * self._anchor_stride
            shift_x, shift_y = tf.meshgrid(shift_x, shift_y)

            shift_x = tf.reshape(shift_x, [-1])
            shift_y = tf.reshape(shift_y, [-1])

            shifts = tf.stack([shift_x, shift_y, shift_x, shift_y], axis=0)

            shifts = tf.transpose(shifts)
            # Shifts now is a (H x W, 4) Tensor.

            # Expand dims to use a broadcasting sum.
            all_anchors = (
                np.expand_dims(self._anchor_reference, axis=0) +
                tf.expand_dims(shifts, axis=1))

            # Flatten.
            all_anchors = tf.reshape(all_anchors, (-1, 4))
            return all_anchors

    @property
    def summary(self):
        """
        Generate a merged summary of all the sub-summaries used inside the
        Faster R-CNN network.
        """
        summaries = [
            tf.summary.merge_all(key="rpn"),
        ]
        summaries.append(
            tf.summary.merge_all(key=self._losses_collections[0]))

        if self._with_rcnn:
            summaries.append(tf.summary.merge_all(key="rcnn"))

        return tf.summary.merge(summaries)

    @property
    def vars_summary(self):
        # For each logging level, merge every collection that belongs to
        # it, instead of letting later collections overwrite earlier ones
        # under the same key.
        return {
            key: [
                tf.summary.merge_all(key=collection)
                for collection in collections
            ]
            for key, collections in VAR_LOG_LEVELS.items()
        }

    def get_trainable_vars(self):
        """Get trainable vars included in the module."""
        trainable_vars = snt.get_variables_in_module(self)
        if self._config.model.base_network.trainable:
            pretrained_trainable_vars = (
                self.base_network.get_trainable_vars())
            if len(pretrained_trainable_vars):
                tf.logging.info(
                    "Training {} vars from pretrained module; "
                    'from "{}" to "{}".'.format(
                        len(pretrained_trainable_vars),
                        pretrained_trainable_vars[0].name,
                        pretrained_trainable_vars[-1].name,
                    ))
            else:
                tf.logging.info("No vars from pretrained module to train.")
            trainable_vars += pretrained_trainable_vars
        else:
            tf.logging.info(
                "Not training variables from pretrained module.")

        return trainable_vars

    def get_base_network_checkpoint_vars(self):
        return self.base_network.get_base_network_checkpoint_vars()

    def get_checkpoint_file(self):
        return self.base_network.get_checkpoint_file()
def _build(self, image, gt_boxes=None, is_training=True):
    """Returns bounding boxes and classification probabilities.

    Args:
        image: A tensor with the image.
            Its shape should be `(1, height, width, 3)`.
        gt_boxes: A tensor with all the ground truth boxes of that image.
            Its shape should be `(num_gt_boxes, 5)`, where for each gt box
            we have (x1, y1, x2, y2, label), in that order.
        is_training: A boolean indicating whether the network is being
            used for training.

    Returns:
        classification_prob: A tensor with the softmax probability for
            each of the bounding boxes found in the image.
            Its shape should be: (num_bboxes, num_categories + 1)
        classification_bbox: A tensor with the bounding boxes found.
            Its shape should be: (num_bboxes, 4). For each of the bboxes
            we have (x1, y1, x2, y2).
    """
    if gt_boxes is not None:
        gt_boxes = tf.cast(gt_boxes, tf.float32)

    # A Tensor with the feature map for the image. Its shape should be
    # `(feature_height, feature_width, 512)`; the exact shape depends on
    # the pretrained network in use.
    conv_feature_map = self.base_network(image, is_training=is_training)

    # The RPN submodule, which generates proposals of objects.
    self._rpn = RPN(self._num_anchors, self._config.rpn,
                    debug=self._debug, seed=self._seed)
    if self._with_rcnn:
        # The RCNN submodule, which takes RPN's proposals and classifies
        # them as background or as a specific class.
        self._rcnn = RCNN(self._num_classes, self._config.rcnn,
                          debug=self._debug, seed=self._seed)

    image_shape = tf.shape(image)[1:3]

    variable_summaries(conv_feature_map, 'conv_feature_map', ['rpn'])

    # Generate anchors for the image based on the anchor reference.
    all_anchors = self._generate_anchors(tf.shape(conv_feature_map))
    rpn_prediction = self._rpn(
        conv_feature_map, image_shape, all_anchors, gt_boxes=gt_boxes)

    prediction_dict = {
        'rpn_prediction': rpn_prediction,
    }

    if self._debug:
        prediction_dict['image'] = image
        prediction_dict['image_shape'] = image_shape
        prediction_dict['all_anchors'] = all_anchors
        prediction_dict['anchor_reference'] = tf.convert_to_tensor(
            self._anchor_reference)
        prediction_dict['gt_boxes'] = gt_boxes
        prediction_dict['conv_feature_map'] = conv_feature_map

    if self._with_rcnn:
        classification_pred = self._rcnn(
            conv_feature_map, rpn_prediction['proposals'], image_shape,
            gt_boxes=gt_boxes, is_training=is_training)

        prediction_dict['classification_prediction'] = classification_pred

    return prediction_dict
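One difference worth noting against the newer `_build` above: there, the RCNN consumes `tf.stop_gradient(rpn_prediction['proposals'])`, while this earlier version feeds the proposals tensor through directly. A minimal standalone TF1-style sketch (not from the source) of what `tf.stop_gradient` changes:

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=())
y_plain = x * x
y_stopped = tf.stop_gradient(x) * tf.stop_gradient(x)

# Gradients flow through the plain version, but tf.stop_gradient cuts the
# backward path, so there is no gradient with respect to x at all.
print(tf.gradients(y_plain, x))    # [<tf.Tensor 'gradients/...'>]
print(tf.gradients(y_stopped, x))  # [None]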
class RCNNTest(tf.test.TestCase):

    def setUp(self):
        tf.reset_default_graph()

        self._num_classes = 5
        self._num_proposals = 256
        self._total_num_gt = 128
        self._image_shape = (600, 800)
        # The score we'll give to the true labels when testing for perfect
        # score generation.
        self._high_score = 100

        self._equality_delta = 1e-03

        self._config = EasyDict({
            "enabled": True,
            "layer_sizes": [4096, 4096],
            "dropout_keep_prob": 1.0,
            "activation_function": "relu6",
            "use_mean": False,
            "target_normalization_variances": [1.0, 1.0],
            "rcnn_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "bbox_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "cls_initializer": {
                "type": "variance_scaling_initializer",
                "factor": 1.0,
                "uniform": True,
                "mode": "FAN_AVG",
            },
            "l2_regularization_scale": 0.0005,
            "l1_sigma": 3.0,
            "loss": {
                "type": "cross_entropy",
                "weight": [1, 0.01, 0.05, 0.02, 0.1, 0.005],
            },
            "roi": {
                "pooling_mode": "crop",
                "pooled_width": 7,
                "pooled_height": 7,
                "padding": "VALID",
            },
            "proposals": {
                "class_max_detections": 100,
                "class_nms_threshold": 0.6,
                "total_max_detections": 300,
                "min_prob_threshold": 0.0,
            },
            "target": {
                "foreground_fraction": 0.25,
                "minibatch_size": 64,
                "foreground_threshold": 0.5,
                "background_threshold_high": 0.5,
                "background_threshold_low": 0.1,
                "target_normalization_variances": [1.0, 1.0],
            },
        })

        self._base_network = MockBaseNetwork()
        self._shared_model = RCNN(self._num_classes, self._config)

        # Declare placeholders.
        # We use the '_ph' suffix for placeholders.
        self._pretrained_feature_map_shape = (
            self._num_proposals,
            self._config.roi.pooled_width,
            self._config.roi.pooled_height,
            4,
        )
        self._pretrained_feature_map_ph = tf.placeholder(
            tf.float32, shape=self._pretrained_feature_map_shape)

        self._proposals_shape = (self._num_proposals, 4)
        self._proposals_ph = tf.placeholder(
            tf.float32, shape=self._proposals_shape)

        self._image_shape_shape = (2,)
        self._image_shape_ph = tf.placeholder(
            tf.float32, shape=self._image_shape_shape)

        self._gt_boxes_shape = (self._total_num_gt, 5)
        self._gt_boxes_ph = tf.placeholder(
            tf.float32, shape=self._gt_boxes_shape)

    def _run_net_with_feed_dict(self, net, feed_dict):
        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            return sess.run(net, feed_dict=feed_dict)

    def _check_returning_shapes(self, prediction_dict, training=False):
        """Asserts that a prediction_dict has the right shapes.

        This includes testing that:
            - objects, objects_labels and objects_labels_prob have the same
              shape in the first dimension (i.e. the same number of
              objects).
            - objects has shape (_, 4); objects_labels and
              objects_labels_prob have shape (_,).
            - cls_score and cls_prob have shape
              (num_proposals, num_classes + 1).
            - bbox_offsets has shape (num_proposals, num_classes * 4).

        And, if training:
            - cls_target has shape (num_proposals,).
            - bbox_offsets_target has shape (num_proposals, 4).
        """
        objects_shape = prediction_dict["objects"].shape
        objects_labels_shape = prediction_dict["labels"].shape
        objects_labels_prob_shape = prediction_dict["probs"].shape

        cls_score_shape = prediction_dict["rcnn"]["cls_score"].shape
        cls_prob_shape = prediction_dict["rcnn"]["cls_prob"].shape

        bbox_offsets_shape = prediction_dict["rcnn"]["bbox_offsets"].shape

        # We choose cls_score as the 'standard' num_proposals which we will
        # compare to the other shapes that should include num_proposals. We
        # could have chosen a different one.
        num_proposals = cls_score_shape[0]

        self.assertEqual(objects_shape[0], objects_labels_shape[0])
        self.assertEqual(objects_shape[0], objects_labels_prob_shape[0])
        self.assertEqual(objects_shape[1], 4)
        self.assertEqual(len(objects_labels_shape), 1)
        self.assertEqual(len(objects_labels_prob_shape), 1)
        self.assertEqual(cls_score_shape, cls_prob_shape)
        self.assertEqual(
            cls_prob_shape, (num_proposals, self._num_classes + 1))
        self.assertEqual(
            bbox_offsets_shape, (num_proposals, self._num_classes * 4))

        if training:
            cls_target_shape = prediction_dict["target"]["cls"].shape
            self.assertEqual(cls_target_shape, (num_proposals,))
            bbox_offsets_trgt_shape = (
                prediction_dict["target"]["bbox_offsets"].shape)
            self.assertEqual(bbox_offsets_trgt_shape, (num_proposals, 4))

    def testReturningShapes(self):
        """Tests that we return consistent shapes.

        We test both the case where we're training and the case where we
        are not.
        """
        # Prediction session (not training).
        rcnn_net_not_training = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
        )
        prediction_dict_not_training = self._run_net_with_feed_dict(
            rcnn_net_not_training, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
            })

        # Training session.
        rcnn_net_training = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
            self._gt_boxes_ph,
        )
        prediction_dict_training = self._run_net_with_feed_dict(
            rcnn_net_training, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
                self._gt_boxes_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape),
            })

        # Assertions
        self._check_returning_shapes(prediction_dict_not_training)
        self._check_returning_shapes(
            prediction_dict_training, training=True)

    def testMinibatchBehaviour(self):
        """Tests that we're using minibatch_size correctly."""
        rcnn_net = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
            self._gt_boxes_ph,
        )
        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
                self._gt_boxes_ph: np.random.randint(
                    low=0, high=np.amin(self._image_shape),
                    size=self._gt_boxes_shape),
            })

        # Assertions
        self.assertLessEqual(
            prediction_dict["target"]["cls"][
                prediction_dict["target"]["cls"] >= 0].shape[0],
            self._config.target.minibatch_size)

    def testNumberOfObjects(self):
        """Tests that we're not returning too many objects.

        The number of objects returned should be lower than the number of
        proposals received times the number of classes.
        """
        rcnn_net = self._shared_model(
            self._pretrained_feature_map_ph,
            self._proposals_ph,
            self._image_shape_ph,
            self._base_network,
        )
        prediction_dict = self._run_net_with_feed_dict(
            rcnn_net, feed_dict={
                self._pretrained_feature_map_ph: np.random.rand(
                    *self._pretrained_feature_map_shape),
                self._proposals_ph: np.random.randint(
                    0, high=np.amin(self._image_shape),
                    size=self._proposals_shape),
                self._image_shape_ph: self._image_shape,
            })

        # Assertions
        self.assertLessEqual(
            prediction_dict["objects"].shape[0],
            self._num_proposals * self._num_classes)

    def testLoss(self):
        """Tests that we're computing the loss correctly.

        In particular, we test whether we compute a perfect score when we
        should.
        """
        # Generate placeholders and loss_graph.
        cls_score_shape = (self._num_proposals, self._num_classes + 1)
        cls_score_ph = tf.placeholder(tf.float32, cls_score_shape)

        cls_prob_shape = (self._num_proposals, self._num_classes + 1)
        cls_prob_ph = tf.placeholder(tf.float32, cls_prob_shape)

        cls_target_shape = (self._num_proposals,)
        cls_target_ph = tf.placeholder(tf.float32, cls_target_shape)

        bbox_offsets_shape = (self._num_proposals, self._num_classes * 4)
        bbox_offsets_ph = tf.placeholder(tf.float32, bbox_offsets_shape)

        bbox_offsets_target_shape = (self._num_proposals, 4)
        bbox_offsets_target_ph = tf.placeholder(
            tf.float32, bbox_offsets_target_shape)

        loss_graph = self._shared_model.loss({
            "rcnn": {
                "cls_score": cls_score_ph,
                "cls_prob": cls_prob_ph,
                "bbox_offsets": bbox_offsets_ph,
            },
            "target": {
                "cls": cls_target_ph,
                "bbox_offsets": bbox_offsets_target_ph,
            },
        })

        # Generate values that ensure a perfect score.
        # We first initialize all our values to zero.
        cls_score = np.zeros(cls_score_shape, dtype=np.float32)
        cls_prob = np.zeros(cls_prob_shape, dtype=np.float32)
        cls_target = np.zeros(cls_target_shape, dtype=np.float32)
        bbox_offsets = np.zeros(bbox_offsets_shape, dtype=np.float32)
        bbox_offsets_target = np.zeros(
            bbox_offsets_target_shape, dtype=np.float32)

        for i in range(self._num_proposals):
            this_class = np.random.randint(
                low=1, high=self._num_classes + 1)
            cls_score[i][this_class] = self._high_score
            cls_prob[i][this_class] = 1.0
            cls_target[i] = this_class

            # Find out where in axis 1 of bbox_offsets we should put the
            # offsets: the shape is (num_proposals, num_classes * 4), and
            # we're using 1-indexed classes.
            class_place = (this_class - 1) * 4
            for j in range(4):
                this_coord = np.random.randint(
                    low=0, high=np.amax(self._image_shape))
                bbox_offsets[i][class_place + j] = this_coord
                bbox_offsets_target[i][j] = this_coord

        # Now get the loss dict using the values we just generated.
        loss_dict = self._run_net_with_feed_dict(loss_graph, feed_dict={
            cls_score_ph: cls_score,
            cls_prob_ph: cls_prob,
            cls_target_ph: cls_target,
            bbox_offsets_ph: bbox_offsets,
            bbox_offsets_target_ph: bbox_offsets_target,
        })

        # Assertions
        self.assertAlmostEqual(
            loss_dict["rcnn_cls_loss"], 0, delta=self._equality_delta)
        self.assertAlmostEqual(
            loss_dict["rcnn_reg_loss"], 0, delta=self._equality_delta)