    def _parse_train_data(self, data):
        """Parses data for training.

        Args:
            data: the decoded tensor dictionary from TfExampleDecoder.

        Returns:
            inputs: a dictionary of input tensors with the following entries.
                image: image tensor that is preprocessed to have normalized value
                    and dimension [output_size[0], output_size[1], 3].
                image_info: a 2D `Tensor` that encodes the information of the image
                    and the applied preprocessing. It is in the format of
                    [[original_height, original_width], [scaled_height, scaled_width],
                    [y_scale, x_scale], [y_offset, x_offset]].
                gt_boxes: groundtruth bounding box annotations. The box is
                    represented in [y1, x1, y2, x2] format. The coordinates are
                    w.r.t. the scaled image that is fed to the network. The tensor
                    is padded with -1 to the fixed dimension
                    [self._max_num_instances, 4].
                gt_classes: groundtruth class annotations. The tensor is padded
                    with -1 to the fixed dimension [self._max_num_instances].
                gt_masks: groundtruth masks cropped by the bounding box and resized
                    to a fixed size determined by mask_crop_size. Only included
                    when self._include_mask is True.
            labels: a dictionary of tensors used for training. The following
                describes {key: value} pairs in the dictionary.
                anchor_boxes: ordered dictionary with keys
                    [min_level, min_level+1, ..., max_level]. The values are
                    tensors with shape [height_l, width_l, 4] representing anchor
                    boxes at each level.
                image_info: same as the entry in `inputs` above.
                rpn_score_targets: ordered dictionary with keys
                    [min_level, min_level+1, ..., max_level]. The values are
                    tensors with shape [height_l, width_l, anchors_per_location],
                    where height_l and width_l are the dimensions of the class
                    logits at the l-th level.
                rpn_box_targets: ordered dictionary with keys
                    [min_level, min_level+1, ..., max_level]. The values are
                    tensors with shape
                    [height_l, width_l, anchors_per_location * 4], where height_l
                    and width_l are the dimensions of the bounding box regression
                    output at the l-th level.
        """

        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']
        if self._include_mask:
            masks = data['groundtruth_instance_masks']

        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(tf.greater(tf.size(is_crowds), 0),
                                  lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                                  lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
            classes = tf.gather(classes, indices, axis=None)
            boxes = tf.gather(boxes, indices, axis=None)
            if self._include_mask:
                masks = tf.gather(masks, indices, axis=None)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            if self._include_mask:
                image, boxes, masks = input_utils.random_horizontal_flip(image, boxes, masks) # pylint: disable=W0632
            else:
                image, boxes = input_utils.random_horizontal_flip(image, boxes) # pylint: disable=W0632

        # Converts boxes from normalized coordinates to pixel coordinates.
        # Now the coordinates of boxes are w.r.t. the original image.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(self._output_size, 2 ** self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        # Now the coordinates of boxes are w.r.t the scaled image.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale, image_info[1, :], offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices, axis=None)
        classes = tf.gather(classes, indices, axis=None)

        if self._include_mask:
            masks = tf.gather(masks, indices, axis=None)
            # Transfers boxes back to the original image space and normalizes them.
            cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
            cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
            cropped_boxes = box_utils.normalize_boxes(cropped_boxes, image_shape)
            num_masks = tf.shape(masks)[0]
            masks = tf.image.crop_and_resize(
                tf.expand_dims(masks, axis=-1),
                cropped_boxes,
                box_indices=tf.range(num_masks, dtype=tf.int32),
                crop_size=[self._mask_crop_size, self._mask_crop_size],
                method='bilinear')
            masks = tf.squeeze(masks, axis=-1)

        # Assigns anchor targets.
        # Note that after the target assignment, box targets are absolute pixel
        # offsets w.r.t. the scaled image.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size, (image_height, image_width))

        anchor_labeler = anchor.RpnAnchorLabeler(input_anchor,
                                                 self._rpn_match_threshold,
                                                 self._rpn_unmatched_threshold,
                                                 self._rpn_batch_size_per_im,
                                                 self._rpn_fg_fraction)

        rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
            boxes, tf.cast(tf.expand_dims(classes, axis=-1), tf.float32))

        inputs = {
            'image': image,
            'image_info': image_info,
        }

        # Packs labels for model_fn outputs.
        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
            'rpn_score_targets': rpn_score_targets,
            'rpn_box_targets': rpn_box_targets,
        }

        inputs['gt_boxes'] = input_utils.pad_to_fixed_size(boxes, self._max_num_instances, -1)
        inputs['gt_classes'] = input_utils.pad_to_fixed_size(classes, self._max_num_instances, -1)

        if self._include_mask:
            inputs['gt_masks'] = input_utils.pad_to_fixed_size(masks, self._max_num_instances, -1)

        return inputs, labels
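
The coordinate bookkeeping above is easier to follow with concrete numbers. The standalone sketch below (NumPy only; it assumes the 4x2 `image_info` layout described in the docstring and a plain aspect-preserving resize with no random scale or crop, so the offset is zero) walks a box from original-image pixels to the scaled image and back, mirroring the `cropped_boxes` computation used before mask cropping.

import numpy as np

# Toy setup: a 400x600 image resized into a 1024x1024 output, assuming an
# aspect-preserving resize with no augmentation (so the offset is [0, 0]).
original = np.array([400.0, 600.0])
scale = np.min(1024.0 / original)            # 1024/600 ~= 1.707
scaled = original * scale                    # ~[682.7, 1024.0]
offset = np.array([0.0, 0.0])

# The assumed 4x2 image_info layout: original size, scaled size, scale, offset.
image_info = np.stack([original, scaled, [scale, scale], offset])

# A box in [y1, x1, y2, x2] pixel coordinates w.r.t. the original image.
box = np.array([100.0, 150.0, 300.0, 450.0])

# Forward mapping to the scaled image (what resize_and_crop_boxes applies).
scaled_box = box * scale - np.tile(offset, 2)

# Inverse mapping back to the original image, then normalization, mirroring
# the cropped_boxes computation above.
restored_box = (scaled_box + np.tile(offset, 2)) / scale
normalized_box = restored_box / np.tile(original, 2)

print(scaled_box)      # ~[170.7, 256.0, 512.0, 768.0]
print(normalized_box)  # [0.25, 0.25, 0.75, 0.75]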
Example #2
    def _parse_train_data(self, data):
        """Parses data for training and evaluation."""
        classes = data['groundtruth_classes']
        boxes = data['groundtruth_boxes']
        is_crowds = data['groundtruth_is_crowd']

        # Skips annotations with `is_crowd` = True.
        if self._skip_crowd_during_training and self._is_training:
            num_groundtruths = tf.shape(input=classes)[0]
            with tf.control_dependencies([num_groundtruths, is_crowds]):
                indices = tf.cond(
                    pred=tf.greater(tf.size(input=is_crowds), 0),
                    true_fn=lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
                    false_fn=lambda: tf.cast(tf.range(num_groundtruths),
                                             tf.int64))
            classes = tf.gather(classes, indices, axis=None)
            boxes = tf.gather(boxes, indices, axis=None)

        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Flips image randomly during training.
        if self._aug_rand_hflip:
            image, boxes = input_utils.random_horizontal_flip(image, boxes)  # pylint: disable=W0632

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(boxes, image_shape)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=self._aug_scale_min,
            aug_scale_max=self._aug_scale_max)
        image_height, image_width, _ = image.get_shape().as_list()

        # Resizes and crops boxes.
        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices, axis=None)
        classes = tf.gather(classes, indices, axis=None)

        # Assigns anchors.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)

        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        # Packs labels for model_fn outputs.
        labels = {
            'cls_targets': cls_targets,
            'box_targets': box_targets,
            'anchor_boxes': input_anchor.multilevel_boxes,
            'num_positives': num_positives,
            'image_info': image_info,
        }

        return image, labels
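
One detail worth unpacking is the `padded_size` argument passed to resize_and_crop_image: padding the output up to a multiple of 2**max_level guarantees that every FPN level has integer spatial dimensions. A minimal stand-in for the helper (the real `input_utils.compute_padded_size` may differ in detail) could look like this:

import math

def compute_padded_size_sketch(output_size, stride):
    # Rounds each spatial dimension up to the nearest multiple of `stride`
    # so that feature maps at all pyramid levels have integer sizes.
    return tuple(int(math.ceil(side / stride) * stride) for side in output_size)

# With max_level=6 the stride is 2**6 = 64: a 640x832 output is already
# aligned, while 650x820 pads up to 704x832.
print(compute_padded_size_sketch((650, 820), 2 ** 6))  # (704, 832)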
Example #3
    def _parse_predict_data(self, data):
        """Parses data for prediction."""
        # Gets original image and its size.
        image = data['image']
        image_shape = tf.shape(input=image)[0:2]

        # Normalizes image with mean and std pixel values.
        image = input_utils.normalize_image(image)

        # Resizes and crops image.
        image, image_info = input_utils.resize_and_crop_image(
            image,
            self._output_size,
            padded_size=input_utils.compute_padded_size(
                self._output_size, 2**self._max_level),
            aug_scale_min=1.0,
            aug_scale_max=1.0)
        image_height, image_width, _ = image.get_shape().as_list()

        # Computes anchor boxes.
        input_anchor = anchor.Anchor(self._min_level, self._max_level,
                                     self._num_scales, self._aspect_ratios,
                                     self._anchor_size,
                                     (image_height, image_width))

        labels = {
            'anchor_boxes': input_anchor.multilevel_boxes,
            'image_info': image_info,
        }

        # Converts boxes from normalized coordinates to pixel coordinates.
        boxes = box_utils.denormalize_boxes(data['groundtruth_boxes'],
                                            image_shape)
        groundtruths = {
            'source_id': data['source_id'],
            'num_detections': tf.squeeze(tf.shape(data['groundtruth_classes'])),
            'boxes': boxes,
            'classes': data['groundtruth_classes'],
            'areas': data['groundtruth_area'],
            'is_crowds': tf.cast(data['groundtruth_is_crowd'], tf.int32),
        }
        groundtruths['source_id'] = dataloader_utils.process_source_id(
            groundtruths['source_id'])
        groundtruths = dataloader_utils.pad_groundtruths_to_fixed_size(
            groundtruths, self._max_num_instances)
        labels.update(groundtruths)

        # Computes training objective for evaluation loss.
        classes = data['groundtruth_classes']

        image_scale = image_info[2, :]
        offset = image_info[3, :]
        boxes = input_utils.resize_and_crop_boxes(boxes, image_scale,
                                                  image_info[1, :], offset)

        # Filters out ground truth boxes that are all zeros.
        indices = box_utils.get_non_empty_box_indices(boxes)
        boxes = tf.gather(boxes, indices, axis=None)
        classes = tf.gather(classes, indices, axis=None)

        # Assigns anchors.
        anchor_labeler = anchor.AnchorLabeler(input_anchor,
                                              self._match_threshold,
                                              self._unmatched_threshold)

        (cls_targets, box_targets,
         num_positives) = anchor_labeler.label_anchors(
             boxes, tf.cast(tf.expand_dims(classes, axis=1), tf.float32))

        labels['cls_targets'] = cls_targets
        labels['box_targets'] = box_targets
        labels['num_positives'] = num_positives

        return image, labels
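
For context, a parser like this is typically mapped over a tf.data input pipeline. The sketch below is hypothetical wiring, not part of this file: the `decoder` that produces the tensor dictionary and the batching details are assumptions, shown only to illustrate how `_parse_predict_data` would be consumed.

import tensorflow as tf

def build_eval_dataset(file_pattern, decoder, parser, batch_size):
    # Hypothetical wiring: `decoder.decode` is assumed to turn a serialized
    # tf.Example into the tensor dictionary the parser expects.
    files = tf.data.Dataset.list_files(file_pattern, shuffle=False)
    dataset = files.interleave(
        tf.data.TFRecordDataset, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.map(
        lambda serialized: parser._parse_predict_data(decoder.decode(serialized)),
        num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size, drop_remainder=True).prefetch(
        tf.data.AUTOTUNE)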
Example #4
    def build_outputs(self, inputs, is_training):
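        """Builds model outputs: RPN, Fast R-CNN and (optionally) mask head predictions."""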
        model_outputs = {}

        image = inputs['image']
        _, image_height, image_width, _ = image.get_shape().as_list()
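        # Extracts backbone features and builds the multilevel FPN features.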
        backbone_features = self._backbone_fn(image, is_training)
        fpn_features = self._fpn_fn(backbone_features, is_training)

        rpn_score_outputs, rpn_box_outputs = self._rpn_head_fn(
            fpn_features, is_training)

        model_outputs.update({
            'rpn_score_outputs': tf.nest.map_structure(
                lambda x: tf.cast(x, tf.float32), rpn_score_outputs),
            'rpn_box_outputs': tf.nest.map_structure(
                lambda x: tf.cast(x, tf.float32), rpn_box_outputs),
        })

        input_anchor = anchor.Anchor(
            self._params.model_params.architecture.min_level,
            self._params.model_params.architecture.max_level,
            self._params.anchor.num_scales, self._params.anchor.aspect_ratios,
            self._params.anchor.anchor_size, (image_height, image_width))

        rpn_rois, _ = self._generate_rois_fn(rpn_box_outputs,
                                             rpn_score_outputs,
                                             input_anchor.multilevel_boxes,
                                             inputs['image_info'][:, 1, :],
                                             is_training)

        if is_training:
            rpn_rois = tf.stop_gradient(rpn_rois)

            # Sample proposals.
            rpn_rois, matched_gt_boxes, matched_gt_classes, matched_gt_indices = (
                self._sample_rois_fn(rpn_rois, inputs['gt_boxes'],
                                     inputs['gt_classes']))

            # Create bounding box training targets.
            box_targets = box_utils.encode_boxes(
                matched_gt_boxes, rpn_rois, weights=[10.0, 10.0, 5.0, 5.0])

            # If the target is background, the box target is set to all 0s.
            box_targets = tf.where(
                tf.tile(
                    tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1),
                    [1, 1, 4]), tf.zeros_like(box_targets), box_targets)

            model_outputs.update({
                'class_targets': matched_gt_classes,
                'box_targets': box_targets,
            })

        roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=7)

        class_outputs, box_outputs = self._frcnn_head_fn(
            roi_features, is_training)

        model_outputs.update({
            'class_outputs': tf.nest.map_structure(
                lambda x: tf.cast(x, tf.float32), class_outputs),
            'box_outputs': tf.nest.map_structure(
                lambda x: tf.cast(x, tf.float32), box_outputs),
        })

        # Adds this output in training mode as well so the checkpoint stays
        # loadable in predict mode. If it were skipped during training, the heads
        # would be out of order and checkpoint loading would fail.
        boxes, scores, classes, valid_detections = self._generate_detections_fn(
            box_outputs, class_outputs, rpn_rois,
            inputs['image_info'][:, 1:2, :])

        model_outputs.update({
            'num_detections': valid_detections,
            'detection_boxes': boxes,
            'detection_classes': classes,
            'detection_scores': scores,
        })

        if not self._include_mask:
            return model_outputs

        if is_training:
            rpn_rois, classes, mask_targets = self._sample_masks_fn(
                rpn_rois, matched_gt_boxes, matched_gt_classes,
                matched_gt_indices, inputs['gt_masks'])
            mask_targets = tf.stop_gradient(mask_targets)

            classes = tf.cast(classes, tf.int32)

            model_outputs.update({
                'mask_targets': mask_targets,
                'sampled_class_targets': classes,
            })
        else:
            rpn_rois = boxes
            classes = tf.cast(classes, tf.int32)

        mask_roi_features = spatial_transform_ops.multilevel_crop_and_resize(
            fpn_features, rpn_rois, output_size=14)

        mask_outputs = self._mrcnn_head_fn(mask_roi_features, classes,
                                           is_training)

        if is_training:
            model_outputs.update({
                'mask_outputs': tf.nest.map_structure(
                    lambda x: tf.cast(x, tf.float32), mask_outputs),
            })
        else:
            model_outputs.update(
                {'detection_masks': tf.nn.sigmoid(mask_outputs)})

        return model_outputs
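
The `tf.where(tf.tile(...))` pattern used above to zero out regression targets for background proposals is easy to verify in isolation; a self-contained toy example:

import tensorflow as tf

# [batch=1, num_rois=3] matched classes; class 0 denotes background.
matched_gt_classes = tf.constant([[2, 0, 5]])
box_targets = tf.ones([1, 3, 4])

# Broadcasts the background mask over the 4 box coordinates, then zeroes the
# regression targets wherever the matched class is background.
is_background = tf.tile(
    tf.expand_dims(tf.equal(matched_gt_classes, 0), axis=-1), [1, 1, 4])
box_targets = tf.where(
    is_background, tf.zeros_like(box_targets), box_targets)

print(box_targets)  # the middle ROI's targets are all zeros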