def pad_groundtruths_to_fixed_size(groundtruths: Dict[str, tf.Tensor],
                                   size: int) -> Dict[str, tf.Tensor]:
  """Clips or pads the leading dimension of groundtruth tensors to `size`.

  The dictionary passed in is updated in place and is also the return value.

  Args:
    groundtruths: A dictionary of {`str`: `tf.Tensor`} holding the groundtruth
      annotations `boxes`, `is_crowds`, `areas` and `classes`. An optional
      `attributes` entry, itself a dictionary of tensors, is padded as well.
    size: An `int`, the required size of the first dimension of every padded
      tensor.

  Returns:
    The input dictionary, with the same keys and padded tensors as values.
  """
  fix_size = preprocess_ops.clip_or_pad_to_fixed_size

  # (key, padding value) pairs; `is_crowds` is the only field padded with 0.
  padding_values = (
      ('boxes', -1),
      ('is_crowds', 0),
      ('areas', -1),
      ('classes', -1),
  )
  for key, fill in padding_values:
    groundtruths[key] = fix_size(groundtruths[key], size, fill)

  if 'attributes' in groundtruths:
    attributes = groundtruths['attributes']
    for name, tensor in attributes.items():
      attributes[name] = fix_size(tensor, size, -1)

  return groundtruths
def test_batch_generate_targets(self):
  """Checks target shapes when assignment is mapped over a batch of two."""
  input_size = [512, 512]
  output_size = [128, 128]
  max_num_instances = 128

  image_boxes = tf.constant(
      [
          (10, 300, 15, 370),  # center (y, x) = (12, 335)
          (100, 300, 150, 370),  # center (y, x) = (125, 335)
          (15, 100, 200, 170),  # center (y, x) = (107, 135)
      ],
      dtype=tf.float32)
  image_classes = tf.constant((1, 1, 1), dtype=tf.float32)

  image_boxes = preprocess_ops.clip_or_pad_to_fixed_size(
      image_boxes, max_num_instances, 0)
  image_classes = preprocess_ops.clip_or_pad_to_fixed_size(
      image_classes, max_num_instances, 0)

  # The batch is the same image repeated twice.
  batched_boxes = tf.stack([image_boxes, image_boxes], axis=0)
  batched_classes = tf.stack([image_classes, image_classes], axis=0)

  def assign_targets(example):
    """Runs the CenterNet target assigner on one element of the batch."""
    return target_assigner.assign_centernet_targets(
        labels=example, output_size=output_size, input_size=input_size)

  batched_inputs = {
      'boxes': batched_boxes,
      'classes': batched_classes,
      'groundtruths': {
          'num_detections': tf.constant([3, 3]),
      }
  }
  output_dtypes = {
      'ct_heatmaps': tf.float32,
      'ct_offset': tf.float32,
      'size': tf.float32,
      'box_mask': tf.int32,
      'box_indices': tf.int32
  }
  labels = tf.map_fn(
      fn=assign_targets, elems=batched_inputs, dtype=output_dtypes)

  expected_shapes = (
      ('ct_heatmaps', (2, output_size[0], output_size[1], 90)),
      ('ct_offset', (2, max_num_instances, 2)),
      ('size', (2, max_num_instances, 2)),
      ('box_mask', (2, max_num_instances)),
      ('box_indices', (2, max_num_instances, 2)),
  )
  # Fetch every output first so a missing key fails before any shape check.
  outputs = [labels[key] for key, _ in expected_shapes]
  for tensor, (_, expected_shape) in zip(outputs, expected_shapes):
    self.assertEqual(tensor.shape, expected_shape)
def pad_groundtruths_to_fixed_size(groundtruths, size):
  """Pads the first dimension of groundtruths labels to the fixed size.

  The input dictionary is modified in place and returned.

  Args:
    groundtruths: A dictionary of tensors that contains groundtruth
      annotations of `boxes`, `is_crowds`, `areas` and `classes`. If an
      `attributes` entry (a dictionary of tensors) is present, each of its
      values is padded too.
    size: An `int` that specifies the expected size of the first dimension of
      padded tensors.

  Returns:
    A dictionary of the same keys as input and padded tensors as values.
  """
  groundtruths['boxes'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['boxes'], size, -1)
  # `is_crowds` is padded with 0; every other field is padded with -1.
  groundtruths['is_crowds'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['is_crowds'], size, 0)
  groundtruths['areas'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['areas'], size, -1)
  groundtruths['classes'] = preprocess_ops.clip_or_pad_to_fixed_size(
      groundtruths['classes'], size, -1)
  # Optional per-box attributes must share the fixed first dimension with the
  # other fields; without this they would be returned unpadded.
  if 'attributes' in groundtruths:
    for k, v in groundtruths['attributes'].items():
      groundtruths['attributes'][k] = preprocess_ops.clip_or_pad_to_fixed_size(
          v, size, -1)
  return groundtruths
def _build_label(self, boxes, classes, image_info, unpad_image_shape, data):
  """Assembles the label dictionary together with padded groundtruths.

  Args:
    boxes: box tensor for the labels; clipped or padded with -1 to
      `self._max_num_instances`.
    classes: class tensor for the labels; clipped or padded with -1 to
      `self._max_num_instances`.
    image_info: stored unchanged under `image_info`.
    unpad_image_shape: stored unchanged under `unpad_image_shapes`.
    data: decoded example dictionary. The keys read here are `source_id`,
      `height`, `width`, `image`, `groundtruth_classes`, `groundtruth_boxes`,
      `groundtruth_area` and `groundtruth_is_crowd`.

  Returns:
    A dictionary with keys `boxes`, `classes`, `image_info`,
    `unpad_image_shapes` and `groundtruths`.
  """
  # Sets up groundtruth data for evaluation.
  raw_source_id = data['source_id']
  height = data['height']
  width = data['width']
  num_detections = tf.shape(data['groundtruth_classes'])[0]
  # Groundtruth boxes are denormalized with the decoded image's first two
  # dimensions.
  decoded_image_size = tf.shape(input=data['image'])[0:2]
  denormalized_boxes = box_ops.denormalize_boxes(
      data['groundtruth_boxes'], decoded_image_size)
  gt_classes = data['groundtruth_classes']
  gt_areas = data['groundtruth_area']
  gt_is_crowds = tf.cast(data['groundtruth_is_crowd'], tf.int32)

  groundtruths = {
      'source_id': utils.process_source_id(raw_source_id),
      'height': height,
      'width': width,
      'num_detections': num_detections,
      'boxes': denormalized_boxes,
      'classes': gt_classes,
      'areas': gt_areas,
      'is_crowds': gt_is_crowds,
  }
  groundtruths = utils.pad_groundtruths_to_fixed_size(
      groundtruths, self._max_num_instances)

  padded_boxes = preprocess_ops.clip_or_pad_to_fixed_size(
      boxes, self._max_num_instances, -1)
  padded_classes = preprocess_ops.clip_or_pad_to_fixed_size(
      classes, self._max_num_instances, -1)

  return {
      'boxes': padded_boxes,
      'classes': padded_classes,
      'image_info': image_info,
      'unpad_image_shapes': unpad_image_shape,
      'groundtruths': groundtruths
  }
def testPadToFixedSize(self, input_shape, output_size):
  """Checks that a tensor of ones is clipped or zero-padded along axis 0.

  Args:
    input_shape: shape of the all-ones input tensor; copied with slicing and
      then item-assigned, so it must be a mutable sequence.
    output_size: requested size of the first dimension.
  """
  num_input_rows = input_shape[0]

  # Rows of ones that survive clipping.
  kept_shape = input_shape[:]
  kept_shape[0] = min(output_size, num_input_rows)

  # Rows of zeros appended when the input is shorter than `output_size`.
  pad_shape = input_shape[:]
  pad_shape[0] = max(output_size - num_input_rows, 0)

  ones_part = np.ones(kept_shape)
  zeros_part = np.zeros(pad_shape)
  expected = np.concatenate([ones_part, zeros_part], axis=0)

  padded = preprocess_ops.clip_or_pad_to_fixed_size(
      tf.ones(input_shape), output_size, constant_values=0)
  actual = padded.numpy()

  self.assertAllClose(output_size, actual.shape[0])
  self.assertAllClose(expected, actual)
def check_labels_correct(self, boxes, classes, output_size, input_size):
  """Asserts that CenterNet targets for one image match hand-computed values.

  Args:
    boxes: a sequence of boxes; indices 0 and 2 of each box are scaled by the
      height ratio and indices 1 and 3 by the width ratio.
    classes: a sequence of class ids, one per box.
    output_size: `[height, width]` of the target maps.
    input_size: `[height, width]` of the input image.
  """
  max_num_instances = 128
  # Captured before padding: after padding the leading dimension of `boxes`
  # is `max_num_instances`, not the number of real boxes.
  num_detections = len(boxes)

  boxes = tf.constant(boxes, dtype=tf.float32)
  classes = tf.constant(classes, dtype=tf.float32)

  boxes = preprocess_ops.clip_or_pad_to_fixed_size(
      boxes, max_num_instances, 0)
  classes = preprocess_ops.clip_or_pad_to_fixed_size(
      classes, max_num_instances, 0)

  labels = target_assigner.assign_centernet_targets(
      labels={
          'boxes': boxes,
          'classes': classes,
          'groundtruths': {
              'num_detections': num_detections,
          }
      },
      output_size=output_size,
      input_size=input_size)

  ct_heatmaps = labels['ct_heatmaps']
  ct_offset = labels['ct_offset']
  size = labels['size']
  box_mask = labels['box_mask']
  box_indices = labels['box_indices']

  height_ratio = output_size[0] / input_size[0]
  width_ratio = output_size[1] / input_size[1]

  # Shape checks
  self.assertEqual(ct_heatmaps.shape, (output_size[0], output_size[1], 90))
  self.assertEqual(ct_offset.shape, (max_num_instances, 2))
  self.assertEqual(size.shape, (max_num_instances, 2))
  self.assertEqual(box_mask.shape, (max_num_instances,))
  self.assertEqual(box_indices.shape, (max_num_instances, 2))

  self.assertAllInRange(ct_heatmaps, 0, 1)

  for i in range(num_detections):
    # Check sizes
    self.assertAllEqual(size[i], [
        (boxes[i][2] - boxes[i][0]) * height_ratio,
        (boxes[i][3] - boxes[i][1]) * width_ratio,
    ])

    # Check box indices
    y = tf.math.floor((boxes[i][0] + boxes[i][2]) / 2 * height_ratio)
    x = tf.math.floor((boxes[i][1] + boxes[i][3]) / 2 * width_ratio)
    self.assertAllEqual(box_indices[i], [y, x])

    # check offsets
    true_y = (boxes[i][0] + boxes[i][2]) / 2 * height_ratio
    true_x = (boxes[i][1] + boxes[i][3]) / 2 * width_ratio
    self.assertAllEqual(ct_offset[i], [true_y - y, true_x - x])

  for i in range(num_detections, max_num_instances):
    # Make sure rest are zero
    self.assertAllEqual(size[i], [0, 0])
    self.assertAllEqual(box_indices[i], [0, 0])
    self.assertAllEqual(ct_offset[i], [0, 0])

  # Check mask indices: ones for the real boxes, zeros for the padding.
  self.assertAllEqual(
      tf.cast(box_mask[num_detections:], tf.int32),
      tf.repeat(0, repeats=max_num_instances - num_detections))
  self.assertAllEqual(
      tf.cast(box_mask[:num_detections], tf.int32),
      tf.repeat(1, repeats=num_detections))
def _parse_train_data(self, data):
  """Generates images and labels that are usable for model training.

  The image is scaled by 1/255, fitted with
  `fit_preserve_aspect_ratio`, optionally flipped, translated and zoomed,
  resized to 416x416, optionally colour-augmented and clipped to [0, 1].
  Boxes are converted with `yxyx_to_xcycwh` and, together with the other
  per-box fields, clipped or padded to `self._max_num_instances`.

  Args:
    data: a dict of Tensors produced by the decoder. The keys read here are
      `image`, `groundtruth_boxes`, `groundtruth_classes`, `groundtruth_area`,
      `groundtruth_is_crowd` and `source_id`.

  Returns:
    images: the image tensor.
    labels: a dict of Tensors that contains labels, with keys `source_id`,
      `bbox`, `classes`, `area`, `is_crowd`, `best_anchors`, `width`,
      `height` and `num_detections`, plus `grid_form` when
      `self._fixed_size` is set.
  """

  shape = tf.shape(data['image'])
  image = data['image'] / 255
  boxes = data['groundtruth_boxes']
  # NOTE(review): `width` is taken from dimension 0 and `height` from
  # dimension 1 -- confirm this matches the decoder's image layout.
  width = shape[0]
  height = shape[1]

  image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
      image,
      boxes,
      width=width,
      height=height,
      target_dim=self._max_process_size)

  image_shape = tf.shape(image)[:2]

  if self._random_flip:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(
        image, boxes, seed=self._seed)

  # Default zoom target; replaced by a random value below when the input size
  # is not fixed and the coin flip succeeds.
  randscale = self._image_w // self._net_down_scale

  if not self._fixed_size:
    do_scale = tf.greater(
        tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5)
    if do_scale:
      # This scales the image to a random multiple of net_down_scale
      # between 320 to 608
      randscale = tf.random.uniform(
          [],
          minval=self._min_process_size // self._net_down_scale,
          maxval=self._max_process_size // self._net_down_scale,
          seed=self._seed,
          dtype=tf.int32) * self._net_down_scale

  if self._jitter_boxes != 0.0:
    # Jitter is applied in pixel coordinates. The magnitude passed is the
    # literal 0.025; `self._jitter_boxes` only switches the step on or off.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    boxes = box_ops.jitter_boxes(boxes, 0.025)
    boxes = box_ops.normalize_boxes(boxes, image_shape)

  # YOLO loss function uses x-center, y-center format
  boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

  if self._jitter_im != 0.0:
    image, boxes = yolo_preprocess_ops.random_translate(
        image, boxes, self._jitter_im, seed=self._seed)

  if self._aug_rand_zoom:
    image, boxes = yolo_preprocess_ops.resize_crop_filter(
        image,
        boxes,
        default_width=self._image_w,
        default_height=self._image_h,
        target_width=randscale,
        target_height=randscale)
  # The output resolution is the literal (416, 416), independent of
  # `self._image_w` / `self._image_h`.
  image = tf.image.resize(image, (416, 416), preserve_aspect_ratio=False)

  if self._aug_rand_brightness:
    image = tf.image.random_brightness(
        image=image, max_delta=.1)  # Brightness
  if self._aug_rand_saturation:
    image = tf.image.random_saturation(
        image=image, lower=0.75, upper=1.25)  # Saturation
  if self._aug_rand_hue:
    image = tf.image.random_hue(image=image, max_delta=.3)  # Hue
  image = tf.clip_by_value(image, 0.0, 1.0)

  # Find the best anchor for the ground truth labels to maximize the iou
  best_anchors = yolo_preprocess_ops.get_best_anchor(
      boxes, self._anchors, width=self._image_w, height=self._image_h)

  # Padding: classes are padded with -1, every other field with 0.
  boxes = preprocess_ops.clip_or_pad_to_fixed_size(
      boxes, self._max_num_instances, 0)
  classes = preprocess_ops.clip_or_pad_to_fixed_size(
      data['groundtruth_classes'], self._max_num_instances, -1)
  best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
      best_anchors, self._max_num_instances, 0)
  area = preprocess_ops.clip_or_pad_to_fixed_size(
      data['groundtruth_area'], self._max_num_instances, 0)
  is_crowd = preprocess_ops.clip_or_pad_to_fixed_size(
      tf.cast(data['groundtruth_is_crowd'], tf.int32),
      self._max_num_instances, 0)

  # `width` and `height` are the values read from the decoded image above,
  # not the size of the returned image.
  labels = {
      'source_id': data['source_id'],
      'bbox': tf.cast(boxes, self._dtype),
      'classes': tf.cast(classes, self._dtype),
      'area': tf.cast(area, self._dtype),
      'is_crowd': is_crowd,
      'best_anchors': tf.cast(best_anchors, self._dtype),
      'width': width,
      'height': height,
      'num_detections': tf.shape(data['groundtruth_classes'])[0],
  }

  if self._fixed_size:
    grid = self._build_grid(labels, self._image_w,
                            use_tie_breaker=self._use_tie_breaker)
    labels.update({'grid_form': grid})

  return image, labels
def _parse_train_data(self, data):
  """Parses data for training.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    image: image tensor that is preprocessed to have normalized value and
      dimension [output_size[0], output_size[1], 3]
    labels: a dictionary of tensors used for training. The following describes
      {key: value} pairs in the dictionary.
      image_info: a 2D `Tensor` that encodes the information of the image and
        the applied preprocessing. It is in the format of
        [[original_height, original_width], [scaled_height, scaled_width],
        ...]; this method reads row 2 as the image scale and row 3 as the
        offset.
      anchor_boxes: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, 4] representing anchor boxes at each level.
      rpn_score_targets: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, anchors_per_location]. The height_l and
        width_l represent the dimension of class logits at l-th level.
      rpn_box_targets: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensor with
        shape [height_l, width_l, anchors_per_location * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
      gt_boxes: Groundtruth bounding box annotations. The box is represented
         in [y1, x1, y2, x2] format. The coordinates are w.r.t the scaled
         image that is fed to the network. The tensor is padded with -1 to
         the fixed dimension [self._max_num_instances, 4].
      gt_classes: Groundtruth classes annotations. The tensor is padded
        with -1 to the fixed dimension [self._max_num_instances].
      gt_masks: groundtruth masks cropped by the bounding box and
        resized to a fixed size determined by mask_crop_size. Only present
        when `self._include_mask` is set.
  """
  classes = data['groundtruth_classes']
  boxes = data['groundtruth_boxes']
  if self._include_mask:
    masks = data['groundtruth_instance_masks']

  is_crowds = data['groundtruth_is_crowd']
  # Skips annotations with `is_crowd` = True.
  if self._skip_crowd_during_training:
    num_groundtruths = tf.shape(classes)[0]
    with tf.control_dependencies([num_groundtruths, is_crowds]):
      # When `is_crowds` is empty, keep every groundtruth index instead of
      # filtering.
      indices = tf.cond(
          tf.greater(tf.size(is_crowds), 0),
          lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
          lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
    classes = tf.gather(classes, indices)
    boxes = tf.gather(boxes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)

  # Gets original image and its size.
  image = data['image']
  image_shape = tf.shape(image)[0:2]

  # Normalizes image with mean and std pixel values.
  image = preprocess_ops.normalize_image(image)

  # Flips image randomly during training.
  if self._aug_rand_hflip:
    if self._include_mask:
      image, boxes, masks = preprocess_ops.random_horizontal_flip(
          image, boxes, masks)
    else:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(
          image, boxes)

  # Converts boxes from normalized coordinates to pixel coordinates.
  # Now the coordinates of boxes are w.r.t. the original image.
  boxes = box_ops.denormalize_boxes(boxes, image_shape)

  # Resizes and crops image.
  image, image_info = preprocess_ops.resize_and_crop_image(
      image,
      self._output_size,
      padded_size=preprocess_ops.compute_padded_size(
          self._output_size, 2**self._max_level),
      aug_scale_min=self._aug_scale_min,
      aug_scale_max=self._aug_scale_max)
  image_height, image_width, _ = image.get_shape().as_list()

  # Resizes and crops boxes.
  # Now the coordinates of boxes are w.r.t the scaled image.
  image_scale = image_info[2, :]
  offset = image_info[3, :]
  boxes = preprocess_ops.resize_and_crop_boxes(
      boxes, image_scale, image_info[1, :], offset)

  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  if self._include_mask:
    masks = tf.gather(masks, indices)
    # Transfer boxes to the original image space and do normalization.
    # This undoes the crop offset and the scaling applied above so the boxes
    # line up with the unresized masks.
    cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
    num_masks = tf.shape(masks)[0]
    # Each mask is cropped by its own box (box i reads mask i) and resized to
    # a square of side `self._mask_crop_size`.
    masks = tf.image.crop_and_resize(
        tf.expand_dims(masks, axis=-1),
        cropped_boxes,
        box_indices=tf.range(num_masks, dtype=tf.int32),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear')
    masks = tf.squeeze(masks, axis=-1)

  # Assigns anchor targets.
  # Note that after the target assignment, box targets are absolute pixel
  # offsets w.r.t. the scaled image.
  input_anchor = anchor.build_anchor_generator(
      min_level=self._min_level,
      max_level=self._max_level,
      num_scales=self._num_scales,
      aspect_ratios=self._aspect_ratios,
      anchor_size=self._anchor_size)
  anchor_boxes = input_anchor(image_size=(image_height, image_width))
  anchor_labeler = anchor.RpnAnchorLabeler(
      self._rpn_match_threshold,
      self._rpn_unmatched_threshold,
      self._rpn_batch_size_per_im,
      self._rpn_fg_fraction)
  rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
      anchor_boxes, boxes,
      tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

  # Casts input image to self._dtype
  image = tf.cast(image, dtype=self._dtype)

  # Packs labels for model_fn outputs.
  labels = {
      'anchor_boxes':
          anchor_boxes,
      'image_info':
          image_info,
      'rpn_score_targets':
          rpn_score_targets,
      'rpn_box_targets':
          rpn_box_targets,
      'gt_boxes':
          preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                   self._max_num_instances,
                                                   -1),
      'gt_classes':
          preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                   self._max_num_instances,
                                                   -1),
  }
  if self._include_mask:
    labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
        masks, self._max_num_instances, -1)

  return image, labels
def _generate_detections_per_image(
    boxes: tf.Tensor,
    scores: tf.Tensor,
    attributes: Optional[Mapping[str, tf.Tensor]] = None,
    pre_nms_top_k: int = 5000,
    pre_nms_score_threshold: float = 0.05,
    nms_iou_threshold: float = 0.5,
    max_num_detections: int = 100,
    soft_nms_sigma: Optional[float] = None):
  """Runs per-class NMS on one image and keeps the best detections overall.

  Each class is processed independently: the `pre_nms_top_k` highest scores
  are selected, NMS is applied, and the result is brought to exactly
  `max_num_detections` entries whose unused slots carry a score of -1. The
  per-class results are then concatenated and the `max_num_detections`
  highest-scoring entries are returned.

  Args:
    boxes: A `tf.Tensor` with shape `[N, num_classes, 4]` or `[N, 1, 4]` of
      box predictions, where N is the total number of anchors on all levels.
      When the second dimension is smaller than the number of score classes,
      its last slice is reused for the remaining classes.
    scores: A `tf.Tensor` with shape `[N, num_classes]` of raw class scores.
    attributes: If not None, a dict of `tf.Tensor`, each with shape
      `[N, num_classes, attribute_size]` or `[N, 1, attribute_size]`.
    pre_nms_top_k: An `int`, the number of top candidates per class kept
      before NMS.
    pre_nms_score_threshold: A `float` score threshold passed to NMS.
    nms_iou_threshold: A `float` IOU threshold passed to NMS.
    max_num_detections: The maximum number of boxes retained, both per class
      and over all classes.
    soft_nms_sigma: A `float` sigma for Soft NMS, handled by
      `tf.image.non_max_suppression_with_scores`. If None,
      `tf.image.non_max_suppression_padded` is called instead.

  Returns:
    nms_boxes: A tf.Tensor of shape `[max_num_detections, 4]` with the top
      detected boxes.
    nms_scores: A tf.Tensor of shape `[max_num_detections]` with the sorted
      scores of the detected boxes; unused slots hold -1.
    nms_classes: An `int` tf.Tensor of shape `[max_num_detections]` with the
      class index of each detected box.
    valid_detections: An `int` scalar tf.Tensor counting the returned scores
      that are greater than -1.
    nms_attributes: A dict mapping each attribute name to a tf.Tensor of
      shape `[max_num_detections, attribute_size]`; empty when `attributes`
      is None or empty.
  """
  per_class_boxes = []
  per_class_scores = []
  per_class_ids = []
  box_class_count = boxes.get_shape().as_list()[1]
  score_class_count = scores.get_shape().as_list()[1]
  per_class_attributes = (
      {name: [] for name in attributes.keys()} if attributes else {})

  for class_id in range(score_class_count):
    class_boxes = boxes[:, min(box_class_count - 1, class_id)]
    class_scores = scores[:, class_id]

    # Obtains pre_nms_top_k before running NMS.
    class_scores, top_indices = tf.nn.top_k(
        class_scores,
        k=tf.minimum(tf.shape(class_scores)[-1], pre_nms_top_k))
    class_boxes = tf.gather(class_boxes, top_indices)

    if soft_nms_sigma is None:
      selected, num_valid = tf.image.non_max_suppression_padded(
          tf.cast(class_boxes, tf.float32),
          tf.cast(class_scores, tf.float32),
          max_num_detections,
          iou_threshold=nms_iou_threshold,
          score_threshold=pre_nms_score_threshold,
          pad_to_max_output_size=True,
          name='nms_detections_' + str(class_id))
      kept_boxes = tf.gather(class_boxes, selected)
      kept_scores = tf.gather(class_scores, selected)
      # Sets scores of invalid boxes to -1.
      is_valid = tf.less(tf.range(max_num_detections), [num_valid])
      kept_scores = tf.where(
          is_valid, kept_scores, -tf.ones_like(kept_scores))
    else:
      selected, kept_scores = tf.image.non_max_suppression_with_scores(
          tf.cast(class_boxes, tf.float32),
          tf.cast(class_scores, tf.float32),
          max_num_detections,
          iou_threshold=nms_iou_threshold,
          score_threshold=pre_nms_score_threshold,
          soft_nms_sigma=soft_nms_sigma,
          name='nms_detections_' + str(class_id))
      kept_boxes = tf.gather(class_boxes, selected)
      # This NMS variant returns a variable number of results, so both
      # outputs are brought to the fixed length here.
      kept_boxes = preprocess_ops.clip_or_pad_to_fixed_size(
          kept_boxes, max_num_detections, 0.0)
      kept_scores = preprocess_ops.clip_or_pad_to_fixed_size(
          kept_scores, max_num_detections, -1.0)

    per_class_boxes.append(kept_boxes)
    per_class_scores.append(kept_scores)
    per_class_ids.append(tf.fill([max_num_detections], class_id))

    if attributes:
      for name, values in attributes.items():
        attr_class_count = values.get_shape().as_list()[1]
        class_values = values[:, min(attr_class_count - 1, class_id)]
        # Apply the same two selections as for the boxes: top-k, then NMS.
        class_values = tf.gather(class_values, top_indices)
        kept_values = tf.gather(class_values, selected)
        kept_values = preprocess_ops.clip_or_pad_to_fixed_size(
            kept_values, max_num_detections, 0.0)
        per_class_attributes[name].append(kept_values)

  # Concats results from all classes and sort them.
  all_boxes = tf.concat(per_class_boxes, axis=0)
  all_scores = tf.concat(per_class_scores, axis=0)
  all_ids = tf.concat(per_class_ids, axis=0)
  final_scores, order = tf.nn.top_k(
      all_scores, k=max_num_detections, sorted=True)
  final_boxes = tf.gather(all_boxes, order)
  final_ids = tf.gather(all_ids, order)
  valid_detections = tf.reduce_sum(
      tf.cast(tf.greater(final_scores, -1), tf.int32))
  if attributes:
    for name in attributes.keys():
      merged = tf.concat(per_class_attributes[name], axis=0)
      per_class_attributes[name] = tf.gather(merged, order)

  return (final_boxes, final_scores, final_ids, valid_detections,
          per_class_attributes)
def _parse_train_data(self, data):
  """Generates images and labels that are usable for model training.

  The image is scaled by 1/255, randomly blurred, colour-augmented in HSV
  space, perturbed with noise, optionally flipped, jittered and zoomed, and
  finally passed through `resize_crop_filter` with
  `self._image_w` x `self._image_h` as the target size. Classes and boxes are
  clipped or padded to `self._max_num_instances`.

  Args:
    data: a dict of Tensors produced by the decoder. The keys read here are
      `image`, `groundtruth_boxes`, `groundtruth_classes` and `source_id`.

  Returns:
    images: the image tensor.
    labels: a dict of Tensors that contains labels, with keys `source_id`,
      `bbox`, `classes`, `width`, `height` and `num_detections`. When
      `self._fixed_size` is set and `self._cutmix` is not, `best_anchors` and
      `grid_form` are added and `bbox` is converted back with
      `xcycwh_to_yxyx`.
  """

  image = data['image'] / 255  # / 255
  boxes = data['groundtruth_boxes']
  classes = data['groundtruth_classes']

  # A single uniform draw selects the blur strength: above 0.9, 0.7 and 0.4
  # progressively weaker Gaussian filters are applied; otherwise no blur.
  do_blur = tf.random.uniform([],
                              minval=0,
                              maxval=1,
                              seed=self._seed,
                              dtype=tf.float32)
  if do_blur > 0.9:
    image = tfa.image.gaussian_filter2d(image, filter_shape=7, sigma=15)
  elif do_blur > 0.7:
    image = tfa.image.gaussian_filter2d(image, filter_shape=5, sigma=6)
  elif do_blur > 0.4:
    image = tfa.image.gaussian_filter2d(image, filter_shape=5, sigma=3)

  # Colour augmentation is applied per HSV channel.
  image = tf.image.rgb_to_hsv(image)
  i_h, i_s, i_v = tf.split(image, 3, axis=-1)
  if self._aug_rand_hue:
    delta = preprocessing_ops.rand_uniform_strong(
        -0.1, 0.1
    )  # tf.random.uniform([], minval= -0.1,maxval=0.1, seed=self._seed, dtype=tf.float32)
    i_h = i_h + delta  # Hue
    i_h = tf.clip_by_value(i_h, 0.0, 1.0)
  if self._aug_rand_saturation:
    delta = preprocessing_ops.rand_scale(
        0.75
    )  # tf.random.uniform([], minval= 0.5,maxval=1.1, seed=self._seed, dtype=tf.float32)
    i_s = i_s * delta
  if self._aug_rand_brightness:
    delta = preprocessing_ops.rand_scale(
        0.75
    )  # tf.random.uniform([], minval= -0.15,maxval=0.15, seed=self._seed, dtype=tf.float32)
    i_v = i_v * delta
  image = tf.concat([i_h, i_s, i_v], axis=-1)
  image = tf.image.hsv_to_rgb(image)

  # Additive noise with a randomly drawn standard deviation. The noise is
  # limited to the range [0, 0.5] before being added, so it never darkens.
  stddev = tf.random.uniform([],
                             minval=0,
                             maxval=40 / 255,
                             seed=self._seed,
                             dtype=tf.float32)
  noise = tf.random.normal(
      shape=tf.shape(image), mean=0.0, stddev=stddev, seed=self._seed)
  noise = tf.math.minimum(noise, 0.5)
  noise = tf.math.maximum(noise, 0)
  image += noise
  image = tf.clip_by_value(image, 0.0, 1.0)

  image_shape = tf.shape(image)[:2]

  if self._random_flip:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(
        image, boxes, seed=self._seed)

  if self._jitter_boxes != 0.0:
    # Jitter is applied in pixel coordinates. The magnitude passed is the
    # literal 0.025; `self._jitter_boxes` only switches the step on or off.
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    boxes = box_ops.jitter_boxes(boxes, 0.025)
    boxes = box_ops.normalize_boxes(boxes, image_shape)

  if self._jitter_im != 0.0:
    image, boxes, classes = preprocessing_ops.random_jitter(
        image, boxes, classes, self._jitter_im, seed=self._seed)
    # image, boxes, classes = preprocessing_ops.random_translate(image, boxes, classes, 0.2, seed=self._seed)

  if self._aug_rand_zoom:
    # The zoom step receives `self._jitter_im` as its last argument.
    image, boxes, classes = preprocessing_ops.random_zoom_crop(
        image, boxes, classes, self._jitter_im)

  shape = tf.shape(image)
  width = shape[1]
  height = shape[0]
  randscale = self._image_w // self._net_down_scale

  # NOTE(review): random rescaling is drawn when `self._fixed_size` is set --
  # confirm this condition is not meant to be negated.
  if self._fixed_size:
    do_scale = tf.greater(
        tf.random.uniform([], minval=0, maxval=1, seed=self._seed),
        1 - self._pct_rand)
    if do_scale:
      randscale = tf.random.uniform([],
                                    minval=10,
                                    maxval=15,
                                    seed=self._seed,
                                    dtype=tf.int32)

  if self._letter_box:
    image, boxes = preprocessing_ops.fit_preserve_aspect_ratio(
        image,
        boxes,
        width=width,
        height=height,
        target_dim=randscale * self._net_down_scale)
    # These two values are overwritten from `tf.shape(image)` just below.
    width = randscale * self._net_down_scale
    height = randscale * self._net_down_scale

  shape = tf.shape(image)
  width = shape[1]
  height = shape[0]
  image, boxes, classes = preprocessing_ops.resize_crop_filter(
      image,
      boxes,
      classes,
      default_width=width,  # randscale * self._net_down_scale,
      default_height=height,  # randscale * self._net_down_scale,
      target_width=self._image_w,
      target_height=self._image_h,
      randomize=False)

  boxes = box_utils.yxyx_to_xcycwh(boxes)
  image = tf.clip_by_value(image, 0.0, 1.0)

  # Number of boxes left after augmentation, measured before padding.
  num_dets = tf.shape(classes)[0]

  # padding
  classes = preprocess_ops.clip_or_pad_to_fixed_size(classes,
                                                     self._max_num_instances,
                                                     -1)

  if self._fixed_size and not self._cutmix:
    best_anchors = preprocessing_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)
    best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
        best_anchors, self._max_num_instances, 0)
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_instances,
                                                     0)
    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': num_dets
    }
    # The grid is built while `bbox` is still in xcycwh form; `bbox` is
    # converted back to yxyx only afterwards.
    grid = self._build_grid(
        labels, self._image_w, use_tie_breaker=self._use_tie_breaker)
    labels.update({'grid_form': grid})
    labels['bbox'] = box_utils.xcycwh_to_yxyx(labels['bbox'])
  else:
    # In this branch `bbox` stays in the xcycwh form produced above.
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(boxes,
                                                     self._max_num_instances,
                                                     0)
    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'width': width,
        'height': height,
        'num_detections': num_dets
    }
  return image, labels
def preprocess(self, inputs):
  """Preprocess COCO for DETR.

  Args:
    inputs: a dictionary with an `image` entry and an `objects` entry holding
      `bbox`, `label` and `is_crowd`. When not training, `image/id` is read
      as well.

  Returns:
    image: the normalized, resized image, padded at the bottom/right to
      `self._params.output_size`.
    labels: a dictionary with `classes` (input labels shifted by +1) and
      `boxes` (converted with `yxyx_to_cycxhw`), both clipped or padded to
      `self._params.max_num_boxes`. When not training it also contains `id`,
      `image_info`, `is_crowd` and `gt_boxes`.
  """
  image = inputs['image']
  boxes = inputs['objects']['bbox']
  classes = inputs['objects']['label'] + 1
  is_crowd = inputs['objects']['is_crowd']

  image = preprocess_ops.normalize_image(image)
  if self._params.is_training:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    do_crop = tf.greater(tf.random.uniform([]), 0.5)
    if do_crop:
      # Rescale
      boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
      crop_scales = [400.0, 500.0, 600.0]
      # Uniform logits sample an index uniformly; the logits width follows
      # the list so the index is always in range.
      index = tf.random.categorical(tf.zeros([1, len(crop_scales)]), 1)[0]
      scales = tf.gather(crop_scales, index, axis=0)
      short_side = scales[0]
      image, image_info = preprocess_ops.resize_image(image, short_side)
      boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                                   image_info[2, :],
                                                   image_info[1, :],
                                                   image_info[3, :])
      boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

      # Do cropping
      shape = tf.cast(image_info[1], dtype=tf.int32)
      h = tf.random.uniform(
          [], 384, tf.math.minimum(shape[0], 600), dtype=tf.int32)
      w = tf.random.uniform(
          [], 384, tf.math.minimum(shape[1], 600), dtype=tf.int32)
      i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
      j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
      image = tf.image.crop_to_bounding_box(image, i, j, h, w)
      # Re-express the normalized boxes relative to the crop window: scale to
      # pixels, shift by the crop origin, renormalize by the crop size, clip.
      boxes = tf.clip_by_value(
          (boxes[..., :] * tf.cast(
              tf.stack([shape[0], shape[1], shape[0], shape[1]]),
              dtype=tf.float32) -
           tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
          tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)
    scales = tf.constant(
        self._params.resize_scales,
        dtype=tf.float32)
    # Sample over however many scales are configured instead of assuming a
    # fixed count, so a shorter or longer `resize_scales` is handled.
    num_resize_scales = len(self._params.resize_scales)
    index = tf.random.categorical(tf.zeros([1, num_resize_scales]), 1)[0]
    scales = tf.gather(scales, index, axis=0)
  else:
    # Evaluation always uses the last configured scale.
    scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

  image_shape = tf.shape(image)[:2]
  boxes = box_ops.denormalize_boxes(boxes, image_shape)
  # Pixel-coordinate boxes before the final resize; only exported when not
  # training.
  gt_boxes = boxes
  short_side = scales[0]
  image, image_info = preprocess_ops.resize_image(
      image,
      short_side,
      max(self._params.output_size))
  boxes = preprocess_ops.resize_and_crop_boxes(boxes,
                                               image_info[2, :],
                                               image_info[1, :],
                                               image_info[3, :])
  boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

  # Filters out ground truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  is_crowd = tf.gather(is_crowd, indices)
  boxes = box_ops.yxyx_to_cycxhw(boxes)

  image = tf.image.pad_to_bounding_box(
      image, 0, 0, self._params.output_size[0], self._params.output_size[1])
  labels = {
      'classes':
          preprocess_ops.clip_or_pad_to_fixed_size(
              classes, self._params.max_num_boxes),
      'boxes':
          preprocess_ops.clip_or_pad_to_fixed_size(
              boxes, self._params.max_num_boxes)
  }
  if not self._params.is_training:
    labels.update({
        'id':
            inputs['image/id'],
        'image_info':
            image_info,
        'is_crowd':
            preprocess_ops.clip_or_pad_to_fixed_size(
                is_crowd, self._params.max_num_boxes),
        'gt_boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                gt_boxes, self._params.max_num_boxes),
    })

  return image, labels