def _parse_train_data(self, data): """Parses data for training and evaluation. !!! All augmentations and transformations are on bboxes with format (ymin, xmin, ymax, xmax). Required to do the appropriate transformations. !!! Images are supposed to be in RGB format """ image, boxes = data['image'], data['boxes'] # Execute RandAugment first as some ops require uint8 colors if self._augmenter is not None: image = self._augmenter.distort(image) if self._aug_rand_hflip: image, boxes = yolo_ops.random_horizontal_flip(image, boxes) image, image_info = preprocess_ops.resize_and_crop_image( image, self._input_size[:2], self._input_size[:2], aug_scale_min=self._aug_scale_min, aug_scale_max=self._aug_scale_max, preserve_aspect_ratio=self._preserve_aspect_ratio) boxes = preprocess_ops.resize_and_crop_boxes(boxes, image_info[2, :], image_info[1, :], image_info[3, :]) if self._aug_jitter_im != 0.0: image, boxes = yolo_ops.random_translate(image, boxes, self._aug_jitter_im) if self._aug_jitter_boxes != 0.0: boxes = box_ops.jitter_boxes(boxes, self._aug_jitter_boxes) image = preprocess_ops.normalize_image(image, offset=MEAN_RGB, scale=STDDEV_RGB) image = tf.cast(image, dtype=self._dtype) boxes = tf.clip_by_value(boxes, 0, self._input_size[0] - 1) bbox_labels = yolo_box_ops.yxyx_to_xcycwh(boxes) bbox_labels = tf.concat([bbox_labels, data['classes'][:, tf.newaxis]], axis=-1) labels, bbox_labels = yolo_ops.preprocess_true_boxes( bboxes=bbox_labels, train_output_sizes=self.train_output_sizes, anchor_per_scale=self.anchor_per_scale, num_classes=self.num_classes, max_bbox_per_scale=self.max_bbox_per_scale, strides=self.strides, anchors=self.anchors) # TODO: Figure out why we need to fix the num BBOX if not there will be an error # https://github.com/whizzmobility/models/pull/61 # pad / limit to MAX_DISPLAY_BBOX boxes for constant size raw_bboxes = boxes num_bboxes = tf.shape(raw_bboxes)[0] if num_bboxes > MAX_DISPLAY_BBOX: raw_bboxes = raw_bboxes[:, :MAX_DISPLAY_BBOX] else: paddings = tf.stack([0, MAX_DISPLAY_BBOX - num_bboxes], axis=-1) paddings = tf.stack([paddings, [0, 0]], axis=0) raw_bboxes = tf.pad(raw_bboxes, paddings) targets = { 'labels': labels, 'bboxes': bbox_labels, 'raw_bboxes': raw_bboxes } return image, targets
def jitter_fn(input_boxes, arg_noise_scale):
  return box_ops.jitter_boxes(input_boxes, arg_noise_scale)
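
# jitter_fn wraps box_ops.jitter_boxes, which perturbs box coordinates with
# small random noise so the model does not overfit to pixel-perfect
# annotations. A minimal sketch of one common formulation (noise proportional
# to box size); the real box_ops implementation may differ in detail.
import tensorflow as tf


def _jitter_boxes_sketch(boxes, noise_scale=0.025):
  """Adds zero-mean Gaussian noise, scaled by box size, to [N, 4] boxes
  in (ymin, xmin, ymax, xmax) format."""
  ymin, xmin, ymax, xmax = tf.split(boxes, 4, axis=-1)
  height = ymax - ymin
  width = xmax - xmin
  scale = tf.concat([height, width, height, width], axis=-1)
  noise = tf.random.normal(tf.shape(boxes), stddev=noise_scale)
  return boxes + noise * scale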
def _parse_train_data(self, data): """Generates images and labels that are usable for model training. Args: data: a dict of Tensors produced by the decoder. Returns: images: the image tensor. labels: a dict of Tensors that contains labels. """ shape = tf.shape(data['image']) image = data['image'] / 255 boxes = data['groundtruth_boxes'] width = shape[0] height = shape[1] image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio( image, boxes, width=width, height=height, target_dim=self._max_process_size) image_shape = tf.shape(image)[:2] if self._random_flip: image, boxes, _ = preprocess_ops.random_horizontal_flip( image, boxes, seed=self._seed) randscale = self._image_w // self._net_down_scale if not self._fixed_size: do_scale = tf.greater( tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5) if do_scale: # This scales the image to a random multiple of net_down_scale # between 320 to 608 randscale = tf.random.uniform( [], minval=self._min_process_size // self._net_down_scale, maxval=self._max_process_size // self._net_down_scale, seed=self._seed, dtype=tf.int32) * self._net_down_scale if self._jitter_boxes != 0.0: boxes = box_ops.denormalize_boxes(boxes, image_shape) boxes = box_ops.jitter_boxes(boxes, 0.025) boxes = box_ops.normalize_boxes(boxes, image_shape) # YOLO loss function uses x-center, y-center format boxes = yolo_box_ops.yxyx_to_xcycwh(boxes) if self._jitter_im != 0.0: image, boxes = yolo_preprocess_ops.random_translate( image, boxes, self._jitter_im, seed=self._seed) if self._aug_rand_zoom: image, boxes = yolo_preprocess_ops.resize_crop_filter( image, boxes, default_width=self._image_w, default_height=self._image_h, target_width=randscale, target_height=randscale) image = tf.image.resize(image, (416, 416), preserve_aspect_ratio=False) if self._aug_rand_brightness: image = tf.image.random_brightness(image=image, max_delta=.1) # Brightness if self._aug_rand_saturation: image = tf.image.random_saturation(image=image, lower=0.75, upper=1.25) # Saturation if self._aug_rand_hue: image = tf.image.random_hue(image=image, max_delta=.3) # Hue image = tf.clip_by_value(image, 0.0, 1.0) # Find the best anchor for the ground truth labels to maximize the iou best_anchors = yolo_preprocess_ops.get_best_anchor( boxes, self._anchors, width=self._image_w, height=self._image_h) # Padding boxes = preprocess_ops.clip_or_pad_to_fixed_size( boxes, self._max_num_instances, 0) classes = preprocess_ops.clip_or_pad_to_fixed_size( data['groundtruth_classes'], self._max_num_instances, -1) best_anchors = preprocess_ops.clip_or_pad_to_fixed_size( best_anchors, self._max_num_instances, 0) area = preprocess_ops.clip_or_pad_to_fixed_size( data['groundtruth_area'], self._max_num_instances, 0) is_crowd = preprocess_ops.clip_or_pad_to_fixed_size( tf.cast(data['groundtruth_is_crowd'], tf.int32), self._max_num_instances, 0) labels = { 'source_id': data['source_id'], 'bbox': tf.cast(boxes, self._dtype), 'classes': tf.cast(classes, self._dtype), 'area': tf.cast(area, self._dtype), 'is_crowd': is_crowd, 'best_anchors': tf.cast(best_anchors, self._dtype), 'width': width, 'height': height, 'num_detections': tf.shape(data['groundtruth_classes'])[0], } if self._fixed_size: grid = self._build_grid(labels, self._image_w, use_tie_breaker=self._use_tie_breaker) labels.update({'grid_form': grid}) return image, labels
def _parse_train_data(self, data): """Generates images and labels that are usable for model training. Args: data: a dict of Tensors produced by the decoder. Returns: images: the image tensor. labels: a dict of Tensors that contains labels. """ image = data['image'] / 255 # / 255 boxes = data['groundtruth_boxes'] classes = data['groundtruth_classes'] do_blur = tf.random.uniform([], minval=0, maxval=1, seed=self._seed, dtype=tf.float32) if do_blur > 0.9: image = tfa.image.gaussian_filter2d(image, filter_shape=7, sigma=15) elif do_blur > 0.7: image = tfa.image.gaussian_filter2d(image, filter_shape=5, sigma=6) elif do_blur > 0.4: image = tfa.image.gaussian_filter2d(image, filter_shape=5, sigma=3) image = tf.image.rgb_to_hsv(image) i_h, i_s, i_v = tf.split(image, 3, axis=-1) if self._aug_rand_hue: delta = preprocessing_ops.rand_uniform_strong( -0.1, 0.1 ) # tf.random.uniform([], minval= -0.1,maxval=0.1, seed=self._seed, dtype=tf.float32) i_h = i_h + delta # Hue i_h = tf.clip_by_value(i_h, 0.0, 1.0) if self._aug_rand_saturation: delta = preprocessing_ops.rand_scale( 0.75 ) # tf.random.uniform([], minval= 0.5,maxval=1.1, seed=self._seed, dtype=tf.float32) i_s = i_s * delta if self._aug_rand_brightness: delta = preprocessing_ops.rand_scale( 0.75 ) # tf.random.uniform([], minval= -0.15,maxval=0.15, seed=self._seed, dtype=tf.float32) i_v = i_v * delta image = tf.concat([i_h, i_s, i_v], axis=-1) image = tf.image.hsv_to_rgb(image) stddev = tf.random.uniform([], minval=0, maxval=40 / 255, seed=self._seed, dtype=tf.float32) noise = tf.random.normal( shape=tf.shape(image), mean=0.0, stddev=stddev, seed=self._seed) noise = tf.math.minimum(noise, 0.5) noise = tf.math.maximum(noise, 0) image += noise image = tf.clip_by_value(image, 0.0, 1.0) image_shape = tf.shape(image)[:2] if self._random_flip: image, boxes, _ = preprocess_ops.random_horizontal_flip( image, boxes, seed=self._seed) if self._jitter_boxes != 0.0: boxes = box_ops.denormalize_boxes(boxes, image_shape) boxes = box_ops.jitter_boxes(boxes, 0.025) boxes = box_ops.normalize_boxes(boxes, image_shape) if self._jitter_im != 0.0: image, boxes, classes = preprocessing_ops.random_jitter( image, boxes, classes, self._jitter_im, seed=self._seed) # image, boxes, classes = preprocessing_ops.random_translate(image, boxes, classes, 0.2, seed=self._seed) if self._aug_rand_zoom: image, boxes, classes = preprocessing_ops.random_zoom_crop( image, boxes, classes, self._jitter_im) shape = tf.shape(image) width = shape[1] height = shape[0] randscale = self._image_w // self._net_down_scale if self._fixed_size: do_scale = tf.greater( tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 1 - self._pct_rand) if do_scale: randscale = tf.random.uniform([], minval=10, maxval=15, seed=self._seed, dtype=tf.int32) if self._letter_box: image, boxes = preprocessing_ops.fit_preserve_aspect_ratio( image, boxes, width=width, height=height, target_dim=randscale * self._net_down_scale) width = randscale * self._net_down_scale height = randscale * self._net_down_scale shape = tf.shape(image) width = shape[1] height = shape[0] image, boxes, classes = preprocessing_ops.resize_crop_filter( image, boxes, classes, default_width=width, # randscale * self._net_down_scale, default_height=height, # randscale * self._net_down_scale, target_width=self._image_w, target_height=self._image_h, randomize=False) boxes = box_utils.yxyx_to_xcycwh(boxes) image = tf.clip_by_value(image, 0.0, 1.0) num_dets = tf.shape(classes)[0] # padding classes = preprocess_ops.clip_or_pad_to_fixed_size(classes, 
self._max_num_instances, -1) if self._fixed_size and not self._cutmix: best_anchors = preprocessing_ops.get_best_anchor( boxes, self._anchors, width=self._image_w, height=self._image_h) best_anchors = preprocess_ops.clip_or_pad_to_fixed_size( best_anchors, self._max_num_instances, 0) boxes = preprocess_ops.clip_or_pad_to_fixed_size(boxes, self._max_num_instances, 0) labels = { 'source_id': data['source_id'], 'bbox': tf.cast(boxes, self._dtype), 'classes': tf.cast(classes, self._dtype), 'best_anchors': tf.cast(best_anchors, self._dtype), 'width': width, 'height': height, 'num_detections': num_dets } grid = self._build_grid( labels, self._image_w, use_tie_breaker=self._use_tie_breaker) labels.update({'grid_form': grid}) labels['bbox'] = box_utils.xcycwh_to_yxyx(labels['bbox']) else: boxes = preprocess_ops.clip_or_pad_to_fixed_size(boxes, self._max_num_instances, 0) labels = { 'source_id': data['source_id'], 'bbox': tf.cast(boxes, self._dtype), 'classes': tf.cast(classes, self._dtype), 'width': width, 'height': height, 'num_detections': num_dets } return image, labels
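
# All of the parsers above are intended to be mapped over a tf.data pipeline
# so that augmentation runs in parallel with training. A minimal usage
# sketch, assuming a `parser` instance exposing _parse_train_data and a
# decoded `dataset` of feature dicts (names here are hypothetical):
import tensorflow as tf


def build_train_dataset(dataset, parser, batch_size):
  """Maps the parser over the decoded dataset and batches the result."""
  dataset = dataset.map(
      parser._parse_train_data, num_parallel_calls=tf.data.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=True)
  return dataset.prefetch(tf.data.AUTOTUNE)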