def serve(self, images: tf.Tensor):
  """Casts images to float and runs inference.

  Args:
    images: uint8 Tensor of shape [batch_size, None, None, 3].

  Returns:
    A dictionary holding detection outputs.
  """
  images, anchor_boxes, image_info = self.preprocess(images)
  input_image_shape = image_info[:, 1, :]

  # To work around a keras.Model limitation when saving models whose layers
  # take multiple inputs, we use `model.call` here to trigger the forward
  # pass. Note that this disables some Keras magic that happens in `__call__`.
  detections = self.model.call(
      images=images,
      image_shape=input_image_shape,
      anchor_boxes=anchor_boxes,
      training=False)

  if self.params.task.model.detection_generator.apply_nms:
    # For the RetinaNet model, apply export_config.
    # TODO(huizhongc): Add export_config to fasterrcnn and maskrcnn as needed.
    if isinstance(self.params.task.model, configs.retinanet.RetinaNet):
      export_config = self.params.task.export_config
      # Normalize detection box coordinates to [0, 1].
      if export_config.output_normalized_coordinates:
        detection_boxes = (
            detections['detection_boxes'] /
            tf.tile(image_info[:, 2:3, :], [1, 1, 2]))
        detections['detection_boxes'] = box_ops.normalize_boxes(
            detection_boxes, image_info[:, 0:1, :])

      # Cast num_detections and detection_classes to float. This allows the
      # model inference to work on chain (go/chain) as chain requires
      # floating point outputs.
      if export_config.cast_num_detections_to_float:
        detections['num_detections'] = tf.cast(
            detections['num_detections'], dtype=tf.float32)
      if export_config.cast_detection_classes_to_float:
        detections['detection_classes'] = tf.cast(
            detections['detection_classes'], dtype=tf.float32)

    final_outputs = {
        'detection_boxes': detections['detection_boxes'],
        'detection_scores': detections['detection_scores'],
        'detection_classes': detections['detection_classes'],
        'num_detections': detections['num_detections']
    }
  else:
    final_outputs = {
        'decoded_boxes': detections['decoded_boxes'],
        'decoded_box_scores': detections['decoded_box_scores']
    }

  if 'detection_masks' in detections:
    final_outputs['detection_masks'] = detections['detection_masks']
  final_outputs.update({'image_info': image_info})
  return final_outputs
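# Usage sketch for the exported `serve` signature above. The saved-model path,
# input shape, and signature input name are assumptions for illustration; the
# output keys match `final_outputs`.
import tensorflow as tf

imported = tf.saved_model.load('/tmp/exported_detector')  # hypothetical path
serving_fn = imported.signatures['serving_default']
dummy_images = tf.zeros([1, 640, 640, 3], dtype=tf.uint8)
# The keyword ('images' here) depends on how the module was exported.
outputs = serving_fn(images=dummy_images)
# outputs['detection_boxes'], outputs['detection_scores'],
# outputs['detection_classes'], outputs['num_detections'],
# outputs['image_info']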
def _mosaic_crop_image(self, image, boxes, classes, is_crowd, area):
  """Processes a patched image in preparation for final output."""
  if self._mosaic_crop_mode != 'crop':
    shape = tf.cast(preprocessing_ops.get_image_shape(image), tf.float32)
    center = shape * self._mosaic_center
    # Shift the center of the image by applying a translation to the whole
    # image.
    ch = tf.math.round(
        preprocessing_ops.random_uniform_strong(
            -center[0], center[0], seed=self._seed))
    cw = tf.math.round(
        preprocessing_ops.random_uniform_strong(
            -center[1], center[1], seed=self._seed))

    # Clip the boxes to those within the image.
    image = tfa.image.translate(image, [cw, ch], fill_value=self._pad_value)
    boxes = box_ops.denormalize_boxes(boxes, shape[:2])
    boxes = boxes + tf.cast([ch, cw, ch, cw], boxes.dtype)
    boxes = box_ops.clip_boxes(boxes, shape[:2])
    inds = box_ops.get_non_empty_box_indices(boxes)
    boxes = box_ops.normalize_boxes(boxes, shape[:2])
    boxes, classes, is_crowd, area = self._select_ind(
        inds, boxes, classes,  # pylint:disable=unbalanced-tuple-unpacking
        is_crowd, area)

  # Warp and scale the fully stitched sample.
  image, _, affine = preprocessing_ops.affine_warp_image(
      image, [self._output_size[0], self._output_size[1]],
      scale_min=self._aug_scale_min,
      scale_max=self._aug_scale_max,
      translate=self._aug_rand_translate,
      degrees=self._aug_rand_angle,
      perspective=self._aug_rand_perspective,
      random_pad=self._random_pad,
      seed=self._seed)
  height, width = self._output_size[0], self._output_size[1]
  image = tf.image.resize(image, (height, width))

  # Clip and clean the boxes.
  boxes, inds = preprocessing_ops.transform_and_clip_boxes(
      boxes,
      None,
      affine=affine,
      area_thresh=self._area_thresh,
      seed=self._seed)
  classes, is_crowd, area = self._select_ind(inds, classes, is_crowd, area)  # pylint:disable=unbalanced-tuple-unpacking
  return image, boxes, classes, is_crowd, area, area
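# A minimal sketch of the box bookkeeping above (denormalize -> translate ->
# clip -> normalize), assuming the official.vision `box_ops` helpers this file
# uses. Shapes and shift values are illustrative.
import tensorflow as tf
from official.vision.ops import box_ops

shape = tf.constant([100.0, 200.0])          # [height, width]
boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]])  # normalized [y1, x1, y2, x2]
boxes = box_ops.denormalize_boxes(boxes, shape)  # to pixel coordinates
boxes += tf.constant([20.0, 40.0, 20.0, 40.0])   # shift by ch=20, cw=40
boxes = box_ops.clip_boxes(boxes, shape)         # clip to the image bounds
boxes = box_ops.normalize_boxes(boxes, shape)    # back to [0, 1]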
def scale_boxes(self, patch, ishape, boxes, classes, xs, ys):
  """Scale and translate the boxes for each image prior to patching."""
  xs = tf.cast(xs, boxes.dtype)
  ys = tf.cast(ys, boxes.dtype)
  pshape = tf.cast(tf.shape(patch), boxes.dtype)
  ishape = tf.cast(ishape, boxes.dtype)
  translate = tf.cast((ishape - pshape), boxes.dtype)

  boxes = box_ops.denormalize_boxes(boxes, pshape[:2])
  boxes = boxes + tf.cast([
      translate[0] * ys, translate[1] * xs, translate[0] * ys,
      translate[1] * xs
  ], boxes.dtype)
  boxes = box_ops.normalize_boxes(boxes, ishape[:2])
  return boxes, classes
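# Worked example (illustrative numbers): for a 320x320 patch placed in the
# bottom-right quadrant (xs=1, ys=1) of a 640x640 canvas, translate =
# ishape - pshape = (320, 320, ...), so each box is shifted by 320 px in both
# y and x before being re-normalized against the 640x640 canvas. With
# xs=0, ys=0 (top-left patch), the boxes are left in place.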
def serve(self, images):
  """Casts images to float and runs inference.

  Args:
    images: uint8 Tensor of shape [batch_size, None, None, 3].

  Returns:
    A dictionary of detection boxes, classes, and scores.
  """
  with tf.device('cpu:0'):
    images = tf.cast(images, dtype=tf.float32)
    images = tf.nest.map_structure(
        tf.identity,
        tf.map_fn(
            self._build_inputs,
            elems=images,
            fn_output_signature=tf.TensorSpec(
                shape=self._input_image_size + [3], dtype=tf.float32),
            parallel_iterations=32))

  outputs = self.inference_step(images)  # tf.keras.Model's __call__ method.
  num_classes = outputs['predictions']['0'].shape[-1] - 5
  bbox_tensors, _, prob_tensors = yolo_ops.concat_tensor_dict(
      tensor_dict=outputs['predictions'], num_classes=num_classes)

  boxes = tf.concat(bbox_tensors, axis=1)
  boxes = tf.squeeze(yolo_box_ops.xcycwh_to_yxyx(boxes))
  # Concatenate the per-level probabilities once, then derive both scores
  # and classes from the same tensor.
  probs = tf.concat(prob_tensors, axis=1)
  scores = tf.squeeze(tf.math.reduce_max(probs, axis=-1))
  classes = tf.argmax(probs, axis=-1)

  indices = tf.image.non_max_suppression(
      boxes=boxes,
      scores=scores,
      max_output_size=20,
      iou_threshold=0.5,
      score_threshold=0.25)
  boxes = tf.expand_dims(tf.gather(boxes, indices), axis=0)
  boxes = box_ops.normalize_boxes(boxes, self._input_image_size)
  scores = tf.expand_dims(tf.gather(scores, indices), axis=0)
  classes = tf.gather(classes, indices, axis=1)
  return {'boxes': boxes, 'classes': classes, 'scores': scores}
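# Math sketch of the xcycwh -> yxyx conversion relied on above. The
# `yolo_box_ops` helper is assumed to implement this; shown here with plain
# TF ops for reference.
import tensorflow as tf

def xcycwh_to_yxyx(boxes):
  """Converts [x_center, y_center, width, height] to [y1, x1, y2, x2]."""
  xc, yc, w, h = tf.unstack(boxes, axis=-1)
  return tf.stack([yc - h / 2.0, xc - w / 2.0, yc + h / 2.0, xc + w / 2.0],
                  axis=-1)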
def visualize_images_with_bounding_boxes(images, box_outputs, step,
                                         summary_writer):
  """Records a subset of evaluation images with bounding boxes."""
  if not isinstance(images, list):
    logging.warning(
        'visualize_images_with_bounding_boxes expects a list of images but '
        'received type: %s and value: %s', type(images), images)
    return

  image_shape = tf.shape(images[0])
  image_height = tf.cast(image_shape[0], tf.float32)
  image_width = tf.cast(image_shape[1], tf.float32)
  normalized_boxes = box_ops.normalize_boxes(box_outputs,
                                             [image_height, image_width])

  bounding_box_color = tf.constant([[1.0, 1.0, 0.0, 1.0]])
  image_summary = tf.image.draw_bounding_boxes(
      tf.cast(images, tf.float32), normalized_boxes, bounding_box_color)
  with summary_writer.as_default():
    tf.summary.image('bounding_box_summary', image_summary, step=step)
    summary_writer.flush()
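# Hypothetical call site for the helper above; `eval_images`, `box_outputs`,
# and `global_step` are placeholders. Boxes are expected in pixel
# coordinates, since the helper normalizes them itself.
writer = tf.summary.create_file_writer('/tmp/eval_summaries')
visualize_images_with_bounding_boxes(
    images=eval_images,        # a list of [height, width, 3] tensors
    box_outputs=box_outputs,   # [num_images, num_boxes, 4] in pixels
    step=global_step,
    summary_writer=writer)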
def serve(self, images: tf.Tensor):
  """Casts images to float and runs inference.

  Args:
    images: uint8 Tensor of shape [batch_size, None, None, 3].

  Returns:
    A dictionary holding detection outputs.
  """
  # Skip image preprocessing when input_type is tflite so it is compatible
  # with TFLite quantization.
  if self._input_type != 'tflite':
    images, anchor_boxes, image_info = self.preprocess(images)
  else:
    with tf.device('cpu:0'):
      anchor_boxes = self._build_anchor_boxes()
      # image_info is a 3D tensor of shape [batch_size, 4, 2]. It is in the
      # format of [[original_height, original_width],
      #            [desired_height, desired_width],
      #            [y_scale, x_scale],
      #            [y_offset, x_offset]].
      # When input_type is tflite, the input image is assumed to be
      # preprocessed already.
      image_info = tf.convert_to_tensor(
          [[self._input_image_size, self._input_image_size, [1.0, 1.0],
            [0, 0]]],
          dtype=tf.float32)

  input_image_shape = image_info[:, 1, :]

  # To work around a keras.Model limitation when saving models whose layers
  # take multiple inputs, we use `model.call` here to trigger the forward
  # pass. Note that this disables some Keras magic that happens in `__call__`.
  detections = self.model.call(
      images=images,
      image_shape=input_image_shape,
      anchor_boxes=anchor_boxes,
      training=False)

  if self.params.task.model.detection_generator.apply_nms:
    # For the RetinaNet model, apply export_config.
    # TODO(huizhongc): Add export_config to fasterrcnn and maskrcnn as needed.
    if isinstance(self.params.task.model, configs.retinanet.RetinaNet):
      export_config = self.params.task.export_config
      # Normalize detection box coordinates to [0, 1].
      if export_config.output_normalized_coordinates:
        detection_boxes = (
            detections['detection_boxes'] /
            tf.tile(image_info[:, 2:3, :], [1, 1, 2]))
        detections['detection_boxes'] = box_ops.normalize_boxes(
            detection_boxes, image_info[:, 0:1, :])

      # Cast num_detections and detection_classes to float. This allows the
      # model inference to work on chain (go/chain) as chain requires
      # floating point outputs.
      if export_config.cast_num_detections_to_float:
        detections['num_detections'] = tf.cast(
            detections['num_detections'], dtype=tf.float32)
      if export_config.cast_detection_classes_to_float:
        detections['detection_classes'] = tf.cast(
            detections['detection_classes'], dtype=tf.float32)

    final_outputs = {
        'detection_boxes': detections['detection_boxes'],
        'detection_scores': detections['detection_scores'],
        'detection_classes': detections['detection_classes'],
        'num_detections': detections['num_detections']
    }
  else:
    final_outputs = {
        'decoded_boxes': detections['decoded_boxes'],
        'decoded_box_scores': detections['decoded_box_scores']
    }

  if 'detection_masks' in detections:
    final_outputs['detection_masks'] = detections['detection_masks']
  final_outputs.update({'image_info': image_info})
  return final_outputs
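# Sketch of the image_info layout documented above, for a preprocessed
# 640x640 input with identity scale and zero offset (illustrative values):
import tensorflow as tf

image_info = tf.constant([[[640.0, 640.0],    # original (height, width)
                           [640.0, 640.0],    # desired (height, width)
                           [1.0, 1.0],        # (y_scale, x_scale)
                           [0.0, 0.0]]])      # (y_offset, x_offset)
assert image_info.shape == (1, 4, 2)          # [batch_size, 4, 2]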
def transform_and_clip_boxes(boxes,
                             infos,
                             affine=None,
                             shuffle_boxes=False,
                             area_thresh=0.1,
                             seed=None,
                             augment=True):
  """Clips and cleans the boxes.

  Args:
    boxes: A `Tensor` for the boxes.
    infos: A `list` that contains the image infos.
    affine: A `list` that contains parameters for resize and crop.
    shuffle_boxes: A `bool` for shuffling the boxes.
    area_thresh: A `float` for the area threshold.
    seed: seed for random number generation.
    augment: A `bool` for clipping the boxes to [0, 1].

  Returns:
    boxes: A `Tensor` representing the augmented boxes.
    ind: A `Tensor` of valid box indices.
  """

  def get_valid_boxes(boxes):
    """Gets a boolean mask for boxes with positive height and width."""
    height = boxes[:, 2] - boxes[:, 0]
    width = boxes[:, 3] - boxes[:, 1]
    return tf.logical_and(tf.greater(height, 0), tf.greater(width, 0))

  # Initialize the history to track the operations applied to the boxes.
  box_history = boxes

  # Make sure all the boxes are valid to start; clip to [0, 1] and keep only
  # the valid boxes.
  output_size = tf.cast([640, 640], tf.float32)
  if augment:
    boxes = tf.math.maximum(tf.math.minimum(boxes, 1.0), 0.0)
  cond = get_valid_boxes(boxes)

  if infos is None:
    infos = []

  for info in infos:
    # Denormalize the boxes.
    boxes = bbox_ops.denormalize_boxes(boxes, info[0])
    box_history = bbox_ops.denormalize_boxes(box_history, info[0])

    # Shift and scale all the boxes, and keep track of the box history with
    # no box clipping; the history is used for removing boxes that have
    # become too small or exited the image area.
    (boxes, box_history) = resize_and_crop_boxes(
        boxes, info[2, :], info[1, :], info[3, :], box_history=box_history)

    # Get all the boxes that still remain in the image and store them in a
    # bit vector for later use.
    cond = tf.logical_and(get_valid_boxes(boxes), cond)

    # Normalize the boxes to [0, 1].
    output_size = info[1]
    boxes = bbox_ops.normalize_boxes(boxes, output_size)
    box_history = bbox_ops.normalize_boxes(box_history, output_size)

  if affine is not None:
    # Denormalize the boxes.
    boxes = bbox_ops.denormalize_boxes(boxes, affine[0])
    box_history = bbox_ops.denormalize_boxes(box_history, affine[0])

    # Clip the final boxes.
    (boxes, box_history) = affine_warp_boxes(
        affine[2], boxes, affine[1], box_history=box_history)

    # Get all the boxes that still remain in the image and store them in a
    # bit vector for later use.
    cond = tf.logical_and(get_valid_boxes(boxes), cond)

    # Normalize the boxes to [0, 1].
    output_size = affine[1]
    boxes = bbox_ops.normalize_boxes(boxes, output_size)
    box_history = bbox_ops.normalize_boxes(box_history, output_size)

  # Remove the bad boxes.
  boxes *= tf.cast(tf.expand_dims(cond, axis=-1), boxes.dtype)

  # Threshold the existing boxes.
  if augment:
    boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
    box_history_ = bbox_ops.denormalize_boxes(box_history, output_size)
    inds = boxes_candidates(boxes_, box_history_, area_thr=area_thresh)
    # Select and gather the good boxes.
    if shuffle_boxes:
      inds = tf.random.shuffle(inds, seed=seed)
  else:
    boxes = box_history
    boxes_ = bbox_ops.denormalize_boxes(boxes, output_size)
    inds = bbox_ops.get_non_empty_box_indices(boxes_)
  boxes = tf.gather(boxes, inds)
  return boxes, inds
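# Minimal usage sketch: with `infos=None` and `affine=None`, the function
# clips normalized boxes to [0, 1] and filters the ones whose area collapsed.
# Companion tensors (classes, areas) should be gathered with the returned
# indices, as done by callers such as `_mosaic_crop_image` above.
boxes = tf.constant([[0.2, 0.2, 0.8, 0.8],
                     [-0.5, -0.5, -0.1, -0.1]])  # fully outside the image
kept_boxes, kept_inds = transform_and_clip_boxes(boxes, None)
# kept_inds excludes the second box, whose clipped area is zero.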
def _parse_train_data(self, data):
  """Generates images and labels that are usable for model training.

  Args:
    data: a dict of Tensors produced by the decoder.

  Returns:
    images: the image tensor.
    labels: a dict of Tensors that contains labels.
  """
  shape = tf.shape(data['image'])
  image = data['image'] / 255
  boxes = data['groundtruth_boxes']
  # tf.shape returns [height, width, channels].
  height = shape[0]
  width = shape[1]

  image, boxes = yolo_preprocess_ops.fit_preserve_aspect_ratio(
      image,
      boxes,
      width=width,
      height=height,
      target_dim=self._max_process_size)

  image_shape = tf.shape(image)[:2]

  if self._random_flip:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(
        image, boxes, seed=self._seed)

  randscale = self._image_w // self._net_down_scale
  if not self._fixed_size:
    do_scale = tf.greater(
        tf.random.uniform([], minval=0, maxval=1, seed=self._seed), 0.5)
    if do_scale:
      # Scale the image to a random multiple of net_down_scale between
      # min_process_size and max_process_size (e.g. 320 to 608).
      randscale = tf.random.uniform(
          [],
          minval=self._min_process_size // self._net_down_scale,
          maxval=self._max_process_size // self._net_down_scale,
          seed=self._seed,
          dtype=tf.int32) * self._net_down_scale

  if self._jitter_boxes != 0.0:
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    boxes = box_ops.jitter_boxes(boxes, 0.025)
    boxes = box_ops.normalize_boxes(boxes, image_shape)

  # The YOLO loss function uses x-center, y-center format.
  boxes = yolo_box_ops.yxyx_to_xcycwh(boxes)

  if self._jitter_im != 0.0:
    image, boxes = yolo_preprocess_ops.random_translate(
        image, boxes, self._jitter_im, seed=self._seed)

  if self._aug_rand_zoom:
    image, boxes = yolo_preprocess_ops.resize_crop_filter(
        image,
        boxes,
        default_width=self._image_w,
        default_height=self._image_h,
        target_width=randscale,
        target_height=randscale)

  image = tf.image.resize(image, (416, 416), preserve_aspect_ratio=False)

  if self._aug_rand_brightness:
    image = tf.image.random_brightness(image=image, max_delta=.1)
  if self._aug_rand_saturation:
    image = tf.image.random_saturation(image=image, lower=0.75, upper=1.25)
  if self._aug_rand_hue:
    image = tf.image.random_hue(image=image, max_delta=.3)
  image = tf.clip_by_value(image, 0.0, 1.0)

  # Find the best anchor for the ground-truth labels to maximize the IoU.
  best_anchors = yolo_preprocess_ops.get_best_anchor(
      boxes, self._anchors, width=self._image_w, height=self._image_h)

  # Pad the labels to fixed sizes.
  boxes = preprocess_ops.clip_or_pad_to_fixed_size(
      boxes, self._max_num_instances, 0)
  classes = preprocess_ops.clip_or_pad_to_fixed_size(
      data['groundtruth_classes'], self._max_num_instances, -1)
  best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
      best_anchors, self._max_num_instances, 0)
  area = preprocess_ops.clip_or_pad_to_fixed_size(
      data['groundtruth_area'], self._max_num_instances, 0)
  is_crowd = preprocess_ops.clip_or_pad_to_fixed_size(
      tf.cast(data['groundtruth_is_crowd'], tf.int32),
      self._max_num_instances, 0)

  labels = {
      'source_id': data['source_id'],
      'bbox': tf.cast(boxes, self._dtype),
      'classes': tf.cast(classes, self._dtype),
      'area': tf.cast(area, self._dtype),
      'is_crowd': is_crowd,
      'best_anchors': tf.cast(best_anchors, self._dtype),
      'width': width,
      'height': height,
      'num_detections': tf.shape(data['groundtruth_classes'])[0],
  }

  if self._fixed_size:
    grid = self._build_grid(
        labels, self._image_w, use_tie_breaker=self._use_tie_breaker)
    labels.update({'grid_form': grid})
  return image, labels
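# Sketch of the padding contract used above: clip_or_pad_to_fixed_size pads
# (or truncates) the leading dimension to a fixed size with a constant value,
# assuming the official.vision `preprocess_ops` helper.
import tensorflow as tf
from official.vision.ops import preprocess_ops

classes = tf.constant([3, 7, 1])
padded = preprocess_ops.clip_or_pad_to_fixed_size(classes, 5, -1)
# padded -> [3, 7, 1, -1, -1]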
def serve(self, images: tf.Tensor):
  """Casts images to float and runs inference.

  Args:
    images: uint8 Tensor of shape [batch_size, None, None, 3].

  Returns:
    A dictionary holding detection outputs.
  """
  model_params = self.params.task.model
  with tf.device('cpu:0'):
    images = tf.cast(images, dtype=tf.float32)

    # Tensor specs for the map_fn outputs (images, anchor_boxes, and
    # image_info).
    images_spec = tf.TensorSpec(
        shape=self._input_image_size + [3], dtype=tf.float32)

    num_anchors = model_params.anchor.num_scales * len(
        model_params.anchor.aspect_ratios) * 4
    anchor_shapes = []
    for level in range(model_params.min_level, model_params.max_level + 1):
      anchor_level_spec = tf.TensorSpec(
          shape=[
              self._input_image_size[0] // 2**level,
              self._input_image_size[1] // 2**level, num_anchors
          ],
          dtype=tf.float32)
      anchor_shapes.append((str(level), anchor_level_spec))

    image_info_spec = tf.TensorSpec(shape=[4, 2], dtype=tf.float32)

    images, anchor_boxes, image_info = tf.nest.map_structure(
        tf.identity,
        tf.map_fn(
            self._build_inputs,
            elems=images,
            fn_output_signature=(images_spec, dict(anchor_shapes),
                                 image_info_spec),
            parallel_iterations=32))

  input_image_shape = image_info[:, 1, :]

  # To work around a keras.Model limitation when saving models whose layers
  # take multiple inputs, we use `model.call` here to trigger the forward
  # pass. Note that this disables some Keras magic that happens in `__call__`.
  detections = self.model.call(
      images=images,
      image_shape=input_image_shape,
      anchor_boxes=anchor_boxes,
      training=False)

  if self.params.task.model.detection_generator.apply_nms:
    # For the RetinaNet model, apply export_config.
    # TODO(huizhongc): Add export_config to fasterrcnn and maskrcnn as needed.
    if isinstance(self.params.task.model, configs.retinanet.RetinaNet):
      export_config = self.params.task.export_config
      # Normalize detection box coordinates to [0, 1].
      if export_config.output_normalized_coordinates:
        detection_boxes = (
            detections['detection_boxes'] /
            tf.tile(image_info[:, 2:3, :], [1, 1, 2]))
        detections['detection_boxes'] = box_ops.normalize_boxes(
            detection_boxes, image_info[:, 0:1, :])

      # Cast num_detections and detection_classes to float. This allows the
      # model inference to work on chain (go/chain) as chain requires
      # floating point outputs.
      if export_config.cast_num_detections_to_float:
        detections['num_detections'] = tf.cast(
            detections['num_detections'], dtype=tf.float32)
      if export_config.cast_detection_classes_to_float:
        detections['detection_classes'] = tf.cast(
            detections['detection_classes'], dtype=tf.float32)

    final_outputs = {
        'detection_boxes': detections['detection_boxes'],
        'detection_scores': detections['detection_scores'],
        'detection_classes': detections['detection_classes'],
        'num_detections': detections['num_detections']
    }
  else:
    final_outputs = {
        'decoded_boxes': detections['decoded_boxes'],
        'decoded_box_scores': detections['decoded_box_scores']
    }

  if 'detection_masks' in detections:
    final_outputs['detection_masks'] = detections['detection_masks']
  final_outputs.update({'image_info': image_info})
  return final_outputs
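# Worked shape example for the anchor specs built above: with num_scales=3,
# aspect_ratios=[0.5, 1.0, 2.0], and a 640x640 input, num_anchors =
# 3 * 3 * 4 = 36 channels, and level 3 has spatial shape 640 // 2**3 = 80,
# i.e. a [80, 80, 36] TensorSpec keyed by '3'.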
def _parse_train_data(self, data):
  """Parses data for training.

  Args:
    data: the decoded tensor dictionary from TfExampleDecoder.

  Returns:
    image: image tensor that is preprocessed to have normalized value and
      dimension [output_size[0], output_size[1], 3]
    labels: a dictionary of tensors used for training. The following
      describes {key: value} pairs in the dictionary.
      image_info: a 2D `Tensor` that encodes the information of the image
        and the applied preprocessing. It is in the format of
        [[original_height, original_width], [scaled_height, scaled_width],
         [y_scale, x_scale], [y_offset, x_offset]].
      anchor_boxes: ordered dictionary with keys [min_level, min_level+1,
        ..., max_level]. The values are tensors with shape [height_l,
        width_l, 4] representing the anchor boxes at each level.
      rpn_score_targets: ordered dictionary with keys [min_level,
        min_level+1, ..., max_level]. The values are tensors with shape
        [height_l, width_l, anchors_per_location]. The height_l and width_l
        represent the dimension of the class logits at the l-th level.
      rpn_box_targets: ordered dictionary with keys [min_level, min_level+1,
        ..., max_level]. The values are tensors with shape [height_l,
        width_l, anchors_per_location * 4]. The height_l and width_l
        represent the dimension of the bounding box regression output at the
        l-th level.
      gt_boxes: groundtruth bounding box annotations. The box is represented
        in [y1, x1, y2, x2] format. The coordinates are w.r.t. the scaled
        image that is fed to the network. The tensor is padded with -1 to
        the fixed dimension [self._max_num_instances, 4].
      gt_classes: groundtruth class annotations. The tensor is padded with
        -1 to the fixed dimension [self._max_num_instances].
      gt_masks: groundtruth masks cropped by the bounding box and resized to
        a fixed size determined by mask_crop_size.
  """
  classes = data['groundtruth_classes']
  boxes = data['groundtruth_boxes']
  if self._include_mask:
    masks = data['groundtruth_instance_masks']

  is_crowds = data['groundtruth_is_crowd']
  # Skips annotations with `is_crowd` = True.
  if self._skip_crowd_during_training:
    num_groundtruths = tf.shape(classes)[0]
    with tf.control_dependencies([num_groundtruths, is_crowds]):
      indices = tf.cond(
          tf.greater(tf.size(is_crowds), 0),
          lambda: tf.where(tf.logical_not(is_crowds))[:, 0],
          lambda: tf.cast(tf.range(num_groundtruths), tf.int64))
    classes = tf.gather(classes, indices)
    boxes = tf.gather(boxes, indices)
    if self._include_mask:
      masks = tf.gather(masks, indices)

  # Gets the original image and its size.
  image = data['image']
  image_shape = tf.shape(image)[0:2]

  # Normalizes the image with the mean and std pixel values.
  image = preprocess_ops.normalize_image(image)

  # Flips the image randomly during training.
  if self._aug_rand_hflip:
    if self._include_mask:
      image, boxes, masks = preprocess_ops.random_horizontal_flip(
          image, boxes, masks)
    else:
      image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

  # Converts the boxes from normalized coordinates to pixel coordinates.
  # Now the coordinates of the boxes are w.r.t. the original image.
  boxes = box_ops.denormalize_boxes(boxes, image_shape)

  # Resizes and crops the image.
  image, image_info = preprocess_ops.resize_and_crop_image(
      image,
      self._output_size,
      padded_size=preprocess_ops.compute_padded_size(
          self._output_size, 2**self._max_level),
      aug_scale_min=self._aug_scale_min,
      aug_scale_max=self._aug_scale_max)
  image_height, image_width, _ = image.get_shape().as_list()

  # Resizes and crops the boxes.
  # Now the coordinates of the boxes are w.r.t. the scaled image.
  image_scale = image_info[2, :]
  offset = image_info[3, :]
  boxes = preprocess_ops.resize_and_crop_boxes(
      boxes, image_scale, image_info[1, :], offset)

  # Filters out ground-truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  if self._include_mask:
    masks = tf.gather(masks, indices)
    # Transfers the boxes to the original image space and normalizes them.
    cropped_boxes = boxes + tf.tile(tf.expand_dims(offset, axis=0), [1, 2])
    cropped_boxes /= tf.tile(tf.expand_dims(image_scale, axis=0), [1, 2])
    cropped_boxes = box_ops.normalize_boxes(cropped_boxes, image_shape)
    num_masks = tf.shape(masks)[0]
    masks = tf.image.crop_and_resize(
        tf.expand_dims(masks, axis=-1),
        cropped_boxes,
        box_indices=tf.range(num_masks, dtype=tf.int32),
        crop_size=[self._mask_crop_size, self._mask_crop_size],
        method='bilinear')
    masks = tf.squeeze(masks, axis=-1)

  # Assigns anchor targets.
  # Note that after the target assignment, box targets are absolute pixel
  # offsets w.r.t. the scaled image.
  input_anchor = anchor.build_anchor_generator(
      min_level=self._min_level,
      max_level=self._max_level,
      num_scales=self._num_scales,
      aspect_ratios=self._aspect_ratios,
      anchor_size=self._anchor_size)
  anchor_boxes = input_anchor(image_size=(image_height, image_width))
  anchor_labeler = anchor.RpnAnchorLabeler(
      self._rpn_match_threshold, self._rpn_unmatched_threshold,
      self._rpn_batch_size_per_im, self._rpn_fg_fraction)
  rpn_score_targets, rpn_box_targets = anchor_labeler.label_anchors(
      anchor_boxes, boxes,
      tf.cast(tf.expand_dims(classes, axis=-1), dtype=tf.float32))

  # Casts the input image to self._dtype.
  image = tf.cast(image, dtype=self._dtype)

  # Packs labels for model_fn outputs.
  labels = {
      'anchor_boxes': anchor_boxes,
      'image_info': image_info,
      'rpn_score_targets': rpn_score_targets,
      'rpn_box_targets': rpn_box_targets,
      'gt_boxes': preprocess_ops.clip_or_pad_to_fixed_size(
          boxes, self._max_num_instances, -1),
      'gt_classes': preprocess_ops.clip_or_pad_to_fixed_size(
          classes, self._max_num_instances, -1),
  }
  if self._include_mask:
    labels['gt_masks'] = preprocess_ops.clip_or_pad_to_fixed_size(
        masks, self._max_num_instances, -1)

  return image, labels
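# Sketch of the inverse mapping used for `cropped_boxes` above: boxes in the
# scaled image satisfy scaled = original * scale - offset, so the original
# coordinates are recovered as (scaled + offset) / scale before being
# normalized. Names below are placeholders.
original_boxes = (scaled_boxes + tf.tile(offset[tf.newaxis, :], [1, 2])) / (
    tf.tile(image_scale[tf.newaxis, :], [1, 2]))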
def run_on_image_dir(self,
                     image_path_glob: str,
                     output_dir: str,
                     preprocess_fn: Callable[[tf.Tensor], tf.Tensor],
                     inference_fn: Callable[[tf.Tensor], tf.Tensor],
                     class_names_path: str,
                     save_logits_bin: bool = False,
                     *args, **kwargs):
  """Runs the inference graph for the model on a directory of images.

  Args:
    image_path_glob: `str`, path pattern for the images.
    output_dir: `str`, path for the output logs.
    preprocess_fn: `Callable`, takes an image tensor of shape
      (1, height, width, channels) and produces an altered image tensor of
      the same shape.
    inference_fn: `Callable`, takes an image tensor of shape
      (1, height, width, channels) and produces the 7-tuple of multitask
      outputs unpacked below.
    class_names_path: `str`, path to a txt file containing the classes, one
      class name per line.
    save_logits_bin: `bool`, whether to save the output tensors as binary
      files.
  """
  cmap = get_colormap(cmap_type='cityscapes').numpy()
  dataset = run_lib.inference_dataset(
      image_path_glob=image_path_glob,
      output_dir=output_dir,
      preprocess_fn=preprocess_fn)
  class_names = run_lib.load_class_names(class_names_paths=class_names_path)
  if len(class_names) != 2:
    raise ValueError('Class name paths found: %s, please specify only 2 '
                     '(cls, yolo).' % class_names)

  for image, img_filename, save_basename in dataset:
    logits = inference_fn(image)
    if len(logits) != 7:
      raise NotImplementedError(
          'Inference for multitask is only implemented for '
          'argmax_outputs=True, visualise_outputs=True, '
          'class_present_outputs=True.')
    (cls_env, seg_mask, seg_visualised, is_classes_present, yolo_boxes,
     yolo_classes, yolo_scores) = logits
    if yolo_classes.dtype == 'float32':
      yolo_classes, yolo_scores = yolo_scores, yolo_classes

    if save_logits_bin:
      run_lib.write_tensor_as_bin(
          tensor=image, output_path=save_basename + '_input')
      run_lib.write_tensor_as_bin(
          tensor=seg_mask, output_path=save_basename + '_mask')
      run_lib.write_tensor_as_bin(
          tensor=seg_visualised,
          output_path=save_basename + '_visualised_mask')
      run_lib.write_tensor_as_bin(
          tensor=yolo_boxes, output_path=save_basename + '_boxes')
      run_lib.write_tensor_as_bin(
          tensor=yolo_scores, output_path=save_basename + '_scores')
      run_lib.write_tensor_as_bin(
          tensor=yolo_classes, output_path=save_basename + '_classes')

    image = tf.image.resize(image, self._input_image_size)
    image = tf.cast(image, tf.uint8)
    yolo_boxes = box_ops.normalize_boxes(yolo_boxes, self._input_image_size)
    output_image = run_lib.draw_bbox(
        image=run_lib.tensor_to_numpy(image).squeeze(),
        bboxes=run_lib.tensor_to_numpy(yolo_boxes),
        scores=run_lib.tensor_to_numpy(yolo_scores),
        classes=run_lib.tensor_to_numpy(yolo_classes),
        num_bboxes=tf.constant(yolo_classes.shape[0]).numpy(),
        class_names=class_names[1])

    env_val = run_lib.tensor_to_numpy(cls_env)[0]
    output_image = run_lib.draw_text(
        image=output_image,
        text_list=[class_names[0][env_val]],
        spacing=20)

    seg_mask = tf.squeeze(seg_mask).numpy()
    if seg_mask.ndim > 2:
      seg_mask = np.argmax(seg_mask, axis=-1).astype(np.uint8)
    seg_mask = cmap[seg_mask]
    output_image = np.hstack((output_image, seg_mask))

    output_image = tf.image.encode_png(output_image)
    tf.io.write_file(save_basename + '.png', output_image)
    print('Visualised %s, saving result at %s' %
          (img_filename, save_basename + '.png'))
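# Hypothetical invocation of the runner above; `runner` and `model_fn` are
# placeholders, and `model_fn` must return the 7-tuple described in the
# docstring.
runner.run_on_image_dir(
    image_path_glob='/data/images/*.jpg',
    output_dir='/tmp/vis',
    preprocess_fn=lambda x: x,
    inference_fn=model_fn,
    class_names_path='/data/class_names.txt',
    save_logits_bin=False)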
def train_step(self,
               inputs: Tuple,
               model: tf.keras.Model,
               optimizer: tf.keras.optimizers.Optimizer,
               metrics: Optional[List[Any]] = None):
  """Does the forward and backward pass.

  Args:
    inputs: a dictionary of input tensors.
    model: the model, forward pass definition.
    optimizer: the optimizer for this training step.
    metrics: a nested structure of metrics objects.

  Returns:
    A dictionary of logs.
  """
  features, labels = inputs
  input_partition_dims = self.task_config.train_input_partition_dims
  if input_partition_dims:
    strategy = tf.distribute.get_strategy()
    features = strategy.experimental_split_to_logical_devices(
        features, input_partition_dims)

  input_shape = self.task_config.model.input_size[:2]
  normalized_boxes = box_ops.normalize_boxes(labels['raw_bboxes'],
                                             input_shape)
  bbox_color = tf.constant([[1.0, 1.0, 0.0, 1.0]])
  self.image_summary_manager.write_summaries({
      'input_images': features,
      'bbox': tf.image.draw_bounding_boxes(features, normalized_boxes,
                                           bbox_color)
  })

  num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
  with tf.GradientTape() as tape:
    outputs = model(features, training=True)
    # Casting the output layer to float32 is necessary when mixed_precision
    # is mixed_float16 or mixed_bfloat16 to ensure the output is cast to
    # float32.
    outputs = tf.nest.map_structure(lambda x: tf.cast(x, tf.float32),
                                    outputs)

    # Computes the per-replica loss.
    loss, giou_loss, conf_loss, prob_loss = self.build_losses(
        model_outputs=outputs, labels=labels, aux_losses=model.losses)
    # Scales the loss, as the default gradients allreduce performs sum
    # inside the optimizer.
    scaled_loss = loss / num_replicas

    # For a mixed_precision policy, when LossScaleOptimizer is used, the
    # loss is scaled for numerical stability.
    if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
      scaled_loss = optimizer.get_scaled_loss(scaled_loss)

  tvars = model.trainable_variables
  grads = tape.gradient(scaled_loss, tvars)
  # Scales back the gradients before apply_gradients when LossScaleOptimizer
  # is used.
  if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
    grads = optimizer.get_unscaled_gradients(grads)
  optimizer.apply_gradients(list(zip(grads, tvars)))

  logs = {self.loss: loss}
  all_losses = {
      'giou_loss': giou_loss,
      'conf_loss': conf_loss,
      'prob_loss': prob_loss,
  }
  if metrics:
    # `process_metrics` uses labels and outputs; `metrics.mean` uses values
    # only.
    for m in metrics:
      m.update_state(all_losses[m.name])
      logs.update({m.name: m.result()})
  return logs
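# Minimal sketch of the LossScaleOptimizer pattern used above: the loss is
# scaled before the tape records gradients, and the gradients are unscaled
# before apply_gradients. `compute_loss` and `model` are placeholders.
import tensorflow as tf

opt = tf.keras.mixed_precision.LossScaleOptimizer(tf.keras.optimizers.SGD())
with tf.GradientTape() as tape:
  loss = compute_loss()                     # hypothetical forward pass
  scaled_loss = opt.get_scaled_loss(loss)
grads = tape.gradient(scaled_loss, model.trainable_variables)
grads = opt.get_unscaled_gradients(grads)
opt.apply_gradients(zip(grads, model.trainable_variables))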
def _parse_train_data(self, data):
  """Generates images and labels that are usable for model training.

  Args:
    data: a dict of Tensors produced by the decoder.

  Returns:
    images: the image tensor.
    labels: a dict of Tensors that contains labels.
  """
  image = data['image'] / 255
  boxes = data['groundtruth_boxes']
  classes = data['groundtruth_classes']

  # Randomly blur the image.
  do_blur = tf.random.uniform([],
                              minval=0,
                              maxval=1,
                              seed=self._seed,
                              dtype=tf.float32)
  if do_blur > 0.9:
    image = tfa.image.gaussian_filter2d(image, filter_shape=7, sigma=15)
  elif do_blur > 0.7:
    image = tfa.image.gaussian_filter2d(image, filter_shape=5, sigma=6)
  elif do_blur > 0.4:
    image = tfa.image.gaussian_filter2d(image, filter_shape=5, sigma=3)

  # Jitter hue, saturation, and brightness in HSV space.
  image = tf.image.rgb_to_hsv(image)
  i_h, i_s, i_v = tf.split(image, 3, axis=-1)
  if self._aug_rand_hue:
    delta = preprocessing_ops.rand_uniform_strong(-0.1, 0.1)
    i_h = tf.clip_by_value(i_h + delta, 0.0, 1.0)
  if self._aug_rand_saturation:
    delta = preprocessing_ops.rand_scale(0.75)
    i_s = i_s * delta
  if self._aug_rand_brightness:
    delta = preprocessing_ops.rand_scale(0.75)
    i_v = i_v * delta
  image = tf.concat([i_h, i_s, i_v], axis=-1)
  image = tf.image.hsv_to_rgb(image)

  # Add random gaussian noise.
  stddev = tf.random.uniform([],
                             minval=0,
                             maxval=40 / 255,
                             seed=self._seed,
                             dtype=tf.float32)
  noise = tf.random.normal(
      shape=tf.shape(image), mean=0.0, stddev=stddev, seed=self._seed)
  noise = tf.math.minimum(noise, 0.5)
  noise = tf.math.maximum(noise, 0)
  image += noise
  image = tf.clip_by_value(image, 0.0, 1.0)

  image_shape = tf.shape(image)[:2]
  if self._random_flip:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(
        image, boxes, seed=self._seed)

  if self._jitter_boxes != 0.0:
    boxes = box_ops.denormalize_boxes(boxes, image_shape)
    boxes = box_ops.jitter_boxes(boxes, 0.025)
    boxes = box_ops.normalize_boxes(boxes, image_shape)

  if self._jitter_im != 0.0:
    image, boxes, classes = preprocessing_ops.random_jitter(
        image, boxes, classes, self._jitter_im, seed=self._seed)

  if self._aug_rand_zoom:
    image, boxes, classes = preprocessing_ops.random_zoom_crop(
        image, boxes, classes, self._jitter_im)

  shape = tf.shape(image)
  width = shape[1]
  height = shape[0]

  randscale = self._image_w // self._net_down_scale
  if self._fixed_size:
    do_scale = tf.greater(
        tf.random.uniform([], minval=0, maxval=1, seed=self._seed),
        1 - self._pct_rand)
    if do_scale:
      randscale = tf.random.uniform([],
                                    minval=10,
                                    maxval=15,
                                    seed=self._seed,
                                    dtype=tf.int32)

  if self._letter_box:
    image, boxes = preprocessing_ops.fit_preserve_aspect_ratio(
        image,
        boxes,
        width=width,
        height=height,
        target_dim=randscale * self._net_down_scale)
    width = randscale * self._net_down_scale
    height = randscale * self._net_down_scale

  shape = tf.shape(image)
  width = shape[1]
  height = shape[0]
  image, boxes, classes = preprocessing_ops.resize_crop_filter(
      image,
      boxes,
      classes,
      default_width=width,
      default_height=height,
      target_width=self._image_w,
      target_height=self._image_h,
      randomize=False)

  boxes = box_utils.yxyx_to_xcycwh(boxes)
  image = tf.clip_by_value(image, 0.0, 1.0)
  num_dets = tf.shape(classes)[0]

  # Pad the labels to fixed sizes.
  classes = preprocess_ops.clip_or_pad_to_fixed_size(
      classes, self._max_num_instances, -1)

  if self._fixed_size and not self._cutmix:
    best_anchors = preprocessing_ops.get_best_anchor(
        boxes, self._anchors, width=self._image_w, height=self._image_h)
    best_anchors = preprocess_ops.clip_or_pad_to_fixed_size(
        best_anchors, self._max_num_instances, 0)
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(
        boxes, self._max_num_instances, 0)
    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'best_anchors': tf.cast(best_anchors, self._dtype),
        'width': width,
        'height': height,
        'num_detections': num_dets
    }
    grid = self._build_grid(
        labels, self._image_w, use_tie_breaker=self._use_tie_breaker)
    labels.update({'grid_form': grid})
    labels['bbox'] = box_utils.xcycwh_to_yxyx(labels['bbox'])
  else:
    boxes = preprocess_ops.clip_or_pad_to_fixed_size(
        boxes, self._max_num_instances, 0)
    labels = {
        'source_id': data['source_id'],
        'bbox': tf.cast(boxes, self._dtype),
        'classes': tf.cast(classes, self._dtype),
        'width': width,
        'height': height,
        'num_detections': num_dets
    }
  return image, labels
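# Compact sketch of the HSV-space color jitter performed above, with plain
# TF ops and fixed (illustrative) deltas in place of the random ones:
import tensorflow as tf

def hsv_jitter(image):
  """Applies a fixed hue shift, saturation scale, and brightness scale."""
  image = tf.image.rgb_to_hsv(image)
  h, s, v = tf.split(image, 3, axis=-1)
  h = tf.clip_by_value(h + 0.05, 0.0, 1.0)  # hue shift
  s = s * 1.1                               # saturation scale
  v = v * 0.9                               # brightness scale
  return tf.image.hsv_to_rgb(tf.concat([h, s, v], axis=-1))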
def preprocess(self, inputs):
  """Preprocesses COCO for DETR."""
  image = inputs['image']
  boxes = inputs['objects']['bbox']
  classes = inputs['objects']['label'] + 1
  is_crowd = inputs['objects']['is_crowd']

  image = preprocess_ops.normalize_image(image)
  if self._params.is_training:
    image, boxes, _ = preprocess_ops.random_horizontal_flip(image, boxes)

    do_crop = tf.greater(tf.random.uniform([]), 0.5)
    if do_crop:
      # Rescale.
      boxes = box_ops.denormalize_boxes(boxes, tf.shape(image)[:2])
      index = tf.random.categorical(tf.zeros([1, 3]), 1)[0]
      scales = tf.gather([400.0, 500.0, 600.0], index, axis=0)
      short_side = scales[0]
      image, image_info = preprocess_ops.resize_image(image, short_side)
      boxes = preprocess_ops.resize_and_crop_boxes(
          boxes, image_info[2, :], image_info[1, :], image_info[3, :])
      boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

      # Do cropping.
      shape = tf.cast(image_info[1], dtype=tf.int32)
      h = tf.random.uniform([], 384, tf.math.minimum(shape[0], 600),
                            dtype=tf.int32)
      w = tf.random.uniform([], 384, tf.math.minimum(shape[1], 600),
                            dtype=tf.int32)
      i = tf.random.uniform([], 0, shape[0] - h + 1, dtype=tf.int32)
      j = tf.random.uniform([], 0, shape[1] - w + 1, dtype=tf.int32)
      image = tf.image.crop_to_bounding_box(image, i, j, h, w)
      boxes = tf.clip_by_value(
          (boxes[..., :] * tf.cast(
              tf.stack([shape[0], shape[1], shape[0], shape[1]]),
              dtype=tf.float32) -
           tf.cast(tf.stack([i, j, i, j]), dtype=tf.float32)) /
          tf.cast(tf.stack([h, w, h, w]), dtype=tf.float32), 0.0, 1.0)
    scales = tf.constant(self._params.resize_scales, dtype=tf.float32)
    index = tf.random.categorical(tf.zeros([1, 11]), 1)[0]
    scales = tf.gather(scales, index, axis=0)
  else:
    scales = tf.constant([self._params.resize_scales[-1]], tf.float32)

  image_shape = tf.shape(image)[:2]
  boxes = box_ops.denormalize_boxes(boxes, image_shape)
  gt_boxes = boxes
  short_side = scales[0]
  image, image_info = preprocess_ops.resize_image(
      image, short_side, max(self._params.output_size))
  boxes = preprocess_ops.resize_and_crop_boxes(
      boxes, image_info[2, :], image_info[1, :], image_info[3, :])
  boxes = box_ops.normalize_boxes(boxes, image_info[1, :])

  # Filters out ground-truth boxes that are all zeros.
  indices = box_ops.get_non_empty_box_indices(boxes)
  boxes = tf.gather(boxes, indices)
  classes = tf.gather(classes, indices)
  is_crowd = tf.gather(is_crowd, indices)
  boxes = box_ops.yxyx_to_cycxhw(boxes)

  image = tf.image.pad_to_bounding_box(
      image, 0, 0, self._params.output_size[0], self._params.output_size[1])
  labels = {
      'classes':
          preprocess_ops.clip_or_pad_to_fixed_size(
              classes, self._params.max_num_boxes),
      'boxes':
          preprocess_ops.clip_or_pad_to_fixed_size(
              boxes, self._params.max_num_boxes)
  }
  if not self._params.is_training:
    labels.update({
        'id':
            inputs['image/id'],
        'image_info':
            image_info,
        'is_crowd':
            preprocess_ops.clip_or_pad_to_fixed_size(
                is_crowd, self._params.max_num_boxes),
        'gt_boxes':
            preprocess_ops.clip_or_pad_to_fixed_size(
                gt_boxes, self._params.max_num_boxes),
    })
  return image, labels
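# Math sketch of the yxyx -> cycxhw conversion used for the DETR targets
# above; the `box_ops` helper is assumed to implement this.
import tensorflow as tf

def yxyx_to_cycxhw(boxes):
  """Converts [y1, x1, y2, x2] to [y_center, x_center, height, width]."""
  y1, x1, y2, x2 = tf.unstack(boxes, axis=-1)
  return tf.stack([(y1 + y2) / 2.0, (x1 + x2) / 2.0, y2 - y1, x2 - x1],
                  axis=-1)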