def encode_labels(gt_boxes, gt_labels):
    """Matches groundtruth boxes to the default boxes and encodes the targets.

    Args:
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth
        boxes. Each row stores [y0, x0, y1, x1] for the corners of a box.
      gt_labels: A tensor with shape [N, 1] representing groundtruth classes.

    Returns:
      encoded_classes: a tensor with shape [num_anchors, 1].
      encoded_boxes: a tensor with shape [num_anchors, 4].
      num_matched_boxes: scalar float32 tensor counting the anchors whose
        match result is not -1.
    """
    iou_similarity = region_similarity_calculator.IouSimilarity()
    # The matched and unmatched thresholds are deliberately the same constant.
    iou_matcher = argmax_matcher.ArgMaxMatcher(
        matched_threshold=ssd_constants.MATCH_THRESHOLD,
        unmatched_threshold=ssd_constants.MATCH_THRESHOLD,
        negatives_lower_than_unmatched=True,
        force_match_for_each_row=True)
    coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
        scale_factors=ssd_constants.BOX_CODER_SCALES)

    anchor_boxlist = box_list.BoxList(
        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
    groundtruth_boxlist = box_list.BoxList(gt_boxes)

    # Only class targets, box targets and the match object are needed; the
    # two weight outputs of assign() are dropped.
    encoded_classes, _, encoded_boxes, _, matches = (
        target_assigner.TargetAssigner(iou_similarity, iou_matcher, coder)
        .assign(anchor_boxlist, groundtruth_boxlist, gt_labels))

    is_matched = tf.not_equal(matches.match_results, -1)
    num_matched_boxes = tf.reduce_sum(tf.cast(is_matched, tf.float32))
    return encoded_classes, encoded_boxes, num_matched_boxes
def label_anchors(self, gt_boxes, gt_labels):
    """Produces per-level score and box targets for the anchors.

    Args:
      gt_boxes: a float tensor with shape [N, 4] representing groundtruth
        boxes. Each row stores [y0, x0, y1, x1] for the corners of a box.
      gt_labels: an integer tensor with shape [N, 1] representing groundtruth
        classes.

    Returns:
      score_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors], where height_l and width_l
        are the dimensions of the class logits at the l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4], where height_l and
        width_l are the dimensions of the box regression output at the l-th
        level.
    """
    groundtruth = box_list.BoxList(gt_boxes)
    anchors = box_list.BoxList(self._anchors.boxes)

    # Class targets and both weight tensors returned by assign() are unused.
    _, _, box_targets, _, matches = self._target_assigner.assign(
        anchors, groundtruth, gt_labels)

    # The first output holds the subsampled positive and negative anchors.
    score_targets, _, _ = self._get_rpn_samples(matches.match_results)

    # Split the flat targets back into one entry per level.
    unpack = self._anchors.unpack_labels
    score_targets_dict = unpack(score_targets)
    box_targets_dict = unpack(box_targets)
    return score_targets_dict, box_targets_dict
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
    """Scales box coordinates along the y and x dimensions.

    Args:
      boxlist: BoxList holding N boxes.
      y_scale: (float) scalar tensor multiplying the y coordinates.
      x_scale: (float) scalar tensor multiplying the x coordinates.
      scope: name scope.

    Returns:
      boxlist: BoxList holding N scaled boxes, with the extra fields of the
        input copied over by `_copy_extra_fields`.
    """
    with tf.name_scope(scope, 'Scale'):
        # Both factors are cast to float32 before being applied.
        y_factor = tf.cast(y_scale, tf.float32)
        x_factor = tf.cast(x_scale, tf.float32)
        top, left, bottom, right = tf.split(
            value=boxlist.get(), num_or_size_splits=4, axis=1)
        scaled_top = y_factor * top
        scaled_bottom = y_factor * bottom
        scaled_left = x_factor * left
        scaled_right = x_factor * right
        corners = tf.concat(
            [scaled_top, scaled_left, scaled_bottom, scaled_right], 1)
        return _copy_extra_fields(box_list.BoxList(corners), boxlist)
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
    """Converts normalized boxes (and keypoints) to pixel coordinates.

    Args:
      image: A 3D float32 tensor of shape [height, width, channels].
      boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the
        bounding boxes in normalized coordinates. Each row is of the form
        [ymin, xmin, ymax, xmax].
      keypoints: (optional) rank 3 float32 tensor with shape
        [num_instances, num_keypoints, 2], in y-x normalized coordinates.

    Returns:
      A tuple `(image, scaled_boxes)`, or
      `(image, scaled_boxes, scaled_keypoints)` when `keypoints` is given:
      image: unchanged input image.
      scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing
        the bounding boxes in pixel coordinates.
      scaled_keypoints: a 3D float32 tensor with shape
        [num_instances, num_keypoints, 2] containing the keypoints in pixel
        coordinates.
    """
    normalized = box_list.BoxList(boxes)
    height = tf.shape(image)[0]
    width = tf.shape(image)[1]
    scaled_boxes = box_list_scale(normalized, height, width).get()

    if keypoints is None:
        return (image, scaled_boxes)
    return (image, scaled_boxes, keypoint_scale(keypoints, height, width))
def change_coordinate_frame(boxlist, window, scope=None):
    """Re-expresses the boxes of a BoxList relative to a window.

    Given a window of the form [ymin, xmin, ymax, xmax], the box coordinates
    are shifted by the window's min corner and divided by the window's
    height/width, so the window's min corner maps to (0, 0) and its max
    corner maps to (1, 1).

    A typical use is data augmentation: after randomly cropping an image to
    `window`, the groundtruth boxes must be expressed in the frame of the
    crop.

    Args:
      boxlist: A BoxList object holding N boxes.
      window: A rank 1 tensor [4].
      scope: name scope.

    Returns:
      A BoxList object with N boxes, carrying the extra fields of `boxlist`.
    """
    with tf.name_scope(scope, 'ChangeCoordinateFrame'):
        window_height = window[2] - window[0]
        window_width = window[3] - window[1]
        # Translate so that the window's min corner becomes the origin.
        origin = [window[0], window[1], window[0], window[1]]
        shifted = box_list.BoxList(boxlist.get() - origin)
        # Then normalize by the window size.
        rescaled = scale(shifted, 1.0 / window_height, 1.0 / window_width)
        return _copy_extra_fields(rescaled, boxlist)
def _decode(self, rel_codes, anchors):
    """Turns anchor-relative codes back into corner-format boxes.

    Args:
      rel_codes: a tensor representing N anchor-encoded boxes.
      anchors: BoxList of anchors.

    Returns:
      boxes: BoxList holding N bounding boxes as [ymin, xmin, ymax, xmax].
    """
    anchor_cy, anchor_cx, anchor_h, anchor_w = (
        anchors.get_center_coordinates_and_sizes())
    ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))

    # Undo the per-coordinate scaling, when scale factors are configured.
    if self._scale_factors:
        ty = ty / self._scale_factors[0]
        tx = tx / self._scale_factors[1]
        th = th / self._scale_factors[2]
        tw = tw / self._scale_factors[3]

    # Sizes are coded as log-ratios, centers as offsets in anchor units.
    width = tf.exp(tw) * anchor_w
    height = tf.exp(th) * anchor_h
    center_y = ty * anchor_h + anchor_cy
    center_x = tx * anchor_w + anchor_cx

    corners = [
        center_y - height / 2.,
        center_x - width / 2.,
        center_y + height / 2.,
        center_x + width / 2.,
    ]
    return box_list.BoxList(tf.transpose(tf.stack(corners)))
def concatenate(boxlists, fields=None, scope=None):
    """Concatenates a list of BoxLists into one BoxList.

    Boxes are concatenated along the first dimension. Fields are concatenated
    the same way, provided their shapes agree in every dimension except the
    first.

    Args:
      boxlists: list of BoxList objects.
      fields: optional list of fields to also concatenate. By default, all
        extra fields of the first BoxList in the list are concatenated.
      scope: name scope.

    Returns:
      a BoxList with number of boxes equal to
        sum([boxlist.num_boxes() for boxlist in boxlists]).

    Raises:
      ValueError: if boxlists is invalid (i.e., is not a list, is empty, or
        contains non BoxList objects), if a requested field is missing from
        any boxlist, or if a field's shape is not fully defined / not equal
        across boxlists outside of the 0th dimension.
    """
    with tf.name_scope(scope, 'Concatenate'):
        if not isinstance(boxlists, list):
            raise ValueError('boxlists should be a list')
        if not boxlists:
            raise ValueError('boxlists should have nonzero length')
        if not all(isinstance(item, box_list.BoxList) for item in boxlists):
            raise ValueError(
                'all elements of boxlists should be BoxList objects')

        head = boxlists[0]
        merged = box_list.BoxList(
            tf.concat([item.get() for item in boxlists], 0))
        field_names = head.get_extra_fields() if fields is None else fields

        for name in field_names:
            # The 0th dimension is allowed to differ, so it is masked out
            # before shapes are compared.
            reference_shape = head.get_field(name).get_shape().as_list()
            reference_shape[0] = -1
            if None in reference_shape:
                raise ValueError(
                    'field %s must have fully defined shape except for the'
                    ' 0th dimension.' % name)

            for item in boxlists:
                if not item.has_field(name):
                    raise ValueError(
                        'boxlist must contain all requested fields')
                item_shape = item.get_field(name).get_shape().as_list()
                item_shape[0] = -1
                if item_shape != reference_shape:
                    raise ValueError(
                        'field %s must have same shape for all boxlists '
                        'except for the 0th dimension.' % name)

            merged.add_field(
                name,
                tf.concat([item.get_field(name) for item in boxlists], 0))
        return merged
def label_anchors(self, gt_boxes, gt_labels):
    """Produces per-level one-hot class targets and box targets.

    Args:
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth
        boxes. Each row stores [y0, x0, y1, x1] for the corners of a box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.

    Returns:
      cls_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * num_classes], where height_l
        and width_l are the dimensions of the class logits at the l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4], where height_l and
        width_l are the dimensions of the box regression output at the l-th
        level.
      num_positives: scalar float32 tensor counting the anchors whose match
        result is not -1.
    """
    groundtruth = box_list.BoxList(gt_boxes)
    anchors = box_list.BoxList(self._anchors.boxes)

    # The two weight outputs of assign() are unused.
    cls_targets, _, box_targets, _, matches = self._target_assigner.assign(
        anchors, groundtruth, gt_labels)

    # Class labels start from 1; shifting by one makes the background
    # class -1.
    cls_targets = cls_targets - 1

    # One-hot encode, then flatten to [-1, num_classes].
    one_hot_targets = tf.reshape(
        tf.one_hot(tf.cast(cls_targets, dtype=tf.int32), self._num_classes),
        [-1, self._num_classes])

    cls_targets_dict = self._unpack_labels(one_hot_targets)
    box_targets_dict = self._unpack_labels(box_targets)

    is_matched = tf.not_equal(matches.match_results, -1)
    num_positives = tf.reduce_sum(tf.cast(is_matched, tf.float32))
    return cls_targets_dict, box_targets_dict, num_positives
def _assign_targets(self, gt_boxes_list, gt_labels_list):
    """Assigns groundtruth targets to the anchors for a batch of images.

    Args:
      gt_boxes_list: a list of 2-D tensors of shape [num_boxes, 4]
        containing coordinates of gt boxes.
      gt_labels_list: a list of 2-D one-hot tensors of shape
        [num_boxes, num_classes] containing gt classes.

    Returns:
      The result of `batch_assign_targets`, documented as:
      batch_cls_targets: class tensor with shape
        [batch_size, num_anchors, num_classes].
      batch_reg_target: box tensor with shape [batch_size, num_anchors, 4].
      match_list: a list of matcher.Match objects encoding the match between
        anchors and gt boxes for each image of the batch, with rows
        corresponding to gt boxes and columns corresponding to anchors.
    """
    groundtruth_boxlists = []
    for gt_boxes in gt_boxes_list:
        groundtruth_boxlists.append(box_list.BoxList(gt_boxes))

    # Prepend one zero column to every one-hot label row (the extra leading
    # slot is, per the variable naming, the background class).
    leading_column = [[0, 0], [1, 0]]
    labels_with_background = []
    for one_hot_labels in gt_labels_list:
        labels_with_background.append(
            tf.pad(one_hot_labels, leading_column, mode='CONSTANT'))

    anchor_boxlist = box_list.BoxList(self._anchors)
    return batch_assign_targets(self._target_assigner, anchor_boxlist,
                                groundtruth_boxlists, labels_with_background)
def sample_boxes_by_jittering(boxlist,
                              num_boxes_to_sample,
                              stddev=0.1,
                              scope=None):
    """Draws boxes by randomly perturbing the corners of existing boxes.

    Source boxes are picked uniformly at random (with replacement) and each
    corner coordinate receives an independent Gaussian offset scaled by the
    box height or width. Degenerate (zero-size) boxes can be produced; this
    becomes more likely as `stddev` grows.

    Args:
      boxlist: A boxlist containing N boxes in normalized coordinates.
      num_boxes_to_sample: A positive integer containing the number of boxes
        to sample.
      stddev: Standard deviation of the normal distribution the corner
        offsets are drawn from. Offsets are multiplied by the box size, so
        they are larger in absolute terms for larger boxes.
      scope: Name scope.

    Returns:
      sampled_boxlist: A boxlist containing num_boxes_to_sample boxes in
        normalized coordinates, clipped to [0, 1].
    """
    with tf.name_scope(scope, 'SampleBoxesByJittering'):

        def _corner_noise():
            # One Gaussian offset per sampled box.
            return tf.random_normal([num_boxes_to_sample], stddev=stddev)

        source_count = boxlist.num_boxes()
        source_ids = tf.random_uniform([num_boxes_to_sample],
                                       minval=0,
                                       maxval=source_count,
                                       dtype=tf.int32)
        picked = tf.gather(boxlist.get(), source_ids)
        heights = picked[:, 2] - picked[:, 0]
        widths = picked[:, 3] - picked[:, 1]

        # Drawn in the order ymin, xmin, ymax, xmax.
        noise_ymin = _corner_noise()
        noise_xmin = _corner_noise()
        noise_ymax = _corner_noise()
        noise_xmax = _corner_noise()

        ymin = noise_ymin * heights + picked[:, 0]
        xmin = noise_xmin * widths + picked[:, 1]
        ymax = noise_ymax * heights + picked[:, 2]
        xmax = noise_xmax * widths + picked[:, 3]

        # Keep the max corner from crossing below the min corner.
        ymax = tf.maximum(ymin, ymax)
        xmax = tf.maximum(xmin, xmax)

        jittered = tf.stack([ymin, xmin, ymax, xmax], axis=1)
        jittered = tf.maximum(tf.minimum(jittered, 1.0), 0.0)
        return box_list.BoxList(jittered)
def _batch_decode(self, box_encodings):
    """Decodes a batch of box encodings against the anchors.

    Args:
      box_encodings: box prediction tensor with shape
        [batch_size, num_anchors, 4].

    Returns:
      decoded_boxes: decoded box tensor reshaped to [batch_size, -1, 4].
    """
    batch_size = shape_utils.combined_static_and_dynamic_shape(
        box_encodings)[0]

    # Repeat the anchors once per image, then flatten so the box coder sees
    # a plain list of boxes.
    anchors_per_image = tf.tile(
        tf.expand_dims(self._anchors, 0), [batch_size, 1, 1])
    flat_anchors = box_list.BoxList(tf.reshape(anchors_per_image, [-1, 4]))
    flat_encodings = tf.reshape(box_encodings,
                                [-1, self._box_coder.code_size])

    decoded = self._box_coder.decode(flat_encodings, flat_anchors)
    return tf.reshape(decoded.get(), [batch_size, -1, 4])
def gather(boxlist, indices, fields=None, scope=None, use_static_shapes=False):
    """Gather boxes from BoxList according to indices and return new BoxList.

    By default, `gather` returns boxes corresponding to the input index list,
    as well as all additional fields stored in the boxlist (indexing into the
    first dimension). However one can optionally only gather from a subset of
    fields.

    Args:
      boxlist: BoxList holding N boxes.
      indices: a rank-1 tensor of type int32 / int64.
      fields: (optional) list of fields to also gather from. If None
        (default), all fields are gathered from. Pass an empty fields list to
        only gather the box coordinates. The list is never modified.
      scope: name scope.
      use_static_shapes: Whether to use an implementation with static shape
        guarantees. NOTE: accepted for interface compatibility; this
        implementation always uses `tf.gather` and does not consult it.

    Returns:
      subboxlist: a BoxList corresponding to the subset of the input BoxList
        specified by indices.

    Raises:
      ValueError: if a specified field is not contained in boxlist, if
        `indices` is not rank 1, or if `indices` is not of type int32 / int64.
    """
    with tf.name_scope(scope, 'Gather'):
        if len(indices.shape.as_list()) != 1:
            raise ValueError('indices should have rank 1')
        if indices.dtype != tf.int32 and indices.dtype != tf.int64:
            raise ValueError('indices should be an int32 / int64 tensor')

        gather_op = tf.gather
        subboxlist = box_list.BoxList(gather_op(boxlist.get(), indices))

        # Work on a private copy: appending to `fields` in place would leak
        # 'boxes' into the caller's list and make it grow on every call.
        if fields is None:
            field_names = list(boxlist.get_extra_fields())
        else:
            field_names = list(fields)
        # 'boxes' is still gathered as a field, as before, but only once.
        if 'boxes' not in field_names:
            field_names.append('boxes')

        for field in field_names:
            if not boxlist.has_field(field):
                raise ValueError('boxlist must contain all specified fields')
            subboxlist.add_field(
                field, gather_op(boxlist.get_field(field), indices))
        return subboxlist
def pad_or_clip_box_list(boxlist, num_boxes, scope=None):
    """Pads or clips the boxes and every extra field of a BoxList.

    Args:
      boxlist: A BoxList with an arbitrary number of boxes.
      num_boxes: Target size of the first dimension. The first num_boxes
        entries are kept; per the original documentation the fields are
        zero-padded when num_boxes exceeds the actual number of boxes (the
        work is delegated to `shape_utils.pad_or_clip_tensor`).
      scope: name scope.

    Returns:
      BoxList with all fields padded or clipped.
    """
    with tf.name_scope(scope, 'PadOrClipBoxList'):

        def _resize(tensor):
            return shape_utils.pad_or_clip_tensor(tensor, num_boxes)

        resized = box_list.BoxList(_resize(boxlist.get()))
        for name in boxlist.get_extra_fields():
            resized.add_field(name, _resize(boxlist.get_field(name)))
        return resized
def clip_to_window(boxlist, window, filter_nonoverlapping=True, scope=None):
    """Clips bounding boxes to a window.

    Every box corner is clamped to lie inside the window. Optionally, boxes
    whose clipped area is not strictly positive (i.e. boxes that do not
    overlap the window at all) are removed.

    Args:
      boxlist: BoxList holding M_in boxes.
      window: a tensor of shape [4] representing the
        [y_min, x_min, y_max, x_max] window to which the op should clip
        boxes.
      filter_nonoverlapping: whether to filter out boxes that do not overlap
        at all with the window.
      scope: name scope.

    Returns:
      a BoxList holding M_out boxes where M_out <= M_in.
    """
    with tf.name_scope(scope, 'ClipToWindow'):

        def _clamp(coords, lower, upper):
            return tf.maximum(tf.minimum(coords, upper), lower)

        y_min, x_min, y_max, x_max = tf.split(
            value=boxlist.get(), num_or_size_splits=4, axis=1)
        win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)

        y_min_clipped = _clamp(y_min, win_y_min, win_y_max)
        y_max_clipped = _clamp(y_max, win_y_min, win_y_max)
        x_min_clipped = _clamp(x_min, win_x_min, win_x_max)
        x_max_clipped = _clamp(x_max, win_x_min, win_x_max)

        clipped = _copy_extra_fields(
            box_list.BoxList(
                tf.concat([
                    y_min_clipped, x_min_clipped, y_max_clipped,
                    x_max_clipped
                ], 1)), boxlist)
        if not filter_nonoverlapping:
            return clipped

        # A box that lies entirely outside the window collapses to zero area
        # once clipped; keep only boxes with positive area.
        has_area = tf.greater(area(clipped), 0.0)
        keep_indices = tf.cast(tf.reshape(tf.where(has_area), [-1]), tf.int32)
        return gather(clipped, keep_indices)
def _create_regression_targets(self, anchors, groundtruth_boxes, match):
    """Builds one regression target per anchor.

    Args:
      anchors: a BoxList representing N anchors.
      groundtruth_boxes: a BoxList representing M groundtruth_boxes.
      match: a matcher.Match object.

    Returns:
      reg_targets: a float32 tensor with shape [N, box_code_dimension].
    """
    # Anchors without a match (or ignored) temporarily get an all-zero box.
    matched_boxlist = box_list.BoxList(
        match.gather_based_on_match(
            groundtruth_boxes.get(),
            unmatched_value=tf.zeros(4),
            ignored_value=tf.zeros(4)))

    # Carry keypoints along when the groundtruth provides them.
    if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
        keypoints = groundtruth_boxes.get_field(KEYPOINTS_FIELD_NAME)
        matched_boxlist.add_field(
            KEYPOINTS_FIELD_NAME,
            match.gather_based_on_match(
                keypoints,
                unmatched_value=tf.zeros(keypoints.get_shape()[1:]),
                ignored_value=tf.zeros(keypoints.get_shape()[1:])))

    encoded_targets = self._box_coder.encode(matched_boxlist, anchors)

    # Unmatched and ignored anchors receive the default regression target
    # instead of the encoding of the placeholder box.
    num_anchors = shape_utils.combined_static_and_dynamic_shape(
        match.match_results)[0]
    default_targets = tf.tile(self._default_regression_target(),
                              [num_anchors, 1])
    is_matched = match.matched_column_indicator()
    return tf.where(is_matched, encoded_targets, default_targets)
def _parse_example(data):
    """Converts one decoded example into model inputs.

    Relies on `self` and `params` from the enclosing scope.

    Args:
      data: dict providing 'source_id', 'image', 'groundtruth_boxes' and
        'groundtruth_classes' (and `ssd_constants.IS_PADDED` when read
        below).

    Returns:
      When `self._is_training` is true: a tuple `(image, labels)` where
      `labels` holds the matched-box count, encoded boxes and encoded
      classes (plus 'decoded_boxes' when params['visualize_dataloader'] is
      set). Otherwise: a dict with the resized image, the trimmed/padded
      boxes and classes, the numeric source id and the raw image shape.
    """
    with tf.name_scope('augmentation'):
        source_id = data['source_id']
        image = data['image']  # dtype uint8
        raw_shape = tf.shape(image)
        boxes = data['groundtruth_boxes']
        classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

        # Only 80 of the 90 COCO classes are used.
        class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
        classes = tf.cast(tf.gather(class_map, classes), dtype=tf.float32)

        if self._is_training:
            image, boxes, classes = ssd_crop(image, boxes, classes)
            # ssd_crop resizes and returns image of dtype float32 and does
            # not change its range (i.e., value in between 0--255). Divide by
            # 255. converts it to [0, 1] range. Not doing this before
            # cropping to avoid dtype cast (which incurs additional memory
            # copy).
            image /= 255.0

            # random_horizontal_flip() is hard coded to flip with 50% chance.
            image, boxes = preprocessor.random_horizontal_flip(
                image=image, boxes=boxes)

            # TODO(shibow): Investigate the parameters for color jitter.
            image = color_jitter(image,
                                 brightness=0.125,
                                 contrast=0.5,
                                 saturation=0.5,
                                 hue=0.05)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                boxes, classes)

            # TODO(taylorrobie): Check that this cast is valid.
            encoded_classes = tf.cast(encoded_classes, tf.int32)

            labels = {
                ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                ssd_constants.BOXES: encoded_boxes,
                ssd_constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
            }

            # This is for dataloader visualization; actual model doesn't use
            # this.
            if params['visualize_dataloader']:
                box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                    scale_factors=ssd_constants.BOX_CODER_SCALES)
                default_boxlist = box_list.BoxList(
                    tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                decoded_boxes = tf.expand_dims(
                    box_coder.decode(rel_codes=tf.squeeze(encoded_boxes),
                                     anchors=default_boxlist).get(),
                    axis=0)
                labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

            return image, labels

        # Evaluation path.
        image = tf.image.resize_images(
            image, size=(ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE))
        # resize_image returns image of dtype float32 and does not change its
        # range. Divide by 255 to convert image to [0, 1] range.
        image /= 255.

        if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)

        def trim_and_pad(inp_tensor, dim_1):
            """Limit the number of boxes, and pad if necessary."""
            inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
            num_pad = (ssd_constants.MAX_NUM_EVAL_BOXES -
                       tf.shape(inp_tensor)[0])
            inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
            return tf.reshape(inp_tensor,
                              [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

        boxes, classes = trim_and_pad(boxes, 4), trim_and_pad(classes, 1)

        sample = {
            ssd_constants.IMAGE: image,
            ssd_constants.BOXES: boxes,
            ssd_constants.CLASSES: classes,
            ssd_constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
            ssd_constants.RAW_SHAPE: raw_shape,
        }

        # The padding flag is only forwarded when the configured number of
        # eval samples has been exceeded.
        if not self._is_training and self._count > params['eval_samples']:
            sample[ssd_constants.IS_PADDED] = data[ssd_constants.IS_PADDED]
        return sample
def boolean_mask(boxlist,
                 indicator,
                 fields=None,
                 scope=None,
                 use_static_shapes=False,
                 indicator_sum=None):
    """Selects the boxes flagged True by `indicator` into a new BoxList.

    By default the extra fields stored in the boxlist are masked along with
    the boxes (indexing into the first dimension); a subset of fields can be
    requested instead.

    Args:
      boxlist: BoxList holding N boxes.
      indicator: a rank-1 boolean tensor.
      fields: (optional) list of fields to also gather from. If None
        (default), all fields are gathered from. Pass an empty fields list to
        only gather the box coordinates. Note that on the
        `use_static_shapes` path this argument is not forwarded to `gather`.
      scope: name scope.
      use_static_shapes: Whether to use an implementation with static shape
        guarantees.
      indicator_sum: An integer containing the sum of `indicator` vector.
        Only required if `use_static_shapes` is True.

    Returns:
      subboxlist: a BoxList corresponding to the subset of the input BoxList
        specified by indicator.

    Raises:
      ValueError: if `indicator` is not a rank-1 boolean tensor, if
        `use_static_shapes` is set without a non-zero int `indicator_sum`,
        or if a requested field is missing from `boxlist`.
    """
    with tf.name_scope(scope, 'BooleanMask'):
        if indicator.shape.ndims != 1:
            raise ValueError('indicator should have rank 1')
        if indicator.dtype != tf.bool:
            raise ValueError('indicator should be a boolean tensor')

        if use_static_shapes:
            if not indicator_sum or not isinstance(indicator_sum, int):
                raise ValueError('`indicator_sum` must be a of type int')
            selected = tf.to_float(indicator)
            # 1-based running count at every selected position, 0 elsewhere.
            ranks = tf.cast(tf.multiply(tf.cumsum(selected), selected),
                            dtype=tf.int32)
            # Row i is a one-hot over output slots; unselected rows map to
            # index -1 and therefore come out all-zero.
            selector = tf.one_hot(ranks - 1, indicator_sum, dtype=tf.float32)
            positions = tf.to_float(tf.range(tf.shape(indicator)[0]))
            # Contracting positions with the selector yields, for each output
            # slot, the input index that fills it.
            sampled_indices = tf.cast(tf.tensordot(positions,
                                                   selector,
                                                   axes=[0, 0]),
                                      dtype=tf.int32)
            return gather(boxlist, sampled_indices, use_static_shapes=True)

        masked = box_list.BoxList(tf.boolean_mask(boxlist.get(), indicator))
        field_names = boxlist.get_extra_fields() if fields is None else fields
        for name in field_names:
            if not boxlist.has_field(name):
                raise ValueError('boxlist must contain all specified fields')
            masked.add_field(
                name, tf.boolean_mask(boxlist.get_field(name), indicator))
        return masked
def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
    """Performs box voting as described in S. Gidaris and N. Komodakis, ICCV 2015.

    See 'Object detection via a multi-region & semantic segmentation-aware
    CNN model', Gidaris and Komodakis, ICCV 2015. For each box 'B' in
    selected_boxes, the set 'S' of boxes in pool_boxes whose iou overlap with
    B is strictly greater than iou_thresh is collected. The location of B is
    set to the score-weighted average location of the boxes in S, and the
    score of B is set to the average score of the boxes in S.

    Args:
      selected_boxes: BoxList containing a subset of boxes in pool_boxes.
        These boxes are usually selected from pool_boxes using non max
        suppression.
      pool_boxes: BoxList containing a set of (possibly redundant) boxes.
      iou_thresh: (float scalar) iou threshold for matching boxes in
        selected_boxes and pool_boxes.

    Returns:
      BoxList containing averaged locations and scores for each box in
      selected_boxes.

    Raises:
      ValueError: if
        a) selected_boxes or pool_boxes is not a BoxList.
        b) if iou_thresh is not in [0, 1].
        c) pool_boxes does not have a scores field.
    """
    if iou_thresh < 0.0 or iou_thresh > 1.0:
        raise ValueError('iou_thresh must be between 0 and 1')
    if not isinstance(selected_boxes, box_list.BoxList):
        raise ValueError('selected_boxes must be a BoxList')
    if not isinstance(pool_boxes, box_list.BoxList):
        raise ValueError('pool_boxes must be a BoxList')
    if not pool_boxes.has_field('scores'):
        raise ValueError('pool_boxes must have a \'scores\' field')

    overlaps = iou(selected_boxes, pool_boxes)
    # Entry (i, j) is 1.0 when pool box j votes for selected box i.
    votes = tf.to_float(tf.greater(overlaps, iou_thresh))
    votes_per_box = tf.reduce_sum(votes, 1)

    # TODO(kbanoop): Handle the case where some boxes in selected_boxes do
    # not match to any boxes in pool_boxes. For such boxes without any
    # matches, we should return the original boxes without voting.
    match_assert = tf.Assert(tf.reduce_all(tf.greater(votes_per_box, 0)), [
        'Each box in selected_boxes must match with at least one box '
        'in pool_boxes.'
    ])

    pool_scores = tf.expand_dims(pool_boxes.get_field('scores'), 1)
    scores_assert = tf.Assert(
        tf.reduce_all(tf.greater_equal(pool_scores, 0)),
        ['Scores must be non negative.'])

    # Both checks must run before the score totals are used.
    with tf.control_dependencies([scores_assert, match_assert]):
        score_totals = tf.matmul(votes, pool_scores)
    mean_scores = tf.reshape(score_totals, [-1]) / votes_per_box

    weighted_locations = tf.matmul(
        votes, pool_boxes.get() * pool_scores) / score_totals
    voted = box_list.BoxList(weighted_locations)
    _copy_extra_fields(voted, selected_boxes)
    voted.add_field('scores', mean_scores)
    return voted
def _model_fn(features, labels, mode, params, model):
    """Model definition for the SSD model based on ResNet-50.

    Args:
      features: the input image tensor with shape
        [batch_size, height, width, 3]. The height and width are fixed and
        equal. In PREDICT mode this is instead a dict that contains the image
        under the key 'image'; the remaining entries are passed through to
        the predictions.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py.
      mode: the mode of the estimator: TRAIN, EVAL or PREDICT.
      params: the dictionary that defines hyperparameters of the model. Keys
        read here: 'dtype', 'visualize_dataloader', 'resnet_checkpoint',
        'weight_decay' and 'distributed_optimizer'.
      model: callable returning the class logits and box regression outputs
        (dicts keyed by level).

    Returns:
      spec: the EstimatorSpec for prediction or training.

    Raises:
      NotImplementedError: if `mode` is EVAL.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        # In PREDICT mode everything arrives through `features`; the image is
        # split off and the rest is treated as labels.
        labels = features
        features = labels.pop('image')

    # Normalize the input with the dataset mean / std.
    features -= tf.constant(constants.NORMALIZATION_MEAN,
                            shape=[1, 1, 3],
                            dtype=features.dtype)
    inverse_std = 1.0 / tf.constant(
        constants.NORMALIZATION_STD, shape=[1, 1, 3], dtype=features.dtype)
    features *= inverse_std

    def _model_outputs():
        return model(features,
                     params,
                     is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['dtype'] == 'bf16':
        with tf.compat.v1.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            # Cast the outputs back to float32 for the loss / post-processing.
            for level in cls_outputs.keys():
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs,
                                                      True)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=constants.BOX_CODER_SCALES)

        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

        decoded_boxes = box_coder.batch_decode(encoded_boxes=flattened_box,
                                               box_coder=ssd_box_coder,
                                               anchors=anchors)

        pred_scores = tf.nn.softmax(flattened_cls, axis=2)
        pred_scores, indices = select_top_k_scores(
            pred_scores, constants.MAX_NUM_EVAL_BOXES)

        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )

        if params['visualize_dataloader']:
            # this is for inference visualization.
            predictions['image'] = features

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % constants.RESNET_DEPTH,
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)
    # cls_loss and box_loss are for logging. only total_loss is optimized.
    loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels)

    total_loss = loss + params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=constants.MOMENTUM)

        if params['distributed_optimizer']:
            optimizer = params['distributed_optimizer'](optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(optimizer.minimize(total_loss, global_step),
                            update_ops)
        # `scaffold_fn` is None when no pretrained checkpoint is configured;
        # calling it unconditionally would raise TypeError, so fall back to
        # the EstimatorSpec default (no scaffold) in that case.
        scaffold = scaffold_fn() if scaffold_fn is not None else None
        return model_fn_lib.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          scaffold=scaffold)

    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError
def _strict_random_crop_image(image,
                              boxes,
                              labels,
                              min_object_covered=1.0,
                              aspect_ratio_range=(0.75, 1.33),
                              area_range=(0.1, 1.0),
                              overlap_thresh=0.3):
    """Crops a random window from an image and remaps the boxes into it.

    A crop window is sampled with `tf.image.sample_distorted_bounding_box`.
    Boxes lying completely outside the window, or whose overlap with the
    window is below `overlap_thresh`, are dropped together with their labels.
    The surviving boxes are expressed in the coordinate frame of the crop and
    clipped to [0, 1].

    This function always crops; it is meant to be called by a wrapper that
    decides whether to crop at all.

    Args:
      image: rank 3 float32 tensor containing 1 image ->
        [height, width, channels] with pixel values varying between [0, 1].
      boxes: rank 2 float32 tensor containing the bounding boxes with shape
        [num_instances, 4]. Boxes are in normalized form meaning their
        coordinates vary between [0, 1]. Each row is in the form of
        [ymin, xmin, ymax, xmax].
      labels: tensor containing the object classes; it is stored as the
        'labels' field next to `boxes` and filtered with them.
      min_object_covered: the cropped image must cover at least this fraction
        of at least one of the input bounding boxes.
      aspect_ratio_range: allowed range for aspect ratio of cropped image.
      area_range: allowed range for area ratio between cropped image and the
        original image.
      overlap_thresh: minimum overlap thresh with new cropped image to keep
        the box.

    Returns:
      A 3-tuple `(image, boxes, labels)`:
        image: the cropped image, same rank as the input image.
        boxes: the kept boxes, normalized to the crop window and clipped to
          [0, 1].
        labels: the labels of the kept boxes.
    """
    with tf.name_scope('RandomCropImage', values=[image, boxes]):
        image_shape = tf.shape(image)

        # boxes are [N, 4]; clip them to the unit square and make them
        # [N, 1, 4] before sampling the crop window.
        clipped_boxes = tf.clip_by_value(
            boxes, clip_value_min=0.0, clip_value_max=1.0)
        boxes_expanded = tf.expand_dims(clipped_boxes, 1)

        crop_begin, crop_size, crop_window = (
            tf.image.sample_distorted_bounding_box(
                image_shape,
                bounding_boxes=boxes_expanded,
                min_object_covered=min_object_covered,
                aspect_ratio_range=aspect_ratio_range,
                area_range=area_range,
                max_attempts=100,
                use_image_if_no_bounding_boxes=True))

        cropped_image = tf.slice(image, crop_begin, crop_size)
        cropped_image.set_shape([None, None, image.get_shape()[2]])

        # The sampled window in two layouts: [1, 4] and [4].
        window_rank2 = tf.squeeze(crop_window, squeeze_dims=[0])
        window_rank1 = tf.squeeze(crop_window)

        candidates = box_list.BoxList(boxes)
        candidates.add_field('labels', labels)
        window_boxlist = box_list.BoxList(window_rank2)

        # Drop boxes that fall completely outside the crop window.
        candidates, _ = box_list_ops.prune_completely_outside_window(
            candidates, window_rank1)

        # Drop boxes whose overlap with the crop window is too small.
        kept_boxlist, _ = box_list_ops.prune_non_overlapping_boxes(
            candidates, window_boxlist, overlap_thresh)

        # Re-express the remaining boxes relative to the crop window.
        new_labels = kept_boxlist.get_field('labels')
        remapped_boxlist = box_list_ops.change_coordinate_frame(
            kept_boxlist, window_rank1)
        new_boxes = tf.clip_by_value(
            remapped_boxlist.get(), clip_value_min=0.0, clip_value_max=1.0)

        return cropped_image, new_boxes, new_labels
def multiclass_non_max_suppression(boxes,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   scope=None):
    """Runs non-max suppression separately for every class and merges results.

    For each class, boxes scoring below `score_thresh` are discarded and the
    remaining ones are greedily selected, suppressing boxes whose IOU with an
    already selected box exceeds `iou_thresh`. Each per-class result is padded
    up to its selection cap; padded entries get a score of -1 so they can be
    told apart from real detections. All classes are processed, so background
    classes should be removed by the caller beforehand.

    Args:
      boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be
        either number of classes or 1 depending on whether a separate box is
        predicted per class.
      scores: A [k, num_classes] float32 tensor containing the scores for each
        of the k detections. Because invalid entries are marked with a score
        of -1 and counted via `score >= 0`, real scores should be
        non-negative.
      score_thresh: scalar threshold for score (low scoring boxes are
        removed).
      iou_thresh: scalar threshold for IOU (new boxes that have high IOU
        overlap with previously selected boxes are removed).
      max_size_per_class: maximum number of retained boxes per class.
      max_total_size: maximum number of boxes retained over all classes. A
        falsy value (the default 0) keeps everything retained per class.
      clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
        that the merged boxes are clipped to. Boxes left with zero area get
        score -1.
      scope: name scope.

    Returns:
      A tuple `(sorted_boxes, num_valid_nms_boxes)`. `sorted_boxes` is a
      BoxList sorted by its rank-1 "scores" field, which also carries a rank-1
      "classes" field with the class index of each box. `num_valid_nms_boxes`
      is a 0-D integer tensor with the number of valid (non-padded) entries.

    Raises:
      ValueError: if iou_thresh is not in [0, 1], or if `scores` / `boxes`
        do not have the required static rank and dimensions.
    """
    if not (0 <= iou_thresh <= 1.0):
        raise ValueError('iou_thresh must be between 0 and 1')
    if scores.shape.ndims != 2:
        raise ValueError('scores field must be of rank 2')
    if scores.shape[1].value is None:
        raise ValueError(
            'scores must have statically defined second dimension')
    if boxes.shape.ndims != 3:
        raise ValueError('boxes must be of rank 3.')
    if (boxes.shape[1].value != scores.shape[1].value
            and boxes.shape[1].value != 1):
        raise ValueError(
            'second dimension of boxes must be either 1 or equal to the '
            'second dimension of scores')
    if boxes.shape[2].value != 4:
        raise ValueError('last dimension of boxes must be of size 4.')

    with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
        num_detections = tf.shape(scores)[0]
        class_dim = scores.get_shape()[1]
        per_class_results = []
        total_valid = tf.constant(0)
        boxes_by_class = tf.unstack(boxes, axis=1)

        # With a single shared box per detection (q == 1), every class reads
        # box slot 0; otherwise class i reads box slot i.
        if len(boxes_by_class) > 1:
            box_slots = range(class_dim)
        else:
            box_slots = [0] * class_dim.value

        for class_idx, box_slot in enumerate(box_slots):
            class_boxlist = box_list.BoxList(boxes_by_class[box_slot])
            score_column = tf.slice(scores, [0, class_idx],
                                    tf.stack([num_detections, 1]))
            class_boxlist.add_field("scores", tf.reshape(score_column, [-1]))

            selection_cap = tf.minimum(max_size_per_class,
                                       class_boxlist.num_boxes())
            kept_indices = tf.image.non_max_suppression(
                class_boxlist.get(),
                class_boxlist.get_field("scores"),
                selection_cap,
                iou_threshold=iou_thresh,
                score_threshold=score_thresh)
            valid_count = tf.shape(kept_indices)[0]

            # Pad the index list with zeros up to the selection cap.
            index_padding = tf.zeros(selection_cap - valid_count, tf.int32)
            kept_indices = tf.concat([kept_indices, index_padding], 0)
            class_result = box_list_ops.gather(class_boxlist, kept_indices)

            # Make the scores -1 for the padded (invalid) entries.
            is_valid = tf.less(tf.range(selection_cap), valid_count)
            gathered_scores = class_result.get_field("scores")
            class_result.add_field(
                "scores",
                tf.where(is_valid, gathered_scores,
                         -1 * tf.ones(selection_cap)))

            total_valid += valid_count
            class_result.add_field(
                "classes",
                tf.zeros_like(class_result.get_field("scores")) + class_idx)
            per_class_results.append(class_result)

        merged = box_list_ops.concatenate(per_class_results)
        sorted_boxes = box_list_ops.sort_by_field(merged, "scores")

        if clip_window is not None:
            sorted_boxes = box_list_ops.clip_to_window(
                sorted_boxes, clip_window, filter_nonoverlapping=True)
            # Boxes that ended up with zero area get score -1, then the valid
            # count is recomputed and the list re-sorted.
            num_sorted = tf.shape(sorted_boxes.get())[0]
            has_area = tf.cast(box_list_ops.area(sorted_boxes), tf.bool)
            rescored = tf.where(has_area,
                                sorted_boxes.get_field("scores"),
                                -1 * tf.ones(num_sorted))
            sorted_boxes.add_field("scores", rescored)
            total_valid = tf.reduce_sum(
                tf.cast(tf.greater_equal(rescored, 0), tf.int32))
            sorted_boxes = box_list_ops.sort_by_field(sorted_boxes, "scores")

        if max_total_size:
            total_cap = tf.minimum(max_total_size, sorted_boxes.num_boxes())
            sorted_boxes = box_list_ops.gather(sorted_boxes,
                                               tf.range(total_cap))
            # The valid count can never exceed the number of boxes kept.
            total_valid = tf.where(total_cap > total_valid,
                                   total_valid, total_cap)

        return sorted_boxes, total_valid
def _model_fn(features, labels, mode, params, model):
    """Model definition for the SSD model based on ResNet-50.

    Args:
      features: the input image tensor with shape [batch_size, height, width,
        3]. The height and width are fixed and equal. In PREDICT mode this is
        instead a dictionary holding the image under the key 'image'; that
        entry is popped from the dictionary (the dictionary is mutated) and
        the remaining entries are used as `labels`.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the SSD model outputs class logits and box regression outputs.

    Returns:
      spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
        or prediction.

    Raises:
      NotImplementedError: if `mode` is EVAL.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        labels = features
        features = labels.pop('image')

    # Manually apply the double transpose trick for training data.
    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])
        labels[ssd_constants.BOXES] = tf.transpose(
            labels[ssd_constants.BOXES], [2, 0, 1])
        labels[ssd_constants.CLASSES] = tf.transpose(
            labels[ssd_constants.CLASSES], [2, 0, 1])

    # Normalize the image to zero mean and unit variance.
    mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_MEAN,
                         value=ssd_constants.NORMALIZATION_MEAN)
    mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_STD,
                         value=ssd_constants.NORMALIZATION_STD)
    features -= tf.constant(ssd_constants.NORMALIZATION_MEAN,
                            shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(ssd_constants.NORMALIZATION_STD,
                            shape=[1, 1, 3], dtype=features.dtype)

    def _model_outputs():
        return model(features, params,
                     is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['use_bfloat16']:
        with bfloat16.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            # Cast the per-level outputs back to float32 for everything
            # downstream (decoding, loss).
            for level in list(cls_outputs.keys()):
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs)
        mlperf_log.ssd_print(key=mlperf_log.SCALES,
                             value=ssd_constants.BOX_CODER_SCALES)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=ssd_constants.BOX_CODER_SCALES)

        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

        decoded_boxes = box_coder.batch_decode(
            encoded_boxes=flattened_box,
            box_coder=ssd_box_coder,
            anchors=anchors)

        pred_scores = tf.nn.softmax(flattened_cls, axis=2)

        pred_scores, indices = select_top_k_scores(
            pred_scores, ssd_constants.MAX_NUM_EVAL_BOXES)

        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )

        if params['visualize_dataloader']:
            # this is for inference visualization.
            predictions['image'] = features

        if params['use_tpu']:
            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  predictions=predictions)
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % ssd_constants.RESNET_DEPTH,
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, deferred=True)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    total_loss, cls_loss, box_loss = detection_loss(
        cls_outputs, box_outputs, labels)

    total_loss += params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(
            learning_rate, momentum=ssd_constants.MOMENTUM)
        if params['use_tpu']:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)
        mlperf_log.ssd_print(key=mlperf_log.OPT_NAME,
                             value='tf.train.MomentumOptimizer')
        # TODO(wangtao): figure out how to log learning rate.
        # mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=learning_rate)
        mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM,
                             value=ssd_constants.MOMENTUM)
        mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                             value=params['weight_decay'])

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if params['device'] == 'gpu':
            # GPU uses tf.group to avoid dependency overhead on update_ops;
            # also, multi-GPU requires a different EstimatorSpec class object
            train_op = tf.group(optimizer.minimize(total_loss, global_step),
                                update_ops)
            # scaffold_fn is None when no resnet checkpoint was configured;
            # calling it unconditionally would raise TypeError, so only build
            # a Scaffold when there is a checkpoint to restore from.
            scaffold = scaffold_fn() if scaffold_fn is not None else None
            return model_fn_lib.EstimatorSpec(mode=mode,
                                              loss=total_loss,
                                              train_op=train_op,
                                              scaffold=scaffold)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss, global_step)

        if params['use_host_call']:

            def host_call_fn(global_step, total_loss, cls_loss, box_loss,
                             learning_rate):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `metric_fn`, provide as part of
                the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  global_step: `Tensor` with shape `[batch, ]` for the
                    global_step.
                  total_loss: `Tensor` with shape `[batch, ]` for the training
                    loss.
                  cls_loss: `Tensor` with shape `[batch, ]` for the training
                    cls loss.
                  box_loss: `Tensor` with shape `[batch, ]` for the training
                    box loss.
                  learning_rate: `Tensor` with shape `[batch, ]` for the
                    learning_rate.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                # Outfeed supports int32 but global_step is expected to be
                # int64.
                global_step = tf.reduce_mean(global_step)
                # Host call fns are executed FLAGS.iterations_per_loop times
                # after one TPU loop is finished, setting max_queue value to
                # the same as number of iterations will make the summary
                # writer only flush the data to storage once per loop.
                with (tf.contrib.summary.create_file_writer(
                        params['model_dir'],
                        max_queue=params['iterations_per_loop']).as_default()):
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar(
                            'total_loss', tf.reduce_mean(total_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'cls_loss', tf.reduce_mean(cls_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'box_loss', tf.reduce_mean(box_loss),
                            step=global_step)
                        tf.contrib.summary.scalar(
                            'learning_rate', tf.reduce_mean(learning_rate),
                            step=global_step)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for
            # Tensorboard, the summary op needs to be run on the host CPU via
            # host_call. host_call expects [batch_size, ...] Tensors, thus
            # reshape to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            global_step_t = tf.reshape(global_step, [1])
            total_loss_t = tf.reshape(total_loss, [1])
            cls_loss_t = tf.reshape(cls_loss, [1])
            box_loss_t = tf.reshape(box_loss, [1])
            learning_rate_t = tf.reshape(learning_rate, [1])

            host_call = (host_call_fn, [
                global_step_t, total_loss_t, cls_loss_t, box_loss_t,
                learning_rate_t
            ])
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=total_loss,
                                          train_op=train_op,
                                          host_call=host_call,
                                          eval_metrics=eval_metrics,
                                          scaffold_fn=scaffold_fn)
def _parse_example(data):
    """Turns one decoded example into model inputs.

    Reads `self._is_training` and `params` from the enclosing scope.

    Args:
      data: dictionary with the entries 'source_id', 'image',
        'groundtruth_boxes' and 'groundtruth_classes'.

    Returns:
      In training: a tuple `(image, labels)` where `labels` is a dictionary
      with the number of matched boxes, the encoded boxes and the encoded
      classes (plus 'decoded_boxes' when params['visualize_dataloader'] is
      set).
      Otherwise: a single dictionary with the resized image, the boxes and
      classes trimmed/padded to ssd_constants.MAX_NUM_EVAL_BOXES rows, the
      numeric source id and the raw image shape.
    """
    with tf.name_scope('augmentation'):
        source_id = data['source_id']
        image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)
        raw_shape = tf.shape(image)
        boxes = data['groundtruth_boxes']
        classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

        # Only 80 of the 90 COCO classes are used.
        class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
        classes = tf.cast(tf.gather(class_map, classes), dtype=tf.float32)

        if not self._is_training:
            mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE,
                                 value=ssd_constants.IMAGE_SIZE)
            target_size = (ssd_constants.IMAGE_SIZE, ssd_constants.IMAGE_SIZE)
            # resize_images is applied to a batch of one, then unbatched.
            image = tf.image.resize_images(
                image[tf.newaxis, :, :, :], size=target_size)[0, :, :, :]

            image = normalize_image(image)

            if params['use_bfloat16']:
                image = tf.cast(image, dtype=tf.bfloat16)

            def trim_and_pad(inp_tensor, dim_1):
                """Limit the number of boxes, and pad if necessary."""
                max_rows = ssd_constants.MAX_NUM_EVAL_BOXES
                trimmed = inp_tensor[:max_rows]
                missing_rows = max_rows - tf.shape(trimmed)[0]
                padded = tf.pad(trimmed, [[0, missing_rows], [0, 0]])
                return tf.reshape(padded, [max_rows, dim_1])

            boxes = trim_and_pad(boxes, 4)
            classes = trim_and_pad(classes, 1)

            return {
                ssd_constants.IMAGE: image,
                ssd_constants.BOXES: boxes,
                ssd_constants.CLASSES: classes,
                ssd_constants.SOURCE_ID: tf.string_to_number(
                    source_id, tf.int32),
                ssd_constants.RAW_SHAPE: raw_shape,
            }

        # Training path: augment the image, then encode the targets.
        image, boxes, classes = ssd_crop(image, boxes, classes)

        # random_horizontal_flip() is hard coded to flip with 50% chance.
        mlperf_log.ssd_print(
            key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
        image, boxes = preprocessor.random_horizontal_flip(
            image=image, boxes=boxes)

        # TODO(shibow): Investigate the parameters for color jitter.
        image = color_jitter(
            image, brightness=0.125, contrast=0.5, saturation=0.5, hue=0.05)
        image = normalize_image(image)

        if params['use_bfloat16']:
            image = tf.cast(image, dtype=tf.bfloat16)

        encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
            boxes, classes)

        # TODO(taylorrobie): Check that this cast is valid.
        encoded_classes = tf.cast(encoded_classes, tf.int32)

        labels = {
            ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
            ssd_constants.BOXES: encoded_boxes,
            ssd_constants.CLASSES: encoded_classes,
        }

        # This is for dataloader visualization; actual model doesn't use this.
        if params['visualize_dataloader']:
            coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                scale_factors=ssd_constants.BOX_CODER_SCALES)
            rel_codes = tf.squeeze(encoded_boxes)
            anchors = box_list.BoxList(
                tf.convert_to_tensor(DefaultBoxes()('ltrb')))
            decoded = coder.decode(rel_codes=rel_codes, anchors=anchors).get()
            decoded_boxes = tf.expand_dims(decoded, axis=0)
            labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

        return image, labels