def encode_labels(gt_boxes, gt_labels):
    """Matches default anchors against groundtruth and encodes targets.

    Args:
      gt_boxes: float tensor of shape [N, 4]; each row stores the four box
        corners as [y0, x0, y1, x1].
      gt_labels: integer tensor of shape [N, 1] with groundtruth class ids.

    Returns:
      encoded_classes: tensor of shape [num_anchors, 1].
      encoded_boxes: tensor of shape [num_anchors, 4].
      num_matched_boxes: scalar tensor counting positives in the image.
    """
    # Equal matched/unmatched thresholds with negatives_lower_than_unmatched
    # means every anchor below MATCH_THRESHOLD IoU is a negative.
    anchor_matcher = argmax_matcher.ArgMaxMatcher(
        matched_threshold=constants.MATCH_THRESHOLD,
        unmatched_threshold=constants.MATCH_THRESHOLD,
        negatives_lower_than_unmatched=True,
        force_match_for_each_row=True)
    coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
        scale_factors=constants.BOX_CODER_SCALES)
    assigner = target_assigner.TargetAssigner(
        region_similarity_calculator.IouSimilarity(), anchor_matcher, coder)

    anchor_boxlist = box_list.BoxList(
        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
    gt_boxlist = box_list.BoxList(gt_boxes)
    encoded_classes, _, encoded_boxes, _, matches = assigner.assign(
        anchor_boxlist, gt_boxlist, gt_labels)

    # match_results == -1 marks an unmatched anchor; count the rest.
    positive_mask = tf.not_equal(matches.match_results, -1)
    num_matched_boxes = tf.reduce_sum(tf.cast(positive_mask, tf.float32))
    return encoded_classes, encoded_boxes, num_matched_boxes
def _decode(self, rel_codes, anchors):
    """Decode relative codes to boxes.

    Args:
      rel_codes: a tensor representing N anchor-encoded boxes.
      anchors: BoxList of anchors.

    Returns:
      boxes: BoxList holding N bounding boxes.
    """
    ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()
    ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))
    # Undo the scaling applied at encode time, when configured.
    if self._scale_factors:
        sy, sx, sh, sw = self._scale_factors
        ty, tx, th, tw = ty / sy, tx / sx, th / sh, tw / sw

    # Sizes are encoded in log-space; centers are offsets in anchor units.
    h = tf.exp(th) * ha
    w = tf.exp(tw) * wa
    ycenter = ty * ha + ycenter_a
    xcenter = tx * wa + xcenter_a

    half_h = h / 2.
    half_w = w / 2.
    corners = tf.stack([ycenter - half_h, xcenter - half_w,
                        ycenter + half_h, xcenter + half_w])
    return box_list.BoxList(tf.transpose(corners))
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
    """Scales boxes from normalized to pixel coordinates.

    Args:
      image: A 3D float32 tensor of shape [height, width, channels].
      boxes: A 2D float32 tensor of shape [num_boxes, 4] with normalized
        [ymin, xmin, ymax, xmax] rows.
      keypoints: (optional) rank 3 float32 tensor of shape
        [num_instances, num_keypoints, 2] in y-x normalized coordinates.

    Returns:
      image: unchanged input image.
      scaled_boxes: [num_boxes, 4] float32 tensor in pixel coordinates.
      scaled_keypoints: (only when keypoints is given) pixel-coordinate
        keypoints of shape [num_instances, num_keypoints, 2].
    """
    image_shape = tf.shape(image)
    height, width = image_shape[0], image_shape[1]
    pixel_boxes = box_list_scale(box_list.BoxList(boxes), height, width).get()
    if keypoints is None:
        return (image, pixel_boxes)
    return (image, pixel_boxes, keypoint_scale(keypoints, height, width))
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
    """Scales box coordinates in the x and y dimensions.

    Args:
      boxlist: BoxList holding N boxes.
      y_scale: (float) scalar tensor.
      x_scale: (float) scalar tensor.
      scope: name scope.

    Returns:
      boxlist: BoxList holding N scaled boxes (extra fields copied over).
    """
    with tf.name_scope(scope, 'Scale'):
        sy = tf.cast(y_scale, tf.float32)
        sx = tf.cast(x_scale, tf.float32)
        ymin, xmin, ymax, xmax = tf.split(
            value=boxlist.get(), num_or_size_splits=4, axis=1)
        scaled = box_list.BoxList(
            tf.concat([sy * ymin, sx * xmin, sy * ymax, sx * xmax], 1))
        # Preserve any extra fields (e.g. classes) from the input boxlist.
        return _copy_extra_fields(scaled, boxlist)
def _create_regression_targets(self, anchors, groundtruth_boxes, match):
    """Returns a regression target for each anchor.

    Args:
      anchors: a BoxList representing N anchors.
      groundtruth_boxes: a BoxList representing M groundtruth boxes.
      match: a matcher.Match object.

    Returns:
      reg_targets: a float32 tensor with shape [N, box_code_dimension].
    """
    # Per-anchor groundtruth box; unmatched/ignored anchors get all zeros.
    zero_box = tf.zeros(4)
    per_anchor_gt = match.gather_based_on_match(
        groundtruth_boxes.get(),
        unmatched_value=zero_box,
        ignored_value=zero_box)
    per_anchor_gt_boxlist = box_list.BoxList(per_anchor_gt)

    if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
        gt_keypoints = groundtruth_boxes.get_field(KEYPOINTS_FIELD_NAME)
        zero_kp = tf.zeros(gt_keypoints.get_shape()[1:])
        per_anchor_gt_boxlist.add_field(
            KEYPOINTS_FIELD_NAME,
            match.gather_based_on_match(
                gt_keypoints, unmatched_value=zero_kp, ignored_value=zero_kp))

    encoded_targets = self._box_coder.encode(per_anchor_gt_boxlist, anchors)

    # Zero out the unmatched and ignored regression targets.
    results_shape = shape_utils.combined_static_and_dynamic_shape(
        match.match_results)
    default_targets = tf.tile(
        self._default_regression_target(), [results_shape[0], 1])
    return tf.where(match.matched_column_indicator(),
                    encoded_targets, default_targets)
def _parse_example(data):
    """Decodes one example dict into (image, labels) or an eval sample.

    In training mode returns (image, labels) where labels holds the
    anchor-encoded class/box targets produced by encode_labels; in eval
    mode returns a single `sample` dict with padded raw boxes/classes.
    NOTE(review): relies on closure variables `self`, `params` from the
    enclosing scope — confirm availability at the call site.
    """
    with tf.name_scope('augmentation'):
        source_id = data['source_id']
        image = data['image']  # dtype uint8
        raw_shape = tf.shape(image)
        boxes = data['groundtruth_boxes']
        # Classes are reshaped to [N, 1] to match encode_labels' input.
        classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

        # Only 80 of the 90 COCO classes are used.
        class_map = tf.convert_to_tensor(constants.CLASS_MAP)
        classes = tf.gather(class_map, classes)
        classes = tf.cast(classes, dtype=tf.float32)

        if self._is_training:
            image, boxes, classes = ssd_crop(image, boxes, classes)
            # ssd_crop resizes and returns image of dtype float32 and does not
            # change its range (i.e., value in between 0--255). Divide by 255.
            # converts it to [0, 1] range. Not doing this before cropping to
            # avoid dtype cast (which incurs additional memory copy).
            image /= 255.0

            # random_horizontal_flip() is hard coded to flip with 50% chance.
            image, boxes = preprocessor.random_horizontal_flip(
                image=image, boxes=boxes)

            # TODO(shibow): Investigate the parameters for color jitter.
            image = color_jitter(
                image, brightness=0.125, contrast=0.5, saturation=0.5,
                hue=0.05)

            if params['dtype'] == 'bf16':
                image = tf.cast(image, dtype=tf.bfloat16)

            # Match groundtruth to default anchors and encode targets.
            encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                boxes, classes)

            # We transpose in dataloader instead of in the topology to save
            # time.
            encoded_classes, encoded_boxes = transpose_labels(
                encoded_classes, encoded_boxes)

            encoded_classes = tf.cast(encoded_classes, tf.int32)

            labels = {
                constants.NUM_MATCHED_BOXES: num_matched_boxes,
                constants.BOXES: encoded_boxes,
                constants.CLASSES: tf.squeeze(encoded_classes, axis=1),
            }
            # This is for dataloader visualization; actual model doesn't use
            # this.
            if params['visualize_dataloader']:
                # Round-trip the encoded boxes back to corner coordinates so
                # they can be drawn over the augmented image.
                box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                    scale_factors=constants.BOX_CODER_SCALES)
                decoded_boxes = tf.expand_dims(box_coder.decode(
                    rel_codes=tf.squeeze(encoded_boxes),
                    anchors=box_list.BoxList(
                        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                ).get(), axis=0)
                labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

            return image, labels

        else:
            image = tf.image.resize_images(
                image, size=(constants.IMAGE_SIZE, constants.IMAGE_SIZE))
            # resize_image returns image of dtype float32 and does not change
            # its range. Divide by 255 to convert image to [0, 1] range.
            image /= 255.
            if params['dtype'] == 'bf16':
                image = tf.cast(image, dtype=tf.bfloat16)

            def trim_and_pad(inp_tensor, dim_1):
                """Limit the number of boxes, and pad if necessary."""
                inp_tensor = inp_tensor[:constants.MAX_NUM_EVAL_BOXES]
                num_pad = constants.MAX_NUM_EVAL_BOXES - \
                    tf.shape(inp_tensor)[0]
                inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                return tf.reshape(
                    inp_tensor, [constants.MAX_NUM_EVAL_BOXES, dim_1])

            boxes, classes = trim_and_pad(
                boxes, 4), trim_and_pad(classes, 1)

            sample = {
                constants.IMAGE: image,
                constants.BOXES: boxes,
                constants.CLASSES: classes,
                constants.SOURCE_ID: tf.string_to_number(source_id, tf.int32),
                constants.RAW_SHAPE: raw_shape,
            }
            # Mark padded duplicates appended to fill the final eval batch.
            # NOTE(review): assumes self._count tracks total eval examples —
            # confirm against the enclosing dataloader class.
            if not self._is_training and self._count > params['eval_samples']:
                sample[constants.IS_PADDED] = data[constants.IS_PADDED]
            return sample
def _model_fn(features, labels, mode, params, model):
    """Model definition for the SSD model based on ResNet-50.

    Args:
      features: the input image tensor with shape [batch_size, height, width,
        3]. The height and width are fixed and equal.
      labels: the input labels in a dictionary. The labels include class
        targets and box targets which are dense label maps. The labels are
        generated from get_input_fn function in data/dataloader.py.
      mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
      params: the dictionary defines hyperparameters of model. The default
        settings are in default_hparams function in this file.
      model: the SSD model outputs class logits and box regression outputs.

    Returns:
      spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
        or prediction.
    """
    if mode == tf.estimator.ModeKeys.PREDICT:
        # In PREDICT mode the feature dict doubles as the label dict; pull
        # the image out and keep the rest to echo back in predictions.
        labels = features
        features = labels.pop('image')

    # Normalize the input image: subtract per-channel mean, divide by std.
    features -= tf.constant(
        constants.NORMALIZATION_MEAN, shape=[1, 1, 3], dtype=features.dtype)
    COEF_STD = 1.0 / tf.constant(
        constants.NORMALIZATION_STD, shape=[1, 1, 3], dtype=features.dtype)
    features *= COEF_STD

    def _model_outputs():
        return model(
            features, params,
            is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['dtype'] == 'bf16':
        with tf.compat.v1.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            # Losses/post-processing run in float32, so cast back up.
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs,
                                                      box_outputs, True)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=constants.BOX_CODER_SCALES)
        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))
        decoded_boxes = box_coder.batch_decode(
            encoded_boxes=flattened_box,
            box_coder=ssd_box_coder,
            anchors=anchors)
        pred_scores = tf.nn.softmax(flattened_cls, axis=2)
        pred_scores, indices = select_top_k_scores(
            pred_scores, constants.MAX_NUM_EVAL_BOXES)
        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )
        if params['visualize_dataloader']:
            # this is for inference visualization.
            predictions['image'] = features
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:
        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % constants.RESNET_DEPTH,
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. only total_loss is optimized.
    loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                              labels)
    total_loss = loss + params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(
            learning_rate, momentum=constants.MOMENTUM)
        if params['distributed_optimizer']:
            optimizer = params['distributed_optimizer'](optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(optimizer.minimize(total_loss, global_step),
                            update_ops)

        save_summary_steps = params['save_summary_steps']
        if save_summary_steps is not None:
            tf.summary.scalar("learning_rate", learning_rate)
            tf.summary.scalar("class_loss", cls_loss)
            tf.summary.scalar("box_loss", box_loss)

        # BUGFIX: scaffold_fn is None when training without a pretrained
        # checkpoint; calling it unconditionally raised TypeError. Guard the
        # call and pass scaffold=None in that case.
        return model_fn_lib.EstimatorSpec(
            mode=mode,
            loss=loss,
            train_op=train_op,
            training_hooks=None,
            scaffold=scaffold_fn() if scaffold_fn else None)

    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError