Example #1
def encode_labels(gt_boxes, gt_labels):
    """Labels anchors with ground truth inputs.

  Args:
    gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
      For each row, it stores [y0, x0, y1, x1] for four corners of a box.
    gt_labels: An integer tensor with shape [N, 1] representing groundtruth
      classes.
  Returns:
    encoded_classes: a tensor with shape [num_anchors, 1].
    encoded_boxes: a tensor with shape [num_anchors, 4].
    num_positives: scalar tensor storing number of positives in an image.
  """
    similarity_calc = region_similarity_calculator.IouSimilarity()
    matcher = argmax_matcher.ArgMaxMatcher(
        matched_threshold=ssd_constants.MATCH_THRESHOLD,
        unmatched_threshold=ssd_constants.MATCH_THRESHOLD,
        negatives_lower_than_unmatched=True,
        force_match_for_each_row=True)

    box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
        scale_factors=ssd_constants.BOX_CODER_SCALES)

    default_boxes = box_list.BoxList(
        tf.convert_to_tensor(DefaultBoxes()('ltrb')))
    target_boxes = box_list.BoxList(gt_boxes)

    assigner = target_assigner.TargetAssigner(similarity_calc, matcher,
                                              box_coder)

    encoded_classes, _, encoded_boxes, _, matches = assigner.assign(
        default_boxes, target_boxes, gt_labels)
    num_matched_boxes = tf.reduce_sum(
        tf.cast(tf.not_equal(matches.match_results, -1), tf.float32))
    return encoded_classes, encoded_boxes, num_matched_boxes
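Because matched_threshold and unmatched_threshold are equal and negatives_lower_than_unmatched is True, every anchor below the threshold becomes a negative and none are left ignored, while force_match_for_each_row guarantees that every groundtruth box claims at least one anchor. A minimal calling sketch, assuming the surrounding SSD module (ssd_constants, DefaultBoxes, and the imports above) is available; the shapes, coordinates, and label values are illustrative only:

import tensorflow.compat.v1 as tf

gt_boxes = tf.constant([[0.1, 0.1, 0.5, 0.5],
                        [0.4, 0.4, 0.9, 0.9]], dtype=tf.float32)  # [N, 4]
gt_labels = tf.constant([[3], [7]], dtype=tf.float32)             # [N, 1]

# classes: [num_anchors, 1], boxes: [num_anchors, 4], num_matched: scalar
classes, boxes, num_matched = encode_labels(gt_boxes, gt_labels)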
Example #2
  def label_anchors(self, gt_boxes, gt_labels):
    """Labels anchors with ground truth inputs.

    Args:
      gt_boxes: a float tensor with shape [N, 4] representing groundtruth boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: an integer tensor with shape [N, 1] representing groundtruth
        classes.
    Returns:
      score_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors]. The height_l and width_l
        represent the dimension of class logits at l-th level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
    """
    gt_box_list = box_list.BoxList(gt_boxes)
    anchor_box_list = box_list.BoxList(self._anchors.boxes)

    # cls_targets, cls_weights, box_weights are not used
    _, _, box_targets, _, matches = self._target_assigner.assign(
        anchor_box_list, gt_box_list, gt_labels)

    # score_targets contains the subsampled positive and negative anchors.
    score_targets, _, _ = self._get_rpn_samples(matches.match_results)

    # Unpack labels.
    score_targets_dict = self._anchors.unpack_labels(score_targets)
    box_targets_dict = self._anchors.unpack_labels(box_targets)

    return score_targets_dict, box_targets_dict
Example #3
def box_list_scale(boxlist, y_scale, x_scale, scope=None):
    """scale box coordinates in x and y dimensions.

  Args:
    boxlist: BoxList holding N boxes
    y_scale: (float) scalar tensor
    x_scale: (float) scalar tensor
    scope: name scope.

  Returns:
    boxlist: BoxList holding N boxes
  """
    with tf.name_scope(scope, 'Scale'):
        y_scale = tf.cast(y_scale, tf.float32)
        x_scale = tf.cast(x_scale, tf.float32)
        y_min, x_min, y_max, x_max = tf.split(value=boxlist.get(),
                                              num_or_size_splits=4,
                                              axis=1)
        y_min = y_scale * y_min
        y_max = y_scale * y_max
        x_min = x_scale * x_min
        x_max = x_scale * x_max
        scaled_boxlist = box_list.BoxList(
            tf.concat([y_min, x_min, y_max, x_max], 1))
        return _copy_extra_fields(scaled_boxlist, boxlist)
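A minimal usage sketch, assuming the box_list module from the TF Object Detection API used above; the 480x640 image size is illustrative:

import tensorflow.compat.v1 as tf

normalized = box_list.BoxList(
    tf.constant([[0.25, 0.5, 0.75, 1.0]], dtype=tf.float32))
pixel_boxes = box_list_scale(normalized, y_scale=480, x_scale=640)
# pixel_boxes.get() evaluates to [[120., 320., 360., 640.]]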
Example #4
def scale_boxes_to_pixel_coordinates(image, boxes, keypoints=None):
    """Scales boxes from normalized to pixel coordinates.

  Args:
    image: A 3D float32 tensor of shape [height, width, channels].
    boxes: A 2D float32 tensor of shape [num_boxes, 4] containing the bounding
      boxes in normalized coordinates. Each row is of the form
      [ymin, xmin, ymax, xmax].
    keypoints: (optional) rank 3 float32 tensor with shape
      [num_instances, num_keypoints, 2]. The keypoints are in y-x normalized
      coordinates.

  Returns:
    image: unchanged input image.
    scaled_boxes: a 2D float32 tensor of shape [num_boxes, 4] containing the
      bounding boxes in pixel coordinates.
    scaled_keypoints: a 3D float32 tensor with shape
      [num_instances, num_keypoints, 2] containing the keypoints in pixel
      coordinates.
  """
    boxlist = box_list.BoxList(boxes)
    image_height = tf.shape(image)[0]
    image_width = tf.shape(image)[1]
    scaled_boxes = box_list_scale(boxlist, image_height, image_width).get()
    result = [image, scaled_boxes]
    if keypoints is not None:
        scaled_keypoints = keypoint_scale(keypoints, image_height, image_width)
        result.append(scaled_keypoints)
    return tuple(result)
Example #5
def change_coordinate_frame(boxlist, window, scope=None):
    """Change coordinate frame of the boxlist to be relative to window's frame.

  Given a window of the form [ymin, xmin, ymax, xmax],
  changes bounding box coordinates from boxlist to be relative to this window
  (e.g., the min corner maps to (0,0) and the max corner maps to (1,1)).

  An example use case is data augmentation, where we are given groundtruth
  boxes (boxlist) and would like to randomly crop the image to some
  window (window). In this case we need to change the coordinate frame of
  each groundtruth box to be relative to this new window.

  Args:
    boxlist: A BoxList object holding N boxes.
    window: A rank 1 tensor [4].
    scope: name scope.

  Returns:
    Returns a BoxList object with N boxes.
  """
    with tf.name_scope(scope, 'ChangeCoordinateFrame'):
        win_height = window[2] - window[0]
        win_width = window[3] - window[1]
        boxlist_new = scale(
            box_list.BoxList(boxlist.get() -
                             [window[0], window[1], window[0], window[1]]),
            1.0 / win_height, 1.0 / win_width)
        boxlist_new = _copy_extra_fields(boxlist_new, boxlist)
        return boxlist_new
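A small worked sketch of the arithmetic, assuming box_list and scale are in scope as above: a box that exactly spans the window maps to the unit square. The values are illustrative:

import tensorflow.compat.v1 as tf

boxes = box_list.BoxList(
    tf.constant([[0.25, 0.25, 0.75, 0.75]], dtype=tf.float32))
window = tf.constant([0.25, 0.25, 0.75, 0.75])
relative = change_coordinate_frame(boxes, window)
# relative.get() evaluates to [[0., 0., 1., 1.]]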
Example #6
  def _decode(self, rel_codes, anchors):
    """Decode relative codes to boxes.

    Args:
      rel_codes: a tensor representing N anchor-encoded boxes.
      anchors: BoxList of anchors.

    Returns:
      boxes: BoxList holding N bounding boxes.
    """
    ycenter_a, xcenter_a, ha, wa = anchors.get_center_coordinates_and_sizes()

    ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes))
    if self._scale_factors:
      ty /= self._scale_factors[0]
      tx /= self._scale_factors[1]
      th /= self._scale_factors[2]
      tw /= self._scale_factors[3]
    w = tf.exp(tw) * wa
    h = tf.exp(th) * ha
    ycenter = ty * ha + ycenter_a
    xcenter = tx * wa + xcenter_a
    ymin = ycenter - h / 2.
    xmin = xcenter - w / 2.
    ymax = ycenter + h / 2.
    xmax = xcenter + w / 2.
    return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax])))
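This decode step inverts the standard Faster R-CNN encoding t = ((y - ya)/ha, (x - xa)/wa, log(h/ha), log(w/wa)). A plain-numpy round-trip check of that inverse relationship, with scale factors omitted and illustrative anchor and box values:

import numpy as np

ya, xa, ha, wa = 0.5, 0.5, 0.4, 0.2   # anchor center and size
y, x, h, w = 0.6, 0.45, 0.2, 0.3      # groundtruth center and size

# encode, then decode exactly as in _decode above
ty, tx, th, tw = (y - ya) / ha, (x - xa) / wa, np.log(h / ha), np.log(w / wa)
assert np.isclose(ty * ha + ya, y) and np.isclose(tx * wa + xa, x)
assert np.isclose(np.exp(th) * ha, h) and np.isclose(np.exp(tw) * wa, w)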
Example #7
def concatenate(boxlists, fields=None, scope=None):
    """Concatenate list of BoxLists.

  This op concatenates a list of input BoxLists into a larger BoxList.  It also
  handles concatenation of BoxList fields as long as the field tensor shapes
  are equal except for the first dimension.

  Args:
    boxlists: list of BoxList objects
    fields: optional list of fields to also concatenate.  By default, all
      fields from the first BoxList in the list are included in the
      concatenation.
    scope: name scope.

  Returns:
    a BoxList with number of boxes equal to
      sum([boxlist.num_boxes() for boxlist in boxlists])
  Raises:
    ValueError: if boxlists is invalid (i.e., is not a list, is empty, or
      contains non BoxList objects), or if requested fields are not contained in
      all boxlists
  """
    with tf.name_scope(scope, 'Concatenate'):
        if not isinstance(boxlists, list):
            raise ValueError('boxlists should be a list')
        if not boxlists:
            raise ValueError('boxlists should have nonzero length')
        for boxlist in boxlists:
            if not isinstance(boxlist, box_list.BoxList):
                raise ValueError(
                    'all elements of boxlists should be BoxList objects')
        concatenated = box_list.BoxList(
            tf.concat([boxlist.get() for boxlist in boxlists], 0))
        if fields is None:
            fields = boxlists[0].get_extra_fields()
        for field in fields:
            first_field_shape = boxlists[0].get_field(
                field).get_shape().as_list()
            first_field_shape[0] = -1
            if None in first_field_shape:
                raise ValueError(
                    'field %s must have fully defined shape except for the'
                    ' 0th dimension.' % field)
            for boxlist in boxlists:
                if not boxlist.has_field(field):
                    raise ValueError(
                        'boxlist must contain all requested fields')
                field_shape = boxlist.get_field(field).get_shape().as_list()
                field_shape[0] = -1
                if field_shape != first_field_shape:
                    raise ValueError(
                        'field %s must have same shape for all boxlists '
                        'except for the 0th dimension.' % field)
            concatenated_field = tf.concat(
                [boxlist.get_field(field) for boxlist in boxlists], 0)
            concatenated.add_field(field, concatenated_field)
        return concatenated
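A minimal usage sketch, assuming the box_list module from above; the coordinates and scores are illustrative. Extra fields are concatenated along with the boxes:

import tensorflow.compat.v1 as tf

a = box_list.BoxList(tf.constant([[0., 0., 1., 1.]]))
a.add_field('scores', tf.constant([0.9]))
b = box_list.BoxList(tf.constant([[0., 0., .5, .5], [.5, .5, 1., 1.]]))
b.add_field('scores', tf.constant([0.7, 0.2]))
merged = concatenate([a, b])  # 3 boxes; 'scores' is concatenated as well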
Example #8
    def label_anchors(self, gt_boxes, gt_labels):
        """Labels anchors with ground truth inputs.

    Args:
      gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
        For each row, it stores [y0, x0, y1, x1] for four corners of a box.
      gt_labels: An integer tensor with shape [N, 1] representing groundtruth
        classes.
    Returns:
      cls_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * num_classes]. The
        height_l and width_l represent the dimension of class logits at l-th
        level.
      box_targets_dict: ordered dictionary with keys
        [min_level, min_level+1, ..., max_level]. The values are tensors with
        shape [height_l, width_l, num_anchors * 4]. The height_l and
        width_l represent the dimension of bounding box regression output at
        l-th level.
      num_positives: scalar tensor storing number of positives in an image.
    """
        gt_box_list = box_list.BoxList(gt_boxes)
        anchor_box_list = box_list.BoxList(self._anchors.boxes)

        # cls_weights, box_weights are not used
        cls_targets, _, box_targets, _, matches = self._target_assigner.assign(
            anchor_box_list, gt_box_list, gt_labels)

        # class labels start at 1; after subtracting 1, the background class becomes -1
        cls_targets -= 1

        # create one-hot labels
        cls_targets_one_hot = tf.one_hot(tf.cast(cls_targets, dtype=tf.int32),
                                         self._num_classes)
        cls_targets_one_hot = tf.reshape(cls_targets_one_hot,
                                         [-1, self._num_classes])

        cls_targets_dict = self._unpack_labels(cls_targets_one_hot)
        box_targets_dict = self._unpack_labels(box_targets)
        num_positives = tf.reduce_sum(
            tf.cast(tf.not_equal(matches.match_results, -1), tf.float32))

        return cls_targets_dict, box_targets_dict, num_positives
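After the shift, background anchors carry class -1, and tf.one_hot maps an out-of-range index to an all-zero row, so background anchors contribute no positive class target:

import tensorflow.compat.v1 as tf

tf.one_hot(tf.constant([-1, 0, 2]), depth=3)
# evaluates to [[0., 0., 0.], [1., 0., 0.], [0., 0., 1.]]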
Example #9
 def _assign_targets(self, gt_boxes_list, gt_labels_list):
     """
     Assign gt targets
     Args:
          gt_boxes_list: a list of 2-D tensors of shape [num_boxes, 4] containing coordinates of gt boxes
          gt_labels_list: a list of 2-D one-hot tensors of shape [num_boxes, num_classes] containing gt classes
     Returns:
         batch_cls_targets: class tensor with shape [batch_size, num_anchors, num_classes]
         batch_reg_target: box tensor with shape [batch_size, num_anchors, 4]
         match_list: a list of matcher.Match objects encoding the match between anchors and gt boxes for each image
                     of the batch, with rows corresponding to gt-box and columns corresponding to anchors
     """
     gt_boxlist_list = [box_list.BoxList(boxes) for boxes in gt_boxes_list]
     gt_labels_with_bg = [
         tf.pad(gt_class, [[0, 0], [1, 0]], mode='CONSTANT')
         for gt_class in gt_labels_list
     ]
     anchors = box_list.BoxList(self._anchors)
     return batch_assign_targets(self._target_assigner, anchors,
                                 gt_boxlist_list, gt_labels_with_bg)
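The tf.pad call prepends a zero "background" column, so [num_boxes, num_classes] one-hot labels become [num_boxes, num_classes + 1] with class 0 reserved for background. A plain-numpy illustration with two boxes:

import numpy as np

gt = np.array([[1., 0.], [0., 1.]])   # one-hot classes 0 and 1
np.pad(gt, [(0, 0), (1, 0)])          # [[0., 1., 0.], [0., 0., 1.]]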
Example #10
def sample_boxes_by_jittering(boxlist,
                              num_boxes_to_sample,
                              stddev=0.1,
                              scope=None):
    """Samples num_boxes_to_sample boxes by jittering around boxlist boxes.

  It is possible that this function generates boxes with size 0. The larger
  the stddev, the more likely this is. For a small stddev of 0.1 this
  probability is very small.

  Args:
    boxlist: A boxlist containing N boxes in normalized coordinates.
    num_boxes_to_sample: A positive integer containing the number of boxes to
      sample.
    stddev: Standard deviation. This is used to draw random offsets for the
      box corners from a normal distribution. The offset is multiplied by the
      box size so will be larger in terms of pixels for larger boxes.
    scope: Name scope.

  Returns:
    sampled_boxlist: A boxlist containing num_boxes_to_sample boxes in
      normalized coordinates.
  """
    with tf.name_scope(scope, 'SampleBoxesByJittering'):
        num_boxes = boxlist.num_boxes()
        box_indices = tf.random_uniform([num_boxes_to_sample],
                                        minval=0,
                                        maxval=num_boxes,
                                        dtype=tf.int32)
        sampled_boxes = tf.gather(boxlist.get(), box_indices)
        sampled_boxes_height = sampled_boxes[:, 2] - sampled_boxes[:, 0]
        sampled_boxes_width = sampled_boxes[:, 3] - sampled_boxes[:, 1]
        rand_miny_gaussian = tf.random_normal([num_boxes_to_sample],
                                              stddev=stddev)
        rand_minx_gaussian = tf.random_normal([num_boxes_to_sample],
                                              stddev=stddev)
        rand_maxy_gaussian = tf.random_normal([num_boxes_to_sample],
                                              stddev=stddev)
        rand_maxx_gaussian = tf.random_normal([num_boxes_to_sample],
                                              stddev=stddev)
        miny = rand_miny_gaussian * sampled_boxes_height + sampled_boxes[:, 0]
        minx = rand_minx_gaussian * sampled_boxes_width + sampled_boxes[:, 1]
        maxy = rand_maxy_gaussian * sampled_boxes_height + sampled_boxes[:, 2]
        maxx = rand_maxx_gaussian * sampled_boxes_width + sampled_boxes[:, 3]
        maxy = tf.maximum(miny, maxy)
        maxx = tf.maximum(minx, maxx)
        sampled_boxes = tf.stack([miny, minx, maxy, maxx], axis=1)
        sampled_boxes = tf.maximum(tf.minimum(sampled_boxes, 1.0), 0.0)
        return box_list.BoxList(sampled_boxes)
Example #11
 def _batch_decode(self, box_encodings):
     """
     Decode batch of box encodings with respect to anchors
     Args:
         box_encodings: box prediction tensor with shape [batch_size, num_anchors, 4]
     Returns:
         decoded_boxes: decoded box tensor with same shape as input tensor
     """
     input_shape = shape_utils.combined_static_and_dynamic_shape(
         box_encodings)
     batch_size = input_shape[0]
     tiled_anchor_boxes = tf.tile(tf.expand_dims(self._anchors, 0),
                                  [batch_size, 1, 1])
     tiled_anchor_boxlist = box_list.BoxList(
         tf.reshape(tiled_anchor_boxes, [-1, 4]))
     decoded_boxes = self._box_coder.decode(
         tf.reshape(box_encodings, [-1, self._box_coder.code_size]),
         tiled_anchor_boxlist)
     return tf.reshape(decoded_boxes.get(), [batch_size, -1, 4])
Example #12
def gather(boxlist, indices, fields=None, scope=None, use_static_shapes=False):
    """Gather boxes from BoxList according to indices and return new BoxList.

  By default, `gather` returns boxes corresponding to the input index list, as
  well as all additional fields stored in the boxlist (indexing into the
  first dimension).  However one can optionally only gather from a
  subset of fields.

  Args:
    boxlist: BoxList holding N boxes
    indices: a rank-1 tensor of type int32 / int64
    fields: (optional) list of fields to also gather from.  If None (default),
      all fields are gathered from.  Pass an empty fields list to only gather
      the box coordinates.
    scope: name scope.
    use_static_shapes: Whether to use an implementation with static shape
      guarantees.

  Returns:
    subboxlist: a BoxList corresponding to the subset of the input BoxList
    specified by indices
  Raises:
    ValueError: if specified field is not contained in boxlist or if the
      indices are not of type int32
  """
    with tf.name_scope(scope, 'Gather'):
        if len(indices.shape.as_list()) != 1:
            raise ValueError('indices should have rank 1')
        if indices.dtype != tf.int32 and indices.dtype != tf.int64:
            raise ValueError('indices should be an int32 / int64 tensor')
        gather_op = tf.gather
        subboxlist = box_list.BoxList(gather_op(boxlist.get(), indices))
        if fields is None:
            fields = boxlist.get_extra_fields()
        fields = fields + ['boxes']  # avoid mutating the caller's list in place
        for field in fields:
            if not boxlist.has_field(field):
                raise ValueError('boxlist must contain all specified fields')
            subfieldlist = gather_op(boxlist.get_field(field), indices)
            subboxlist.add_field(field, subfieldlist)
        return subboxlist
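A minimal usage sketch, assuming box_list is in scope; the coordinates and scores are illustrative:

import tensorflow.compat.v1 as tf

bl = box_list.BoxList(
    tf.constant([[0., 0., 1., 1.], [0., 0., .5, .5], [.5, .5, 1., 1.]]))
bl.add_field('scores', tf.constant([0.9, 0.1, 0.4]))
kept = gather(bl, tf.constant([0, 2], dtype=tf.int32))
# kept.get_field('scores') evaluates to [0.9, 0.4]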
Example #13
def pad_or_clip_box_list(boxlist, num_boxes, scope=None):
    """Pads or clips all fields of a BoxList.

  Args:
    boxlist: A BoxList with an arbitrary number of boxes.
    num_boxes: First num_boxes in boxlist are kept.
      The fields are zero-padded if num_boxes is bigger than the
      actual number of boxes.
    scope: name scope.

  Returns:
    BoxList with all fields padded or clipped.
  """
    with tf.name_scope(scope, 'PadOrClipBoxList'):
        subboxlist = box_list.BoxList(
            shape_utils.pad_or_clip_tensor(boxlist.get(), num_boxes))
        for field in boxlist.get_extra_fields():
            subfield = shape_utils.pad_or_clip_tensor(boxlist.get_field(field),
                                                      num_boxes)
            subboxlist.add_field(field, subfield)
        return subboxlist
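A short sketch of the fixed-size contract, with an illustrative num_boxes:

import tensorflow.compat.v1 as tf

bl = box_list.BoxList(tf.constant([[0., 0., 1., 1.], [0., 0., .5, .5]]))
fixed = pad_or_clip_box_list(bl, num_boxes=8)
# fixed.get() has static shape [8, 4]: two real rows, six zero-padded rows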
Example #14
def clip_to_window(boxlist, window, filter_nonoverlapping=True, scope=None):
    """Clip bounding boxes to a window.

  This op clips any input bounding boxes (represented by bounding box
  corners) to a window, optionally filtering out boxes that do not
  overlap at all with the window.

  Args:
    boxlist: BoxList holding M_in boxes
    window: a tensor of shape [4] representing the [y_min, x_min, y_max, x_max]
      window to which the op should clip boxes.
    filter_nonoverlapping: whether to filter out boxes that do not overlap at
      all with the window.
    scope: name scope.

  Returns:
    a BoxList holding M_out boxes where M_out <= M_in
  """
    with tf.name_scope(scope, 'ClipToWindow'):
        y_min, x_min, y_max, x_max = tf.split(value=boxlist.get(),
                                              num_or_size_splits=4,
                                              axis=1)
        win_y_min, win_x_min, win_y_max, win_x_max = tf.unstack(window)
        y_min_clipped = tf.maximum(tf.minimum(y_min, win_y_max), win_y_min)
        y_max_clipped = tf.maximum(tf.minimum(y_max, win_y_max), win_y_min)
        x_min_clipped = tf.maximum(tf.minimum(x_min, win_x_max), win_x_min)
        x_max_clipped = tf.maximum(tf.minimum(x_max, win_x_max), win_x_min)
        clipped = box_list.BoxList(
            tf.concat(
                [y_min_clipped, x_min_clipped, y_max_clipped, x_max_clipped],
                1))
        clipped = _copy_extra_fields(clipped, boxlist)
        if filter_nonoverlapping:
            areas = area(clipped)
            nonzero_area_indices = tf.cast(
                tf.reshape(tf.where(tf.greater(areas, 0.0)), [-1]), tf.int32)
            clipped = gather(clipped, nonzero_area_indices)
        return clipped
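A small worked sketch: a box straddling the unit window is clipped to it, while a box entirely outside collapses to zero area and is filtered out. The coordinates are illustrative:

import tensorflow.compat.v1 as tf

bl = box_list.BoxList(tf.constant([[-0.2, -0.2, 0.5, 0.5],
                                   [1.1, 1.1, 1.5, 1.5]]))
clipped = clip_to_window(bl, tf.constant([0., 0., 1., 1.]))
# clipped.get() evaluates to [[0., 0., 0.5, 0.5]]; the second box is removed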
Example #15
    def _create_regression_targets(self, anchors, groundtruth_boxes, match):
        """Returns a regression target for each anchor.

    Args:
      anchors: a BoxList representing N anchors
      groundtruth_boxes: a BoxList representing M groundtruth_boxes
      match: a matcher.Match object

    Returns:
      reg_targets: a float32 tensor with shape [N, box_code_dimension]
    """
        matched_gt_boxes = match.gather_based_on_match(
            groundtruth_boxes.get(),
            unmatched_value=tf.zeros(4),
            ignored_value=tf.zeros(4))
        matched_gt_boxlist = box_list.BoxList(matched_gt_boxes)
        if groundtruth_boxes.has_field(KEYPOINTS_FIELD_NAME):
            groundtruth_keypoints = groundtruth_boxes.get_field(
                KEYPOINTS_FIELD_NAME)
            matched_keypoints = match.gather_based_on_match(
                groundtruth_keypoints,
                unmatched_value=tf.zeros(
                    groundtruth_keypoints.get_shape()[1:]),
                ignored_value=tf.zeros(groundtruth_keypoints.get_shape()[1:]))
            matched_gt_boxlist.add_field(KEYPOINTS_FIELD_NAME,
                                         matched_keypoints)
        matched_reg_targets = self._box_coder.encode(matched_gt_boxlist,
                                                     anchors)
        match_results_shape = shape_utils.combined_static_and_dynamic_shape(
            match.match_results)

        # Zero out the unmatched and ignored regression targets.
        unmatched_ignored_reg_targets = tf.tile(
            self._default_regression_target(), [match_results_shape[0], 1])
        matched_anchors_mask = match.matched_column_indicator()
        reg_targets = tf.where(matched_anchors_mask, matched_reg_targets,
                               unmatched_ignored_reg_targets)
        return reg_targets
Example #16
        def _parse_example(data):
            with tf.name_scope('augmentation'):
                source_id = data['source_id']
                image = data['image']  # dtype uint8
                raw_shape = tf.shape(image)
                boxes = data['groundtruth_boxes']
                classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

                # Only 80 of the 90 COCO classes are used.
                class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
                classes = tf.gather(class_map, classes)
                classes = tf.cast(classes, dtype=tf.float32)

                if self._is_training:
                    image, boxes, classes = ssd_crop(image, boxes, classes)
                    # ssd_crop resizes and returns an image of dtype float32
                    # without changing its range (i.e., values in 0--255).
                    # Dividing by 255 converts it to the [0, 1] range. This is
                    # not done before cropping to avoid a dtype cast (which
                    # incurs an additional memory copy).
                    image /= 255.0

                    # random_horizontal_flip() is hard coded to flip with 50% chance.
                    image, boxes = preprocessor.random_horizontal_flip(
                        image=image, boxes=boxes)

                    # TODO(shibow): Investigate the parameters for color jitter.
                    image = color_jitter(image,
                                         brightness=0.125,
                                         contrast=0.5,
                                         saturation=0.5,
                                         hue=0.05)

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                        boxes, classes)

                    # TODO(taylorrobie): Check that this cast is valid.
                    encoded_classes = tf.cast(encoded_classes, tf.int32)

                    labels = {
                        ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                        ssd_constants.BOXES: encoded_boxes,
                        ssd_constants.CLASSES: tf.squeeze(encoded_classes,
                                                          axis=1),
                    }
                    # This is for dataloader visualization; the actual model does not use it.
                    if params['visualize_dataloader']:
                        box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                            scale_factors=ssd_constants.BOX_CODER_SCALES)
                        decoded_boxes = tf.expand_dims(box_coder.decode(
                            rel_codes=tf.squeeze(encoded_boxes),
                            anchors=box_list.BoxList(
                                tf.convert_to_tensor(
                                    DefaultBoxes()('ltrb')))).get(),
                                                       axis=0)
                        labels['decoded_boxes'] = tf.squeeze(decoded_boxes)

                    return image, labels

                else:
                    image = tf.image.resize_images(
                        image,
                        size=(ssd_constants.IMAGE_SIZE,
                              ssd_constants.IMAGE_SIZE))
                    # resize_images returns an image of dtype float32 without
                    # changing its range. Divide by 255 to convert it to the
                    # [0, 1] range.
                    image /= 255.

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    def trim_and_pad(inp_tensor, dim_1):
                        """Limit the number of boxes, and pad if necessary."""
                        inp_tensor = inp_tensor[:ssd_constants.
                                                MAX_NUM_EVAL_BOXES]
                        num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(
                            inp_tensor)[0]
                        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                        return tf.reshape(
                            inp_tensor,
                            [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])

                    boxes, classes = trim_and_pad(boxes,
                                                  4), trim_and_pad(classes, 1)

                    sample = {
                        ssd_constants.IMAGE:
                        image,
                        ssd_constants.BOXES:
                        boxes,
                        ssd_constants.CLASSES:
                        classes,
                        ssd_constants.SOURCE_ID:
                        tf.string_to_number(source_id, tf.int32),
                        ssd_constants.RAW_SHAPE:
                        raw_shape,
                    }

                    if not self._is_training and self._count > params[
                            'eval_samples']:
                        sample[ssd_constants.IS_PADDED] = data[
                            ssd_constants.IS_PADDED]
                    return sample
Example #17
def boolean_mask(boxlist,
                 indicator,
                 fields=None,
                 scope=None,
                 use_static_shapes=False,
                 indicator_sum=None):
    """Select boxes from BoxList according to indicator and return new BoxList.

  `boolean_mask` returns the subset of boxes that are marked as "True" by the
  indicator tensor. By default, `boolean_mask` returns boxes corresponding to
  the input index list, as well as all additional fields stored in the boxlist
  (indexing into the first dimension).  However one can optionally only draw
  from a subset of fields.

  Args:
    boxlist: BoxList holding N boxes
    indicator: a rank-1 boolean tensor
    fields: (optional) list of fields to also gather from.  If None (default),
      all fields are gathered from.  Pass an empty fields list to only gather
      the box coordinates.
    scope: name scope.
    use_static_shapes: Whether to use an implementation with static shape
      guarantees.
    indicator_sum: An integer containing the sum of the `indicator` vector.
      Only required if `use_static_shapes` is True.

  Returns:
    subboxlist: a BoxList corresponding to the subset of the input BoxList
      specified by indicator
  Raises:
    ValueError: if `indicator` is not a rank-1 boolean tensor.
  """
    with tf.name_scope(scope, 'BooleanMask'):
        if indicator.shape.ndims != 1:
            raise ValueError('indicator should have rank 1')
        if indicator.dtype != tf.bool:
            raise ValueError('indicator should be a boolean tensor')
        if use_static_shapes:
            if not (indicator_sum and isinstance(indicator_sum, int)):
                raise ValueError('`indicator_sum` must be of type int')
            selected_positions = tf.to_float(indicator)
            indexed_positions = tf.cast(tf.multiply(
                tf.cumsum(selected_positions), selected_positions),
                                        dtype=tf.int32)
            one_hot_selector = tf.one_hot(indexed_positions - 1,
                                          indicator_sum,
                                          dtype=tf.float32)
            sampled_indices = tf.cast(tf.tensordot(tf.to_float(
                tf.range(tf.shape(indicator)[0])),
                                                   one_hot_selector,
                                                   axes=[0, 0]),
                                      dtype=tf.int32)
            return gather(boxlist, sampled_indices, use_static_shapes=True)
        else:
            subboxlist = box_list.BoxList(
                tf.boolean_mask(boxlist.get(), indicator))
            if fields is None:
                fields = boxlist.get_extra_fields()
            for field in fields:
                if not boxlist.has_field(field):
                    raise ValueError(
                        'boxlist must contain all specified fields')
                subfieldlist = tf.boolean_mask(boxlist.get_field(field),
                                               indicator)
                subboxlist.add_field(field, subfieldlist)
            return subboxlist
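The static-shape branch replaces tf.boolean_mask with a cumsum/one-hot construction whose output size is fixed by indicator_sum. A plain-numpy sketch of that index extraction, with an illustrative indicator:

import numpy as np

ind = np.array([0., 1., 0., 1., 1.])               # indicator, sum == 3
slots = (np.cumsum(ind) * ind).astype(int)         # [0, 1, 0, 2, 3]
onehot = (slots[:, None] - 1 == np.arange(3)).astype(float)
indices = np.arange(5) @ onehot                    # [1., 3., 4.]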
Example #18
def box_voting(selected_boxes, pool_boxes, iou_thresh=0.5):
    """Performs box voting as described in S. Gidaris and N. Komodakis, ICCV 2015.

  Performs box voting as described in 'Object detection via a multi-region &
  semantic segmentation-aware CNN model', Gidaris and Komodakis, ICCV 2015. For
  each box 'B' in selected_boxes, we find the set 'S' of boxes in pool_boxes
  with iou overlap >= iou_thresh. The location of B is set to the weighted
  average location of boxes in S (scores are used for weighting), and the
  score of B is set to the average score of boxes in S.

  Args:
    selected_boxes: BoxList containing a subset of boxes in pool_boxes. These
      boxes are usually selected from pool_boxes using non-max suppression.
    pool_boxes: BoxList containing a set of (possibly redundant) boxes.
    iou_thresh: (float scalar) iou threshold for matching boxes in
      selected_boxes and pool_boxes.

  Returns:
    BoxList containing averaged locations and scores for each box in
    selected_boxes.

  Raises:
    ValueError: if
      a) selected_boxes or pool_boxes is not a BoxList.
      b) if iou_thresh is not in [0, 1].
      c) pool_boxes does not have a scores field.
  """
    if not 0.0 <= iou_thresh <= 1.0:
        raise ValueError('iou_thresh must be between 0 and 1')
    if not isinstance(selected_boxes, box_list.BoxList):
        raise ValueError('selected_boxes must be a BoxList')
    if not isinstance(pool_boxes, box_list.BoxList):
        raise ValueError('pool_boxes must be a BoxList')
    if not pool_boxes.has_field('scores'):
        raise ValueError('pool_boxes must have a \'scores\' field')

    iou_ = iou(selected_boxes, pool_boxes)
    match_indicator = tf.to_float(tf.greater(iou_, iou_thresh))
    num_matches = tf.reduce_sum(match_indicator, 1)
    # TODO(kbanoop): Handle the case where some boxes in selected_boxes do not
    # match to any boxes in pool_boxes. For such boxes without any matches, we
    # should return the original boxes without voting.
    match_assert = tf.Assert(tf.reduce_all(tf.greater(num_matches, 0)), [
        'Each box in selected_boxes must match with at least one box '
        'in pool_boxes.'
    ])

    scores = tf.expand_dims(pool_boxes.get_field('scores'), 1)
    scores_assert = tf.Assert(tf.reduce_all(tf.greater_equal(scores, 0)),
                              ['Scores must be non negative.'])

    with tf.control_dependencies([scores_assert, match_assert]):
        sum_scores = tf.matmul(match_indicator, scores)
    averaged_scores = tf.reshape(sum_scores, [-1]) / num_matches

    box_locations = tf.matmul(match_indicator,
                              pool_boxes.get() * scores) / sum_scores
    averaged_boxes = box_list.BoxList(box_locations)
    _copy_extra_fields(averaged_boxes, selected_boxes)
    averaged_boxes.add_field('scores', averaged_scores)
    return averaged_boxes
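A plain-numpy walk-through of the voting arithmetic for a single selected box that matches two pool boxes; the scores and coordinates are illustrative:

import numpy as np

scores = np.array([[0.8], [0.2]])
pool = np.array([[0., 0., 1., 1.], [0., 0., 0.5, 0.5]])
indicator = np.array([[1., 1.]])             # one selected box, two matches
sum_scores = indicator @ scores              # [[1.0]]
voted_box = (indicator @ (pool * scores)) / sum_scores  # [[0., 0., 0.9, 0.9]]
voted_score = sum_scores.ravel() / indicator.sum(1)     # [0.5]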
Example #19
def _model_fn(features, labels, mode, params, model):
    """Model defination for the SSD model based on ResNet-50.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the hyperparameters of the model. The
      default settings are in the default_hparams function in this file.
    model: the SSD model, which outputs class logits and box regression
      outputs.

  Returns:
    spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
      or prediction.
  """
    if mode == tf.estimator.ModeKeys.PREDICT:
        labels = features
        features = labels.pop('image')

    features -= tf.constant(constants.NORMALIZATION_MEAN,
                            shape=[1, 1, 3],
                            dtype=features.dtype)
    COEF_STD = 1.0 / tf.constant(
        constants.NORMALIZATION_STD, shape=[1, 1, 3], dtype=features.dtype)
    features *= COEF_STD

    def _model_outputs():
        return model(features,
                     params,
                     is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['dtype'] == 'bf16':
        with tf.compat.v1.tpu.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs,
                                                      True)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=constants.BOX_CODER_SCALES)

        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

        decoded_boxes = box_coder.batch_decode(encoded_boxes=flattened_box,
                                               box_coder=ssd_box_coder,
                                               anchors=anchors)

        pred_scores = tf.nn.softmax(flattened_cls, axis=2)

        pred_scores, indices = select_top_k_scores(
            pred_scores, constants.MAX_NUM_EVAL_BOXES)
        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )

        if params['visualize_dataloader']:
            # This is for inference visualization.
            predictions['image'] = features

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % constants.RESNET_DEPTH,
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)
    # cls_loss and box_loss are for logging. Only total_loss is optimized.
    loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs, labels)

    total_loss = loss + params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=constants.MOMENTUM)

        if params['distributed_optimizer']:
            optimizer = params['distributed_optimizer'](optimizer)

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(optimizer.minimize(total_loss, global_step),
                            update_ops)
        return model_fn_lib.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          scaffold=scaffold_fn())

    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError
Example #20
def _strict_random_crop_image(image,
                              boxes,
                              labels,
                              min_object_covered=1.0,
                              aspect_ratio_range=(0.75, 1.33),
                              area_range=(0.1, 1.0),
                              overlap_thresh=0.3):
    """Performs random crop.

  Note: boxes will be clipped to the crop. Keypoint coordinates that are
  outside the crop will be set to NaN, which is consistent with the original
  keypoint encoding for non-existing keypoints. This function always crops
  the image and is supposed to be used by `random_crop_image` function which
  sometimes returns image unchanged.

  Args:
    image: rank 3 float32 tensor containing 1 image -> [height, width, channels]
           with pixel values varying between [0, 1].
    boxes: rank 2 float32 tensor containing the bounding boxes with shape
           [num_instances, 4].
           Boxes are in normalized form meaning their coordinates vary
           between [0, 1].
           Each row is in the form of [ymin, xmin, ymax, xmax].
    labels: rank 1 int32 tensor containing the object classes.
    min_object_covered: the cropped image must cover at least this fraction of
                        at least one of the input bounding boxes.
    aspect_ratio_range: allowed range for aspect ratio of cropped image.
    area_range: allowed range for area ratio between cropped image and the
                original image.
    overlap_thresh: minimum overlap thresh with new cropped
                    image to keep the box.

  Returns:
    image: image which is the same rank as input image.
    boxes: boxes which is the same rank as input boxes.
           Boxes are in normalized form.
    labels: new labels.

  """
    with tf.name_scope('RandomCropImage', values=[image, boxes]):
        image_shape = tf.shape(image)

        # boxes are [N, 4]. Let's first make them [N, 1, 4].
        boxes_expanded = tf.expand_dims(
            tf.clip_by_value(boxes, clip_value_min=0.0, clip_value_max=1.0), 1)
        im_box_begin, im_box_size, im_box = tf.image.sample_distorted_bounding_box(
            image_shape,
            bounding_boxes=boxes_expanded,
            min_object_covered=min_object_covered,
            aspect_ratio_range=aspect_ratio_range,
            area_range=area_range,
            max_attempts=100,
            use_image_if_no_bounding_boxes=True)

        new_image = tf.slice(image, im_box_begin, im_box_size)
        new_image.set_shape([None, None, image.get_shape()[2]])

        # [1, 4]
        im_box_rank2 = tf.squeeze(im_box, squeeze_dims=[0])
        # [4]
        im_box_rank1 = tf.squeeze(im_box)

        boxlist = box_list.BoxList(boxes)
        boxlist.add_field('labels', labels)

        im_boxlist = box_list.BoxList(im_box_rank2)

        # remove boxes that lie completely outside the crop window
        boxlist, inside_window_ids = box_list_ops.prune_completely_outside_window(
            boxlist, im_box_rank1)

        # remove boxes whose overlap with the crop is below overlap_thresh
        overlapping_boxlist, keep_ids = box_list_ops.prune_non_overlapping_boxes(
            boxlist, im_boxlist, overlap_thresh)

        # change the coordinate frame of the remaining boxes
        new_labels = overlapping_boxlist.get_field('labels')
        new_boxlist = box_list_ops.change_coordinate_frame(
            overlapping_boxlist, im_box_rank1)
        new_boxes = new_boxlist.get()
        new_boxes = tf.clip_by_value(new_boxes,
                                     clip_value_min=0.0,
                                     clip_value_max=1.0)

        result = [new_image, new_boxes, new_labels]
        return tuple(result)
Example #21
def multiclass_non_max_suppression(boxes,
                                   scores,
                                   score_thresh,
                                   iou_thresh,
                                   max_size_per_class,
                                   max_total_size=0,
                                   clip_window=None,
                                   scope=None):
    """Multi-class version of non maximum suppression.

  This op greedily selects a subset of detection bounding boxes, pruning
  away boxes that have high IOU (intersection over union) overlap (> thresh)
  with already selected boxes.  It operates independently for each class for
  which scores are provided (via the scores field of the input box_list),
  pruning boxes with score less than a provided threshold prior to
  applying NMS.

  Please note that this operation is performed on *all* classes, therefore any
  background classes should be removed prior to calling this function.

  Selected boxes are guaranteed to be sorted in decreasing order by score (but
  the sort is not guaranteed to be stable).

  Args:
    boxes: A [k, q, 4] float32 tensor containing k detections. `q` can be
      either the number of classes or 1, depending on whether a separate box
      is predicted per class.
    scores: A [k, num_classes] float32 tensor containing the scores for each of
      the k detections. The scores have to be non-negative when
      pad_to_max_output_size is True.
    score_thresh: scalar threshold for score (low scoring boxes are removed).
    iou_thresh: scalar threshold for IOU (new boxes that have high IOU overlap
      with previously selected boxes are removed).
    max_size_per_class: maximum number of retained boxes per class.
    max_total_size: maximum number of boxes retained over all classes. By
      default returns all boxes retained after capping boxes per class.
    clip_window: A float32 tensor of the form [y_min, x_min, y_max, x_max]
      representing the window to clip and normalize boxes to before performing
      non-max suppression.
    scope: name scope.

  Returns:
    A tuple of sorted_boxes and num_valid_nms_boxes. The sorted_boxes is a
      BoxList holding M boxes, with a rank-1 scores field representing the
      corresponding score for each box (sorted in decreasing order) and a
      rank-1 classes field representing a class label for each box. The
      num_valid_nms_boxes is a 0-D integer tensor representing the number of
      valid elements in `BoxList`, with the valid elements appearing first.

  Raises:
    ValueError: if iou_thresh is not in [0, 1] or if input boxlist does not have
      a valid scores field.
  """
    if not 0 <= iou_thresh <= 1.0:
        raise ValueError('iou_thresh must be between 0 and 1')
    if scores.shape.ndims != 2:
        raise ValueError('scores field must be of rank 2')
    if scores.shape[1].value is None:
        raise ValueError('scores must have statically defined second '
                         'dimension')
    if boxes.shape.ndims != 3:
        raise ValueError('boxes must be of rank 3.')
    if not (boxes.shape[1].value == scores.shape[1].value
            or boxes.shape[1].value == 1):
        raise ValueError('second dimension of boxes must be either 1 or equal '
                         'to the second dimension of scores')
    if boxes.shape[2].value != 4:
        raise ValueError('last dimension of boxes must be of size 4.')

    with tf.name_scope(scope, 'MultiClassNonMaxSuppression'):
        num_scores = tf.shape(scores)[0]
        num_classes = scores.get_shape()[1]

        selected_boxes_list = []
        num_valid_nms_boxes_cumulative = tf.constant(0)
        per_class_boxes_list = tf.unstack(boxes, axis=1)
        boxes_ids = (range(num_classes) if len(per_class_boxes_list) > 1 else
                     [0] * num_classes.value)
        for class_idx, boxes_idx in zip(range(num_classes), boxes_ids):
            per_class_boxes = per_class_boxes_list[boxes_idx]
            boxlist_and_class_scores = box_list.BoxList(per_class_boxes)
            class_scores = tf.reshape(
                tf.slice(scores, [0, class_idx], tf.stack([num_scores, 1])),
                [-1])

            boxlist_and_class_scores.add_field("scores", class_scores)
            max_selection_size = tf.minimum(
                max_size_per_class, boxlist_and_class_scores.num_boxes())
            selected_indices = tf.image.non_max_suppression(
                boxlist_and_class_scores.get(),
                boxlist_and_class_scores.get_field("scores"),
                max_selection_size,
                iou_threshold=iou_thresh,
                score_threshold=score_thresh)
            num_valid_nms_boxes = tf.shape(selected_indices)[0]
            selected_indices = tf.concat([
                selected_indices,
                tf.zeros(max_selection_size - num_valid_nms_boxes, tf.int32)
            ], 0)
            nms_result = box_list_ops.gather(boxlist_and_class_scores,
                                             selected_indices)
            # Make the scores -1 for invalid boxes.
            valid_nms_boxes_indx = tf.less(tf.range(max_selection_size),
                                           num_valid_nms_boxes)
            nms_scores = nms_result.get_field("scores")
            nms_result.add_field(
                "scores",
                tf.where(valid_nms_boxes_indx, nms_scores,
                         -1 * tf.ones(max_selection_size)))
            num_valid_nms_boxes_cumulative += num_valid_nms_boxes

            nms_result.add_field(
                "classes",
                (tf.zeros_like(nms_result.get_field("scores")) + class_idx))
            selected_boxes_list.append(nms_result)
        selected_boxes = box_list_ops.concatenate(selected_boxes_list)
        sorted_boxes = box_list_ops.sort_by_field(selected_boxes, "scores")
        if clip_window is not None:
            # When pad_to_max_output_size is False, it prunes the boxes with zero
            # area.
            sorted_boxes = box_list_ops.clip_to_window(
                sorted_boxes, clip_window, filter_nonoverlapping=True)
            # Set the scores of boxes with zero area to -1 to keep the default
            # behaviour of pruning out zero area boxes.
            sorted_boxes_size = tf.shape(sorted_boxes.get())[0]
            non_zero_box_area = tf.cast(box_list_ops.area(sorted_boxes),
                                        tf.bool)
            sorted_boxes_scores = tf.where(non_zero_box_area,
                                           sorted_boxes.get_field("scores"),
                                           -1 * tf.ones(sorted_boxes_size))
            sorted_boxes.add_field("scores", sorted_boxes_scores)
            num_valid_nms_boxes_cumulative = tf.reduce_sum(
                tf.cast(tf.greater_equal(sorted_boxes_scores, 0), tf.int32))
            sorted_boxes = box_list_ops.sort_by_field(sorted_boxes, "scores")

        if max_total_size:
            max_total_size = tf.minimum(max_total_size,
                                        sorted_boxes.num_boxes())
            sorted_boxes = box_list_ops.gather(sorted_boxes,
                                               tf.range(max_total_size))
            num_valid_nms_boxes_cumulative = tf.where(
                max_total_size > num_valid_nms_boxes_cumulative,
                num_valid_nms_boxes_cumulative, max_total_size)

        return sorted_boxes, num_valid_nms_boxes_cumulative
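A minimal calling sketch with boxes shared across classes (q == 1), assuming a TF 1.x runtime and the box_list_ops module used above; the thresholds and scores are illustrative:

import tensorflow.compat.v1 as tf

boxes = tf.constant([[[0., 0., 1., 1.]],
                     [[0., 0., 1., 1.]],
                     [[0., 0., .5, .5]]])             # [k=3, q=1, 4]
scores = tf.constant([[.9, .1], [.8, .2], [.1, .7]])  # [k=3, num_classes=2]
nmsed, num_valid = multiclass_non_max_suppression(
    boxes, scores, score_thresh=0.5, iou_thresh=0.5, max_size_per_class=2)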
Example #22
def _model_fn(features, labels, mode, params, model):
    """Model defination for the SSD model based on ResNet-50.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary that defines the hyperparameters of the model. The
      default settings are in the default_hparams function in this file.
    model: the SSD model, which outputs class logits and box regression
      outputs.

  Returns:
    spec: the EstimatorSpec or TPUEstimatorSpec to run training, evaluation,
      or prediction.
  """
    if mode == tf.estimator.ModeKeys.PREDICT:
        labels = features
        features = labels.pop('image')

    # Manually apply the double transpose trick for training data.
    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        features = tf.transpose(features, [3, 0, 1, 2])
        labels[ssd_constants.BOXES] = tf.transpose(labels[ssd_constants.BOXES],
                                                   [2, 0, 1])
        labels[ssd_constants.CLASSES] = tf.transpose(
            labels[ssd_constants.CLASSES], [2, 0, 1])

    # Normalize the image to zero mean and unit variance.
    mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_MEAN,
                         value=ssd_constants.NORMALIZATION_MEAN)
    mlperf_log.ssd_print(key=mlperf_log.DATA_NORMALIZATION_STD,
                         value=ssd_constants.NORMALIZATION_STD)

    features -= tf.constant(ssd_constants.NORMALIZATION_MEAN,
                            shape=[1, 1, 3],
                            dtype=features.dtype)

    features /= tf.constant(ssd_constants.NORMALIZATION_STD,
                            shape=[1, 1, 3],
                            dtype=features.dtype)

    def _model_outputs():
        return model(features,
                     params,
                     is_training_bn=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['use_bfloat16']:
        with bfloat16.bfloat16_scope():
            cls_outputs, box_outputs = _model_outputs()
            levels = cls_outputs.keys()
            for level in levels:
                cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
                box_outputs[level] = tf.cast(box_outputs[level], tf.float32)
    else:
        cls_outputs, box_outputs = _model_outputs()
        levels = cls_outputs.keys()

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        flattened_cls, flattened_box = concat_outputs(cls_outputs, box_outputs)
        mlperf_log.ssd_print(key=mlperf_log.SCALES,
                             value=ssd_constants.BOX_CODER_SCALES)
        ssd_box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
            scale_factors=ssd_constants.BOX_CODER_SCALES)

        anchors = box_list.BoxList(
            tf.convert_to_tensor(dataloader.DefaultBoxes()('ltrb')))

        decoded_boxes = box_coder.batch_decode(encoded_boxes=flattened_box,
                                               box_coder=ssd_box_coder,
                                               anchors=anchors)

        pred_scores = tf.nn.softmax(flattened_cls, axis=2)

        pred_scores, indices = select_top_k_scores(
            pred_scores, ssd_constants.MAX_NUM_EVAL_BOXES)

        predictions = dict(
            labels,
            indices=indices,
            pred_scores=pred_scores,
            pred_box=decoded_boxes,
        )

        if params['visualize_dataloader']:
            # This is for inference visualization.
            predictions['image'] = features

        if params['use_tpu']:
            return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                                  predictions=predictions)

        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Load pretrained model from checkpoint.
    if params['resnet_checkpoint'] and mode == tf.estimator.ModeKeys.TRAIN:

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            tf.train.init_from_checkpoint(
                params['resnet_checkpoint'], {
                    '/': 'resnet%s/' % ssd_constants.RESNET_DEPTH,
                })
            return tf.train.Scaffold()
    else:
        scaffold_fn = None

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)
    mlperf_log.ssd_print(key=mlperf_log.OPT_LR, deferred=True)
    # cls_loss and box_loss are for logging. Only total_loss is optimized.
    total_loss, cls_loss, box_loss = detection_loss(cls_outputs, box_outputs,
                                                    labels)

    total_loss += params['weight_decay'] * tf.add_n(
        [tf.nn.l2_loss(v) for v in tf.trainable_variables()])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.MomentumOptimizer(learning_rate,
                                               momentum=ssd_constants.MOMENTUM)
        if params['use_tpu']:
            optimizer = tpu_optimizer.CrossShardOptimizer(optimizer)

        mlperf_log.ssd_print(key=mlperf_log.OPT_NAME,
                             value='tf.train.MomentumOptimizer')
        # TODO(wangtao): figure out how to log learning rate.
        # mlperf_log.ssd_print(key=mlperf_log.OPT_LR, value=learning_rate)
        mlperf_log.ssd_print(key=mlperf_log.OPT_MOMENTUM,
                             value=ssd_constants.MOMENTUM)
        mlperf_log.ssd_print(key=mlperf_log.OPT_WEIGHT_DECAY,
                             value=params['weight_decay'])

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        if params['device'] == 'gpu':
            # On GPU, tf.group is used to avoid dependency overhead from
            # update_ops; multi-GPU also requires a different EstimatorSpec
            # class object.
            train_op = tf.group(optimizer.minimize(total_loss, global_step),
                                update_ops)
            # scaffold_fn is None unless a ResNet checkpoint was provided, so
            # guard the call to avoid a TypeError.
            return model_fn_lib.EstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                scaffold=scaffold_fn() if scaffold_fn else None)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss, global_step)

        if params['use_host_call']:

            def host_call_fn(global_step, total_loss, cls_loss, box_loss,
                             learning_rate):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly
                reference any Tensors in the rest of the `model_fn`. To pass
                Tensors from the model to the `host_call_fn`, provide them as
                part of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as
                the second element in the tuple passed to `host_call`.

                Args:
                  global_step: `Tensor` with shape `[batch, ]` for the
                    global_step.
                  total_loss: `Tensor` with shape `[batch, ]` for the training
                    loss.
                  cls_loss: `Tensor` with shape `[batch, ]` for the training
                    cls loss.
                  box_loss: `Tensor` with shape `[batch, ]` for the training
                    box loss.
                  learning_rate: `Tensor` with shape `[batch, ]` for the
                    learning_rate.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                # Outfeed supports int32 but global_step is expected to be int64.
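                # reduce_mean below collapses the [batch]-shaped global_step
                # that host_call delivers back to a scalar step for the
                # summaries.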
                global_step = tf.reduce_mean(global_step)
                # Host call fns are executed FLAGS.iterations_per_loop times
                # after one TPU loop is finished. Setting max_queue to the
                # number of iterations makes the summary writer flush data to
                # storage only once per loop.
                with (tf.contrib.summary.create_file_writer(
                        params['model_dir'],
                        max_queue=params['iterations_per_loop']).as_default()):
                    with tf.contrib.summary.always_record_summaries():
                        tf.contrib.summary.scalar('total_loss',
                                                  tf.reduce_mean(total_loss),
                                                  step=global_step)
                        tf.contrib.summary.scalar('cls_loss',
                                                  tf.reduce_mean(cls_loss),
                                                  step=global_step)
                        tf.contrib.summary.scalar('box_loss',
                                                  tf.reduce_mean(box_loss),
                                                  step=global_step)
                        tf.contrib.summary.scalar(
                            'learning_rate',
                            tf.reduce_mean(learning_rate),
                            step=global_step)

                        return tf.contrib.summary.all_summary_ops()

            # To log the loss and current learning rate for TensorBoard, the
            # summary op needs to be run on the host CPU via host_call.
            # host_call expects [batch_size, ...] Tensors, so reshape each
            # scalar to introduce a batch dimension. These Tensors are
            # implicitly concatenated to [params['batch_size']].
            global_step_t = tf.reshape(global_step, [1])
            total_loss_t = tf.reshape(total_loss, [1])
            cls_loss_t = tf.reshape(cls_loss, [1])
            box_loss_t = tf.reshape(box_loss, [1])
            learning_rate_t = tf.reshape(learning_rate, [1])
            host_call = (host_call_fn, [
                global_step_t, total_loss_t, cls_loss_t, box_loss_t,
                learning_rate_t
            ])
    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        raise NotImplementedError

    return tpu_estimator.TPUEstimatorSpec(mode=mode,
                                          loss=total_loss,
                                          train_op=train_op,
                                          host_call=host_call,
                                          eval_metrics=eval_metrics,
                                          scaffold_fn=scaffold_fn)
Example #23
0
        def _parse_example(data):
            with tf.name_scope('augmentation'):
                source_id = data['source_id']
                image = tf.image.convert_image_dtype(data['image'],
                                                     dtype=tf.float32)
                raw_shape = tf.shape(image)
                boxes = data['groundtruth_boxes']
                classes = tf.reshape(data['groundtruth_classes'], [-1, 1])

                # Only 80 of the 90 COCO classes are used.
                class_map = tf.convert_to_tensor(ssd_constants.CLASS_MAP)
                classes = tf.gather(class_map, classes)
                classes = tf.cast(classes, dtype=tf.float32)
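                # Illustration only (the values below are assumptions, not the
                # real ssd_constants.CLASS_MAP): if CLASS_MAP were
                # [0, 1, 2, 0, 3, ...], tf.gather(class_map, 4) would remap
                # raw COCO id 4 to 3, collapsing 90 raw ids onto the 80 in use.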

                if self._is_training:
                    image, boxes, classes = ssd_crop(image, boxes, classes)

                    # random_horizontal_flip() is hard-coded to flip with a 50% chance.
                    mlperf_log.ssd_print(
                        key=mlperf_log.RANDOM_FLIP_PROBABILITY, value=0.5)
                    image, boxes = preprocessor.random_horizontal_flip(
                        image=image, boxes=boxes)

                    # TODO(shibow): Investigate the parameters for color jitter.
                    image = color_jitter(image,
                                         brightness=0.125,
                                         contrast=0.5,
                                         saturation=0.5,
                                         hue=0.05)
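                    # A hedged sketch of what the color_jitter helper (defined
                    # elsewhere in this file) could do; the op order and the
                    # contrast/saturation ranges are assumptions:
                    #   image = tf.image.random_brightness(image, 0.125)
                    #   image = tf.image.random_contrast(image, 0.5, 1.5)
                    #   image = tf.image.random_saturation(image, 0.5, 1.5)
                    #   image = tf.image.random_hue(image, 0.05)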
                    image = normalize_image(image)

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    encoded_classes, encoded_boxes, num_matched_boxes = encode_labels(
                        boxes, classes)

                    # TODO(taylorrobie): Check that this cast is valid.
                    encoded_classes = tf.cast(encoded_classes, tf.int32)

                    labels = {
                        ssd_constants.NUM_MATCHED_BOXES: num_matched_boxes,
                        ssd_constants.BOXES: encoded_boxes,
                        ssd_constants.CLASSES: encoded_classes,
                    }
                    # This is for dataloader visualization; the actual model doesn't use it.
                    if params['visualize_dataloader']:
                        box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder(
                            scale_factors=ssd_constants.BOX_CODER_SCALES)
                        anchors = box_list.BoxList(
                            tf.convert_to_tensor(DefaultBoxes()('ltrb')))
                        # decode() returns a BoxList; get() yields the
                        # underlying [num_anchors, 4] tensor directly, so no
                        # expand_dims/squeeze round trip is needed.
                        labels['decoded_boxes'] = box_coder.decode(
                            rel_codes=tf.squeeze(encoded_boxes),
                            anchors=anchors).get()

                    return image, labels

                else:
                    mlperf_log.ssd_print(key=mlperf_log.INPUT_SIZE,
                                         value=ssd_constants.IMAGE_SIZE)
                    image = tf.image.resize_images(
                        image[tf.newaxis, :, :, :],
                        size=(ssd_constants.IMAGE_SIZE,
                              ssd_constants.IMAGE_SIZE))[0, :, :, :]

                    image = normalize_image(image)

                    if params['use_bfloat16']:
                        image = tf.cast(image, dtype=tf.bfloat16)

                    def trim_and_pad(inp_tensor, dim_1):
                        """Limit the number of boxes, and pad if necessary."""
                        inp_tensor = inp_tensor[:ssd_constants.MAX_NUM_EVAL_BOXES]
                        num_pad = ssd_constants.MAX_NUM_EVAL_BOXES - tf.shape(
                            inp_tensor)[0]
                        inp_tensor = tf.pad(inp_tensor, [[0, num_pad], [0, 0]])
                        return tf.reshape(
                            inp_tensor,
                            [ssd_constants.MAX_NUM_EVAL_BOXES, dim_1])
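
                    # Worked example (the MAX_NUM_EVAL_BOXES value below is an
                    # assumption): with MAX_NUM_EVAL_BOXES == 200, a [150, 4]
                    # boxes tensor is padded with 50 zero rows to [200, 4],
                    # while a [300, 4] tensor is truncated to [200, 4]; either
                    # way the static shape becomes [MAX_NUM_EVAL_BOXES, dim_1].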

                    boxes = trim_and_pad(boxes, 4)
                    classes = trim_and_pad(classes, 1)

                    return {
                        ssd_constants.IMAGE: image,
                        ssd_constants.BOXES: boxes,
                        ssd_constants.CLASSES: classes,
                        ssd_constants.SOURCE_ID: tf.string_to_number(
                            source_id, tf.int32),
                        ssd_constants.RAW_SHAPE: raw_shape,
                    }